Skip to content

Instantly share code, notes, and snippets.

@pjoter
Last active March 22, 2020 09:51
Show Gist options
  • Save pjoter/9f897116322dbc8aa891032a0e1c5c00 to your computer and use it in GitHub Desktop.
Save pjoter/9f897116322dbc8aa891032a0e1c5c00 to your computer and use it in GitHub Desktop.
# https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.over_sampling.SMOTENC.html
# SMOTENC (SMOTE) for Pandas DataFrame
# - this code uses SMOTENC (imbalanced-learn library) to oversample imbalanced data
# - it preserves the DataFrame object, column names and dtypes
import pandas as pd
from imblearn.over_sampling import SMOTENC
def col_ins(ds, var):
    """Translate column names into their positional indices.

    Args:
        ds: Object exposing ``columns.get_loc`` (e.g. a pandas DataFrame).
        var: Iterable of column names, or a single column name as a string.

    Returns:
        A list with one ``ds.columns.get_loc(name)`` result per requested
        name, in the order the names were given.

    Raises:
        KeyError: Propagated from ``get_loc`` when a name is not a column.
    """
    # A bare string would otherwise be iterated character by character,
    # so treat it as one column name.
    if isinstance(var, str):
        var = [var]
    return [ds.columns.get_loc(col) for col in var]
def smotenc(X, y, cat_var_ins):
    """Oversample ``X``/``y`` with SMOTENC.

    Args:
        X: Feature matrix passed to the sampler.
        y: Target values passed to the sampler.
        cat_var_ins: Positional indices of the categorical feature columns
            in ``X``; forwarded as ``categorical_features``.

    Returns:
        The ``(X_resampled, y_resampled)`` pair returned by the sampler.
    """
    # random_state is fixed so repeated runs produce the same samples.
    sm = SMOTENC(random_state=42, categorical_features=cat_var_ins)
    # fit_resample replaces the deprecated fit_sample alias, which newer
    # imbalanced-learn releases no longer provide.
    return sm.fit_resample(X, y)
def df_smotenc(df, dep_var, cat_var):
    """Oversample a DataFrame with SMOTENC and return a DataFrame.

    Args:
        df: Input DataFrame holding both the features and the target.
        dep_var: Name of the target (dependent variable) column in ``df``.
        cat_var: Names of the categorical feature columns in ``df``.

    Returns:
        A DataFrame with the resampled rows. The ``dep_var`` column comes
        first, followed by the feature columns in their original order,
        and every column is cast back to the dtype it has in ``df``.
    """
    y = df[dep_var]
    X = df.drop(columns=dep_var)
    # SMOTENC identifies categorical features by position, not by name.
    cat_var_ins = col_ins(X, cat_var)

    X_res, y_res = smotenc(X, y, cat_var_ins)

    # Rebuild labelled frames; the sampler may hand back plain numpy arrays.
    X_res = pd.DataFrame(X_res, columns=X.columns)
    y_res = pd.DataFrame(y_res, columns=[dep_var])
    df_res = y_res.merge(X_res, left_index=True, right_index=True)

    # Restore the input dtypes, which can be lost in the numpy round trip.
    # (The original referenced an undefined name `ds` here.)
    df_res = df_res.astype(df.dtypes)
    return df_res
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment