Skip to content

Instantly share code, notes, and snippets.

@patrickbrus
Created January 10, 2021 17:36
Show Gist options
  • Save patrickbrus/9ea4a618e97f6823aaf6167ef7f3a01a to your computer and use it in GitHub Desktop.
Save patrickbrus/9ea4a618e97f6823aaf6167ef7f3a01a to your computer and use it in GitHub Desktop.
Function for imputing missing data
from sklearn.model_selection import train_test_split
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
# Columns that contain missing values and are imputed together. Kept in one
# place so the fit and every transform are guaranteed to use the same
# columns in the same order.
MARKDOWN_COLS = ["MarkDown1", "MarkDown2", "MarkDown3", "MarkDown4", "MarkDown5"]

# Split into train, validation and test sets.
# data_df_converted is the dataframe containing all features.
# 10% is held out for testing; 30% of the remaining rows become the
# validation set (i.e. roughly 63% train / 27% validation / 10% test).
df_train, df_test = train_test_split(data_df_converted, test_size=0.10, random_state=42)
df_train, df_val = train_test_split(df_train, test_size=0.3, random_state=42)

# Create the iterative imputer model.
imputer = IterativeImputer(max_iter=20, random_state=42, verbose=1)

# Fit the imputer only on training data in order to avoid data leakage
# into the validation and test data.
imputer = imputer.fit(df_train[MARKDOWN_COLS])

# Transform the MarkDown values of every split with the train-fitted imputer.
# The results are indexed positionally (array[:, i]) further down, so they
# are expected to be 2-D arrays whose columns follow MARKDOWN_COLS.
df_train_imputed = imputer.transform(df_train[MARKDOWN_COLS])
df_test_imputed = imputer.transform(df_test[MARKDOWN_COLS])
df_val_imputed = imputer.transform(df_val[MARKDOWN_COLS])
def add_new_markdown_cols(df_orig, data_df_imputed, columns=None):
    """Return a copy of *df_orig* with its MarkDown columns replaced.

    Args:
        df_orig: DataFrame whose columns should be overwritten. It is not
            modified; a copy is returned instead.
        data_df_imputed: 2-D array supporting ``[:, i]`` indexing (e.g. the
            NumPy array returned by an imputer's ``transform``). Column ``i``
            of this array is written to ``columns[i]``. It must have as many
            rows as *df_orig*.
        columns: Names of the DataFrame columns to overwrite, in the same
            order as the columns of *data_df_imputed*. Defaults to
            ``MarkDown1`` .. ``MarkDown5``.

    Returns:
        A new DataFrame with the listed columns replaced by the imputed
        values; all other columns are carried over unchanged.

    Raises:
        IndexError: If *data_df_imputed* has fewer columns than *columns*.
    """
    if columns is None:
        columns = ("MarkDown1", "MarkDown2", "MarkDown3", "MarkDown4", "MarkDown5")

    df_new = df_orig.copy()
    # Values are assigned by position, so the column order here must match
    # the column order that was used to produce data_df_imputed.
    for position, column in enumerate(columns):
        df_new[column] = data_df_imputed[:, position]
    return df_new
# Write the imputed MarkDown values back into each split. Every call works
# on a copy, so the three results are computed independently and then bound
# back to the original names in one step.
df_train, df_val, df_test = (
    add_new_markdown_cols(frame, imputed_values)
    for frame, imputed_values in (
        (df_train, df_train_imputed),
        (df_val, df_val_imputed),
        (df_test, df_test_imputed),
    )
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment