Created
January 10, 2021 17:36
-
-
Save patrickbrus/9ea4a618e97f6823aaf6167ef7f3a01a to your computer and use it in GitHub Desktop.
Function for imputing missing data
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from sklearn.model_selection import train_test_split | |
from sklearn.experimental import enable_iterative_imputer | |
from sklearn.impute import IterativeImputer | |
# Columns with missing values that the iterative imputer fills in.
MARKDOWN_COLS = ["MarkDown1", "MarkDown2", "MarkDown3", "MarkDown4", "MarkDown5"]

# Hold out 10% of the rows as the test set, then carve 30% of the
# remainder off as the validation set.
# data_df_converted is the dataframe containing all features.
df_train, df_test = train_test_split(
    data_df_converted, test_size=0.10, random_state=42
)
df_train, df_val = train_test_split(df_train, test_size=0.3, random_state=42)

# Build the iterative imputer and fit it on the training rows only, so that
# nothing from the validation/test rows leaks into the fitted model.
imputer = IterativeImputer(max_iter=20, random_state=42, verbose=1)
imputer = imputer.fit(df_train[MARKDOWN_COLS])

# Impute the MarkDown values of every split with the train-fitted model.
df_train_imputed = imputer.transform(df_train[MARKDOWN_COLS])
df_test_imputed = imputer.transform(df_test[MARKDOWN_COLS])
df_val_imputed = imputer.transform(df_val[MARKDOWN_COLS])
def add_new_markdown_cols(
    df_orig,
    data_df_imputed,
    columns=("MarkDown1", "MarkDown2", "MarkDown3", "MarkDown4", "MarkDown5"),
):
    """Return a copy of ``df_orig`` with the given columns replaced by imputed values.

    Args:
        df_orig: DataFrame whose columns should be overwritten. It is not
            modified; the function works on a copy.
        data_df_imputed: 2-D array-like of imputed values with one row per
            row of ``df_orig``. Column ``i`` of this block is written to
            ``columns[i]``. A DataFrame is accepted as well; its values are
            taken positionally and its index is ignored.
        columns: Names of the target columns, in the same order as the
            columns of ``data_df_imputed``. Defaults to the five MarkDown
            columns.

    Returns:
        A new DataFrame with the same index as ``df_orig`` in which each of
        ``columns`` holds the corresponding imputed values.

    Raises:
        IndexError: If ``data_df_imputed`` is not 2-D or has fewer columns
            than ``columns`` has names.
    """
    # Local import keeps the function self-contained; numpy is not imported
    # at the top of this file.
    import numpy as np

    # Normalise to an ndarray so positional ``[:, i]`` indexing also works
    # when the imputed block is handed over as a DataFrame.
    imputed_values = np.asarray(data_df_imputed)

    df_new = df_orig.copy()
    for position, column in enumerate(columns):
        df_new[column] = imputed_values[:, position]
    return df_new
# Replace the MarkDown columns of each split with their imputed values.
df_train = add_new_markdown_cols(df_orig=df_train, data_df_imputed=df_train_imputed)
df_val = add_new_markdown_cols(df_orig=df_val, data_df_imputed=df_val_imputed)
df_test = add_new_markdown_cols(df_orig=df_test, data_df_imputed=df_test_imputed)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment