Created
October 18, 2018 21:58
-
-
Save eherrerosj/365de67e9c431ddc6ed2ea4ca42bdf3f to your computer and use it in GitHub Desktop.
New sklearn Pipeline for OHE and Imputing vs Pandas homonym
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Option 1: Pandas
import pandas as pd | |
def apply_pipeline(df, categorical_columns, numerical_columns):
    """One-hot encode categorical columns and fill missing numerical values.

    Args:
        df: Input DataFrame. It is never modified; a new frame is returned.
        categorical_columns: Names of the columns to replace with their
            ``pd.get_dummies`` indicator columns (prefixed ``<column>_``).
        numerical_columns: Names of the columns whose missing values are
            replaced with the sentinel value -999.

    Returns:
        A new DataFrame holding the non-categorical columns in their original
        order, followed by the indicator columns of each categorical column
        in the order the categorical columns were given.

    Raises:
        KeyError: If a listed column is not present in the frame.
    """
    categorical_columns = list(categorical_columns)

    if categorical_columns:
        pieces = [df.drop(columns=categorical_columns)]
        pieces.extend(
            pd.get_dummies(df[cat_col], prefix=cat_col)
            for cat_col in categorical_columns
        )
        # A single concat instead of one per column avoids recopying the
        # ever-growing frame on every iteration.
        result = pd.concat(pieces, axis=1)
    else:
        # Work on a copy so the fill below never writes into the caller's
        # frame, matching what happens when there are columns to encode.
        result = df.copy()

    for num_col in numerical_columns:
        result[num_col] = result[num_col].fillna(value=-999)
    return result
# Option 2: scikit-learn 0.20. Careful: it does not manage memory as well as pandas.
import pandas as pd | |
from sklearn.preprocessing import OneHotEncoder | |
from sklearn.impute import SimpleImputer | |
from sklearn.compose import ColumnTransformer | |
from sklearn.pipeline import Pipeline | |
from sklearn.model_selection import train_test_split | |
# Do not use this function.
def apply_pipeline(df, categorical_columns, numerical_columns):
    """One-hot encode categorical columns and impute numerical ones.

    Categorical columns have their missing values replaced by the string
    'MISSING' and are then one-hot encoded. Numerical columns only have
    their missing values replaced by -999; they are NOT one-hot encoded,
    since that would create one dense column per distinct number.

    Args:
        df: Input DataFrame. It is not modified.
        categorical_columns: Names of the columns to one-hot encode.
        numerical_columns: Names of the columns to impute.

    Returns:
        A new DataFrame with ``df``'s index holding, in this order: the
        columns that were not listed, the encoded categorical columns
        (named ``<column>_<category>``) and the imputed numerical columns
        (under their original names).
    """
    categorical_columns = list(categorical_columns or [])
    numerical_columns = list(numerical_columns or [])

    transformers = []
    if categorical_columns:
        cat_pipe = Pipeline([
            ('cat_si', SimpleImputer(strategy='constant',
                                     fill_value='MISSING')),
            # Dense output is requested on the ColumnTransformer below rather
            # than through the encoder's sparse/sparse_output keyword, whose
            # name differs between scikit-learn releases.
            ('cat_ohe', OneHotEncoder(handle_unknown='ignore')),
        ])
        transformers.append(('cat', cat_pipe, categorical_columns))
    if numerical_columns:
        num_pipe = Pipeline([
            ('num_si', SimpleImputer(strategy='constant', fill_value=-999)),
        ])
        transformers.append(('num', num_pipe, numerical_columns))

    if not transformers:
        # Nothing to transform: hand back an independent copy.
        return df.copy()

    # sparse_threshold=0 forces a dense array. The full
    # (n_rows x n_output_columns) matrix is still materialized here, so very
    # high-cardinality categoricals remain memory hungry.
    ct = ColumnTransformer(transformers=transformers, sparse_threshold=0)
    transformed = ct.fit_transform(df)

    # Output columns follow the transformer order: categorical, then numerical.
    features = []
    if categorical_columns:
        encoder = ct.named_transformers_['cat'].named_steps['cat_ohe']
        # get_feature_names_out replaced get_feature_names in newer releases;
        # use whichever the installed version provides.
        name_getter = (getattr(encoder, 'get_feature_names_out', None)
                       or encoder.get_feature_names)
        features.extend(name_getter(categorical_columns))
    features.extend(numerical_columns)

    encoded = pd.DataFrame(transformed, columns=features, index=df.index)
    # Drop every transformed source column, numerical ones included, so the
    # result carries no raw, un-imputed duplicates.
    untouched = df.drop(columns=categorical_columns + numerical_columns)
    return pd.concat([untouched, encoded], axis=1)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment