Skip to content

Instantly share code, notes, and snippets.

@eherrerosj
Created October 18, 2018 21:58
Show Gist options
  • Save eherrerosj/365de67e9c431ddc6ed2ea4ca42bdf3f to your computer and use it in GitHub Desktop.
Save eherrerosj/365de67e9c431ddc6ed2ea4ca42bdf3f to your computer and use it in GitHub Desktop.
New sklearn Pipeline for one-hot encoding (OHE) and imputing vs. its pandas equivalent
# Option 1: Pandas
import pandas as pd
def apply_pipeline(df, categorical_columns, numerical_columns):
    """One-hot encode categorical columns and fill missing numerical values.

    Every column listed in ``categorical_columns`` is replaced by the
    indicator columns produced by ``pd.get_dummies`` (named
    ``<column>_<value>``); the indicator columns are appended after the
    remaining columns, in the order the categorical columns were given.
    Missing values in every column listed in ``numerical_columns`` are
    replaced by the sentinel ``-999``.

    The input frame is not modified; a new DataFrame is returned.

    Args:
        df: Source DataFrame.
        categorical_columns: Iterable of column names to one-hot encode.
        numerical_columns: Iterable of column names whose missing values
            are filled with ``-999``.

    Returns:
        A new DataFrame with the categorical columns replaced by their
        indicator columns and the numerical columns imputed.

    Raises:
        KeyError: If a listed column is not present in ``df``.
    """
    categorical_columns = list(categorical_columns)
    dummies = [
        pd.get_dummies(df[cat_col], prefix=cat_col)
        for cat_col in categorical_columns
    ]
    # A single drop + concat avoids copying the whole frame once per
    # categorical column, and guarantees ``result`` is a fresh object even
    # when there is nothing to encode (so the fill below never writes into
    # the caller's frame).
    result = pd.concat(
        [df.drop(columns=categorical_columns), *dummies], axis=1
    )
    for num_col in numerical_columns:
        result[num_col] = result[num_col].fillna(value=-999)
    return result
# Option 2: sklearn 0.20. Careful, it doesn't manage memory as well as Pandas
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
# do not use this function
def apply_pipeline(df, categorical_columns, numerical_columns):
    """One-hot encode categorical columns and impute numerical ones (sklearn).

    sklearn counterpart of the pandas-based ``apply_pipeline``:

    * Columns in ``categorical_columns`` have missing values replaced by
      the string ``'MISSING'`` and are then one-hot encoded. The original
      columns are dropped and the indicator columns, named
      ``<column>_<value>``, are appended at the end of the frame.
    * Columns in ``numerical_columns`` have missing values replaced by the
      sentinel ``-999`` and keep their position in the frame. They are
      NOT one-hot encoded: doing so creates one dense column per distinct
      numeric value, which is presumably what exhausted memory on large
      (>1M rows) inputs.

    The input frame is not modified; a new DataFrame is returned.

    Args:
        df: Source DataFrame.
        categorical_columns: List of column names to impute and one-hot
            encode (may be empty).
        numerical_columns: List of column names to impute (may be empty).

    Returns:
        A new DataFrame with the transformed columns. Imputed numerical
        columns come back as the float output of the transformer.
    """
    categorical_columns = list(categorical_columns or [])
    numerical_columns = list(numerical_columns or [])
    if not categorical_columns and not numerical_columns:
        # Nothing to transform; still hand back a fresh frame.
        return df.copy()

    transformers = []
    if categorical_columns:
        cat_pipe = Pipeline([
            ('cat_si', SimpleImputer(strategy='constant',
                                     fill_value='MISSING')),
            ('cat_ohe', OneHotEncoder(handle_unknown='ignore')),
        ])
        transformers.append(('cat', cat_pipe, categorical_columns))
    if numerical_columns:
        num_imputer = SimpleImputer(strategy='constant', fill_value=-999)
        transformers.append(('num', num_imputer, numerical_columns))

    # sparse_threshold=0 makes the ColumnTransformer always return a dense
    # array, independent of the encoder's ``sparse``/``sparse_output``
    # keyword, whose name differs between sklearn releases.
    ct = ColumnTransformer(transformers=transformers, sparse_threshold=0)
    # NOTE: this materializes the full dense result in memory.
    transformed = ct.fit_transform(df)

    cat_features = []
    if categorical_columns:
        encoder = ct.named_transformers_['cat'].named_steps['cat_ohe']
        # Newer sklearn exposes get_feature_names_out; older releases
        # (e.g. 0.20) only have get_feature_names. Passing the column names
        # yields '<column>_<value>' instead of the anonymous 'x0_<value>'.
        name_getter = (getattr(encoder, 'get_feature_names_out', None)
                       or encoder.get_feature_names)
        cat_features = list(name_getter(categorical_columns))

    # Output columns follow the transformer order: categorical indicators
    # first, then the imputed numerical columns.
    n_cat = len(cat_features)
    if categorical_columns:
        result = df.drop(columns=categorical_columns)
    else:
        result = df.copy()
    if numerical_columns:
        result[numerical_columns] = transformed[:, n_cat:]
    if cat_features:
        dummies = pd.DataFrame(transformed[:, :n_cat],
                               columns=cat_features, index=df.index)
        result = pd.concat([result, dummies], axis=1)
    return result
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment