Skip to content

Instantly share code, notes, and snippets.

@franc3000
Last active April 7, 2018 13:33
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save franc3000/f3b79c299b26bf628d060df7d070f728 to your computer and use it in GitHub Desktop.
Save franc3000/f3b79c299b26bf628d060df7d070f728 to your computer and use it in GitHub Desktop.
Feature engineering automation
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, Binarizer
from sklearn.base import TransformerMixin, BaseEstimator
"""
PyData Chicago
Franklin Sarkett
franklin.sarkett@gmail.com
Work Hard Once
Strategy and Automation applied to building machine learning models
"""
class DataFrameColumnExtractor(TransformerMixin, BaseEstimator):
    """Select one column of a DataFrame and return it as a one-column DataFrame.

    The result is always a new DataFrame; the input frame is never modified.
    If every value in the selected column is missing, the column is filled
    with 0 so that it contains no NaN values when it is handed on.

    Parameters
    ----------
    column : hashable
        Label of the column to extract.
    """

    def __init__(self, column):
        self.column = column

    def transform(self, df):
        """Return ``df[[self.column]]`` as an independent one-column DataFrame.

        Parameters
        ----------
        df : pandas.DataFrame
            Frame that contains ``self.column``.

        Returns
        -------
        pandas.DataFrame
            Copy of the selected column; an all-missing column is replaced
            by zeros.

        Raises
        ------
        KeyError
            If ``self.column`` is not a column of ``df``.
        """
        # Copy explicitly: assigning into a selection of the caller's frame
        # triggers pandas' SettingWithCopyWarning, and the caller's data must
        # not be touched by this transformer.
        df_col = df[[self.column]].copy()
        # If all values are NaN, replace them with 0.
        for c in df_col.columns:
            if df_col[c].isnull().all():
                df_col[c] = df_col[c].fillna(0)
        return df_col

    def fit(self, *_):
        """Do nothing and return ``self``; this transformer is stateless."""
        return self
class DataFrameImputer(TransformerMixin, BaseEstimator):
    """Impute missing values column by column.

    Each column is imputed with its median (missing values skipped).
    Columns for which a median cannot be computed (e.g. strings) are
    imputed with their most frequent value instead.

    Attributes
    ----------
    fill : scalar or pandas.Series
        Fill value(s) learned by ``fit``: a Series indexed by column label
        when fitted on a DataFrame, a scalar when fitted on a Series.
        It is 0 until ``fit`` has been called.
    """

    def __init__(self):
        self.fill = 0

    @staticmethod
    def _fill_value(values):
        """Return the fill value for one column (a pandas Series)."""
        try:
            return values.median(skipna=True)
        except (TypeError, ValueError):
            # No median exists for this dtype: use the most frequent value.
            modes = values.mode(dropna=True)
            # An all-missing column has no mode; NaN leaves it unchanged.
            return modes.iloc[0] if not modes.empty else np.nan

    def fit(self, df, y=None):
        """Learn the fill value of every column.

        Parameters
        ----------
        df : pandas.DataFrame or pandas.Series
            Data to learn the fill values from.
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -------
        DataFrameImputer
            ``self``.

        Raises
        ------
        ValueError
            If ``df`` is neither a DataFrame nor a Series.
        """
        # if not df and not series, error
        if not isinstance(df, (pd.DataFrame, pd.Series)):
            raise ValueError('var `df` type is not a DataFrame or Series, it is a {}'.format(type(df)))
        if isinstance(df, pd.Series):
            # A Series has no ``columns``; a single scalar fill value suffices.
            self.fill = self._fill_value(df)
        else:
            self.fill = pd.Series([self._fill_value(df[c]) for c in df], index=df.columns)
        return self

    def transform(self, df, y=None):
        """Return a copy of ``df`` with missing values replaced by ``self.fill``."""
        return df.fillna(self.fill)
class StandardScalerLimitTransformer(TransformerMixin, BaseEstimator):
    """Clamp values to the closed interval ``[min_value, max_value]``.

    Values below ``min_value`` are replaced by ``min_value`` and values
    above ``max_value`` by ``max_value``; NaN values are left as they are.

    Parameters
    ----------
    min_value : number, default -3
        Smallest value allowed in the output.
    max_value : number, default 3
        Largest value allowed in the output.
    """

    def __init__(self, min_value=-3, max_value=3):
        self.min_value = min_value
        self.max_value = max_value

    def fit(self, X, y=None):
        """Do nothing and return ``self``; the limits are fixed parameters."""
        return self

    def transform(self, X, y=None):
        """Return a clamped copy of ``X``.

        The input is not modified: clamping by in-place assignment would
        silently change the caller's data and fails on read-only arrays.

        Parameters
        ----------
        X : array-like, pandas.DataFrame or pandas.Series
            Values to clamp.
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -------
        numpy.ndarray, pandas.DataFrame or pandas.Series
            Clamped values; pandas inputs keep their type and labels.
        """
        if isinstance(X, (pd.DataFrame, pd.Series)):
            return X.clip(lower=self.min_value, upper=self.max_value)
        return np.clip(X, self.min_value, self.max_value)
def _limited_scaler_pipeline(step_name, column):
    """Build the extract -> impute -> standardize -> clamp pipeline for one column.

    ``step_name`` is the name given to the extraction step; ``column`` is
    the DataFrame column the pipeline operates on.
    """
    return Pipeline([
        (step_name, DataFrameColumnExtractor(column)),
        ('df', DataFrameImputer()),
        ('scaler', StandardScaler()),
        ('minmaxlimit', StandardScalerLimitTransformer()),
    ])


def build_transformed_dataset(df, y):
    """Append transformed ``SquareFootage`` and ``TaxAssessedValue`` features to ``df``.

    Each of the two columns goes through its own pipeline (extraction,
    imputation, standard scaling, clamping with the default limits). The
    pipeline outputs are combined and concatenated column-wise onto ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; must contain the columns ``SquareFootage`` and
        ``TaxAssessedValue``.
    y : object
        Target values; returned unchanged.

    Returns
    -------
    tuple
        ``(df_out, y)`` where ``df_out`` is ``df`` with the transformed
        columns appended.
    """
    pipeline_sqft = _limited_scaler_pipeline('Sqft', 'SquareFootage')
    pipeline_tav = _limited_scaler_pipeline('TaxAssessedValue', 'TaxAssessedValue')

    # feature union
    # NOTE(review): DataFrameFeatureUnion is not defined in the visible part
    # of this file; presumably its output is a DataFrame that shares df's
    # index, which the concat below relies on - confirm.
    dffu = DataFrameFeatureUnion([
        ('sqft', pipeline_sqft),
        ('tav', pipeline_tav),
    ])

    # calling fit_transform on each pipeline in the feature union
    X1 = dffu.fit_transform(df)

    # concat on axis=1, adding cols
    df_out = pd.concat([df, X1], axis=1)
    return df_out, y
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment