franc3000/fe.py

## fe.py
import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, Binarizer
from sklearn.base import TransformerMixin, BaseEstimator

"""
PyData Chicago

Franklin Sarkett
franklin.sarkett@gmail.com

Work Hard Once
Strategy and Automation applied to building machine learning models
"""


class DataFrameColumnExtractor(TransformerMixin, BaseEstimator):
    """
    Select one column of a DataFrame and return it as a one-column DataFrame.

    Parameters
    ----------
    column : hashable
        Label of the column to extract.
    """
    def __init__(self, column):
        self.column = column

    def transform(self, df):
        """
        Return ``df[[self.column]]`` as a new DataFrame.

        A column whose values are all null is filled with 0. A column with
        at least one non-null value is returned with its nulls untouched.
        The input ``df`` is never modified.
        """
        # Work on an explicit copy: assigning into the bare ``df[[...]]``
        # selection raises SettingWithCopyWarning and leaves it unclear
        # whether the caller's frame is being written to.
        df_col = df[[self.column]].copy()

        # if all values are NaN, then replace with 0
        for c in df_col.columns:
            if df_col[c].isnull().all():
                df_col[c] = df_col[c].fillna(0)

        return df_col

    def fit(self, *_):
        """Learn nothing; accept and ignore any positional arguments."""
        return self


class DataFrameImputer(TransformerMixin, BaseEstimator):
    """
    Impute missing values with the median learned in ``fit``.

    For a DataFrame, every column is filled with its own median (nulls are
    skipped when computing it). For a Series, the single median of the
    Series is used.
    """
    def __init__(self):
        # Placeholder fill value; replaced by the learned median(s) in fit().
        self.fill = 0

    def fit(self, df, y=None):
        """
        Compute the fill value(s) from ``df``.

        Parameters
        ----------
        df : pd.DataFrame or pd.Series
            Data to learn the medians from.
        y : ignored

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``df`` is neither a DataFrame nor a Series.
        """
        if isinstance(df, pd.Series):
            # A Series has no ``.columns`` and iterating it yields values,
            # not labels, so it needs its own scalar-median path.
            self.fill = df.median(skipna=True)
        elif isinstance(df, pd.DataFrame):
            self.fill = pd.Series([df[c].median(skipna=True) for c in df], index=df.columns)
        else:
            raise ValueError('var `df` type is not a DataFrame or Series, it is a {}'.format(type(df)))
        return self

    def transform(self, df, y=None):
        """Return ``df`` with nulls replaced by the value(s) stored in ``self.fill``."""
        return df.fillna(self.fill)


class StandardScalerLimitTransformer(TransformerMixin, BaseEstimator):
    """
    Clamp values to the closed range ``[min_value, max_value]``.

    Values below ``min_value`` become ``min_value`` and values above
    ``max_value`` become ``max_value``; everything else (including NaN)
    is passed through.

    Parameters
    ----------
    min_value : number, default -3
        Lower bound.
    max_value : number, default 3
        Upper bound.
    """
    def __init__(self, min_value=-3, max_value=3):
        self.min_value = min_value
        self.max_value = max_value

    def fit(self, X, y=None):
        """Learn nothing; present for the transformer interface."""
        return self

    def transform(self, X, y=None):
        """
        Return a clamped copy of ``X``.

        The input is left untouched: clamping in place would silently
        rewrite the array or frame the caller still holds.
        """
        if isinstance(X, (pd.DataFrame, pd.Series)):
            return X.clip(lower=self.min_value, upper=self.max_value)
        return np.clip(X, self.min_value, self.max_value)


def _build_scaled_column_pipeline(column, extractor_step_name):
    """Build the extract -> impute -> standardize -> clamp pipeline for one column."""
    return Pipeline([
        (extractor_step_name, DataFrameColumnExtractor(column)),
        ('df', DataFrameImputer()),
        ('scaler', StandardScaler()),
        ('minmaxlimit', StandardScalerLimitTransformer()),
    ])


def build_transformed_dataset(df, y):
    """
    Append transformed 'SquareFootage' and 'TaxAssessedValue' features to ``df``.

    Each of the two columns is run through its own pipeline (column
    extraction, imputation, ``StandardScaler``, clamping) and the combined
    result is concatenated column-wise onto the original frame.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain the columns 'SquareFootage' and 'TaxAssessedValue'.
    y : any
        Returned unchanged.

    Returns
    -------
    (df_out, y)
        ``df_out`` is ``df`` with the transformed columns added on axis=1.
    """
    pipeline_sqft = _build_scaled_column_pipeline('SquareFootage', 'Sqft')
    pipeline_tav = _build_scaled_column_pipeline('TaxAssessedValue', 'TaxAssessedValue')

    # NOTE(review): DataFrameFeatureUnion is neither defined nor imported in
    # the visible part of this file; it must be provided elsewhere or this
    # line raises NameError. Presumably it returns a pandas object -- confirm.
    dffu = DataFrameFeatureUnion([
        ('sqft', pipeline_sqft),
        ('tav', pipeline_tav),
    ])

    # fit_transform fits and applies every pipeline in the union
    X1 = dffu.fit_transform(df)

    # concat on axis=1 adds the new columns next to the original ones
    # NOTE(review): pd.concat aligns on index; assumes X1 shares df's index.
    df_out = pd.concat([df, X1], axis=1)

    return df_out, y
	import numpy as np
	import pandas as pd

	from sklearn.pipeline import Pipeline
	from sklearn.preprocessing import StandardScaler, Binarizer
	from sklearn.base import TransformerMixin, BaseEstimator

	"""
	PyData Chicago

	Franklin Sarkett
	franklin.sarkett@gmail.com

	Work Hard Once
	Strategy and Automation applied to building machine learning models
	"""


	class DataFrameColumnExtractor(TransformerMixin, BaseEstimator):
	    """
	    Select one column of a DataFrame and return it as a one-column DataFrame.

	    Parameters
	    ----------
	    column : hashable
	        Label of the column to extract.
	    """
	    def __init__(self, column):
	        self.column = column

	    def transform(self, df):
	        """
	        Return ``df[[self.column]]`` as a new DataFrame.

	        A column whose values are all null is filled with 0. A column with
	        at least one non-null value is returned with its nulls untouched.
	        The input ``df`` is never modified.
	        """
	        # Work on an explicit copy: assigning into the bare ``df[[...]]``
	        # selection raises SettingWithCopyWarning and leaves it unclear
	        # whether the caller's frame is being written to.
	        df_col = df[[self.column]].copy()

	        # if all values are NaN, then replace with 0
	        for c in df_col.columns:
	            if df_col[c].isnull().all():
	                df_col[c] = df_col[c].fillna(0)

	        return df_col

	    def fit(self, *_):
	        """Learn nothing; accept and ignore any positional arguments."""
	        return self


	class DataFrameImputer(TransformerMixin, BaseEstimator):
	    """
	    Impute missing values with the median learned in ``fit``.

	    For a DataFrame, every column is filled with its own median (nulls are
	    skipped when computing it). For a Series, the single median of the
	    Series is used.
	    """
	    def __init__(self):
	        # Placeholder fill value; replaced by the learned median(s) in fit().
	        self.fill = 0

	    def fit(self, df, y=None):
	        """
	        Compute the fill value(s) from ``df``.

	        Returns ``self``. Raises ValueError if ``df`` is neither a
	        DataFrame nor a Series.
	        """
	        if isinstance(df, pd.Series):
	            # A Series has no ``.columns`` and iterating it yields values,
	            # not labels, so it needs its own scalar-median path.
	            self.fill = df.median(skipna=True)
	        elif isinstance(df, pd.DataFrame):
	            self.fill = pd.Series([df[c].median(skipna=True) for c in df], index=df.columns)
	        else:
	            raise ValueError('var `df` type is not a DataFrame or Series, it is a {}'.format(type(df)))
	        return self

	    def transform(self, df, y=None):
	        """Return ``df`` with nulls replaced by the value(s) stored in ``self.fill``."""
	        return df.fillna(self.fill)


	class StandardScalerLimitTransformer(TransformerMixin, BaseEstimator):
	    """
	    Clamp values to the closed range ``[min_value, max_value]``.

	    Values below ``min_value`` become ``min_value`` and values above
	    ``max_value`` become ``max_value``; everything else (including NaN)
	    is passed through.

	    Parameters
	    ----------
	    min_value : number, default -3
	        Lower bound.
	    max_value : number, default 3
	        Upper bound.
	    """
	    def __init__(self, min_value=-3, max_value=3):
	        self.min_value = min_value
	        self.max_value = max_value

	    def fit(self, X, y=None):
	        """Learn nothing; present for the transformer interface."""
	        return self

	    def transform(self, X, y=None):
	        """
	        Return a clamped copy of ``X``.

	        The input is left untouched: clamping in place would silently
	        rewrite the array or frame the caller still holds.
	        """
	        if isinstance(X, (pd.DataFrame, pd.Series)):
	            return X.clip(lower=self.min_value, upper=self.max_value)
	        return np.clip(X, self.min_value, self.max_value)


	def _build_scaled_column_pipeline(column, extractor_step_name):
	    """Build the extract -> impute -> standardize -> clamp pipeline for one column."""
	    return Pipeline([
	        (extractor_step_name, DataFrameColumnExtractor(column)),
	        ('df', DataFrameImputer()),
	        ('scaler', StandardScaler()),
	        ('minmaxlimit', StandardScalerLimitTransformer()),
	    ])


	def build_transformed_dataset(df, y):
	    """
	    Append transformed 'SquareFootage' and 'TaxAssessedValue' features to ``df``.

	    Each of the two columns is run through its own pipeline (column
	    extraction, imputation, ``StandardScaler``, clamping) and the combined
	    result is concatenated column-wise onto the original frame.

	    Returns ``(df_out, y)`` where ``df_out`` is ``df`` with the transformed
	    columns added on axis=1 and ``y`` is passed through unchanged.
	    """
	    pipeline_sqft = _build_scaled_column_pipeline('SquareFootage', 'Sqft')
	    pipeline_tav = _build_scaled_column_pipeline('TaxAssessedValue', 'TaxAssessedValue')

	    # NOTE(review): DataFrameFeatureUnion is neither defined nor imported in
	    # the visible part of this file; it must be provided elsewhere or this
	    # line raises NameError. Presumably it returns a pandas object -- confirm.
	    dffu = DataFrameFeatureUnion([
	        ('sqft', pipeline_sqft),
	        ('tav', pipeline_tav),
	    ])

	    # fit_transform fits and applies every pipeline in the union
	    X1 = dffu.fit_transform(df)

	    # concat on axis=1 adds the new columns next to the original ones
	    # NOTE(review): pd.concat aligns on index; assumes X1 shares df's index.
	    df_out = pd.concat([df, X1], axis=1)

	    return df_out, y