# Created November 9, 2019 04:19
# An example of a feature engineering + model pipeline I made
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Load the contents of 'bikeshare.csv' into a DataFrame.
bike = pd.read_csv('bikeshare.csv')
# Hold out 20% of the rows as a test set.
# NOTE(review): X and y are not defined anywhere in the visible code --
# presumably they are derived from `bike`; confirm before running.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
def transform_time(X):
    """Turn every datetime-like value of X into a timestamp in seconds.

    X is wrapped in a DataFrame, each column is parsed with
    ``pd.to_datetime`` and every parsed value is replaced by the result of
    its ``timestamp()`` method. The converted DataFrame is returned.
    """
    def _to_epoch_seconds(column):
        # Parse the whole column first, then convert value by value.
        parsed = pd.to_datetime(column)
        return parsed.apply(lambda moment: moment.timestamp())

    frame = pd.DataFrame(X)
    return frame.apply(_to_epoch_seconds)
def as_is(df):
    """Identity transform: hand the input back untouched.

    Useful where a transformer step is required but no change is wanted.
    """
    return df
# Handling Categorical Data - Cardinal / Ordinal
# Baseline: column-wise preprocessing feeding a default LogisticRegression.
# NOTE(review): this statement looks truncated in this copy -- the
# FunctionTransformer( call opened on the first line is never closed, no
# column list is given for 'convert_time', and the last line ends with a
# dangling ", y_train);" (presumably the tail of a .fit(X_train, y_train)
# call). Restore the missing pieces before running.
col_base = ColumnTransformer([('convert_time', FunctionTransformer(transform_time,
('ohe', OneHotEncoder(), ['usertype', 'gender']),
('as_is', FunctionTransformer(as_is, validate=True),
['start station latitude', 'start station longitude',
'end station latitude', 'end station longitude',
'birth year', 'bikeid'])],
base_pipe = Pipeline([('transform', col_base), ('model', LogisticRegression())]), y_train);
# Feature Engineering
def get_time_features(vals):
    """Derive calendar features from a single datetime column.

    Parameters
    ----------
    vals : DataFrame or array-like
        Only the first column is used; it is parsed with ``pd.to_datetime``.

    Returns
    -------
    pandas.DataFrame
        One row per input row (input index preserved) with the columns
        ``dow`` (Monday=0 ... Sunday=6), ``month``, ``hour``,
        ``weekday`` (1 for Monday-Friday, else 0) and
        ``workhour`` (1 when the hour is 8 or 17, else 0).
    """
    # Accept plain arrays/lists as well as DataFrames.
    stamps = pd.to_datetime(pd.DataFrame(vals).iloc[:, 0])

    dow = stamps.dt.dayofweek
    month = stamps.dt.month
    hour = stamps.dt.hour
    weekday = (dow < 5).astype(int)
    # NOTE(review): the flag is set only for hours 8 and 17 exactly, as in the
    # original membership test; if the whole 8-17 span was intended, switch
    # to hour.between(8, 17) -- confirm with the author.
    workhour = hour.isin([8, 17]).astype(int)

    # workhour used to be computed and then left out of the result; it is now
    # returned, and every column gets its own name instead of all of them
    # inheriting the source column's name.
    return pd.DataFrame({
        'dow': dow,
        'month': month,
        'hour': hour,
        'weekday': weekday,
        'workhour': workhour,
    })
def get_hav_distance(coords):
    """Return the Haversine (great-circle) distance between two points per row.

    Parameters
    ----------
    coords : DataFrame or array-like with four columns
        Positional columns, in degrees: start latitude, start longitude,
        end latitude, end longitude.

    Returns
    -------
    pandas.DataFrame
        A single column holding the distance in kilometres for each row.
    """
    coords = pd.DataFrame(coords)
    # approximate radius of earth in km
    R = 6373.0

    # Convert all four columns from degrees to radians the same way.
    s_lat, s_lng, e_lat, e_lng = (
        np.deg2rad(coords.iloc[:, i].astype(float)) for i in range(4)
    )

    # The whole sum must be one parenthesised expression: previously the
    # second term started a new line with "+", so it was evaluated and thrown
    # away and only the latitude term contributed to the distance.
    d = (
        np.sin((e_lat - s_lat) / 2) ** 2
        + np.cos(s_lat) * np.cos(e_lat) * np.sin((e_lng - s_lng) / 2) ** 2
    )
    # Guard against rounding pushing d marginally outside [0, 1], which would
    # make arcsin return NaN for near-antipodal points.
    d = d.clip(0.0, 1.0)

    out = 2 * R * np.arcsin(np.sqrt(d))
    return pd.DataFrame(out)
def get_age(vals, reference_year=2019):
    """Convert birth years to ages relative to ``reference_year``.

    Parameters
    ----------
    vals : DataFrame or array-like
        Birth years.
    reference_year : int, optional
        Year the age is measured in. Defaults to 2019, the value that was
        previously hard-coded, so existing callers are unaffected.

    Returns
    -------
    pandas.DataFrame
        ``reference_year - birth year`` for every input value.
    """
    return reference_year - pd.DataFrame(vals)
# Distance branch: Haversine distance from the start/end coordinates,
# followed by standard scaling.
dist = Pipeline(steps=[
    ('calc_dist', FunctionTransformer(func=get_hav_distance)),
    ('standardize', StandardScaler()),
])

# Route each group of raw columns to its own feature-engineering step.
col = ColumnTransformer(transformers=[
    ('convert_date',
     FunctionTransformer(func=get_time_features, validate=False),
     ['starttime']),
    ('ohe',
     OneHotEncoder(),
     ['usertype', 'gender']),
    ('distance',
     dist,
     ['start station latitude', 'start station longitude',
      'end station latitude', 'end station longitude']),
    ('convert_age',
     FunctionTransformer(func=get_age),
     ['birth year']),
])
# Using PCA to reduce dimensionality and produce a more generalizable model
# Feature engineering followed by PCA; the number of components is chosen
# by the 'mle' setting, with the full SVD solver.
reduce = Pipeline(steps=[
    ('transform', col),
    ('pca', PCA(n_components='mle', svd_solver='full')),
])
# Using 3-fold cross-validation to find the best parameters for logistic regression
# Hyper-parameter grid: both penalty types, three C values log-spaced between
# 1e-4 and 1e4, all using the liblinear solver.
lg_param_grid = {'penalty' : ['l1', 'l2'],
'C' : np.logspace(-4, 4, 3),
'solver' : ['liblinear']}
# Stratified 3-fold splitter used for the grid search.
kfold = StratifiedKFold(n_splits=3)
# Accuracy-scored grid search over the grid above, run with 4 parallel jobs.
# NOTE(review): this statement looks truncated in this copy -- the call is
# closed and then followed by a dangling ", y_train)" (presumably the tail of
# a .fit(<transformed X_train>, y_train) call). Restore it before running;
# best_estimator_ below is only available after fitting.
lr = GridSearchCV(LogisticRegression(),
param_grid = lg_param_grid,
scoring="accuracy", cv=kfold,
n_jobs=4, verbose=1), y_train)
# Keep the best estimator found by the search.
clf_best = lr.best_estimator_
# End-to-end pipeline: feature engineering + PCA, then the tuned classifier.
final_pipe = Pipeline(steps=[
    ('reduce', reduce),
    ('model', clf_best),
])
