Created
January 7, 2019 17:33
-
-
Save DavidRdgz/5bbe2966b89be2a586a92cc4ca4de23f to your computer and use it in GitHub Desktop.
Xgboost model on the Prudential life insurance dataset from Kaggle
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
A simple example of decluttering the settings for pandas so | |
that when developing the model and testing it, the dataframe | |
is a little cleaner and more readable. | |
""" | |
def pandas_defaults(defaults, pd):
    """
    Build a decorator that tags a function with a pandas display profile.

    When ``defaults`` is "development" the display options are applied to
    ``pd`` once, at decoration time (not on every call of the function).

    :param defaults: profile name; only "development" changes any option
    :param pd: the pandas module (any object exposing ``set_option`` works)
    :return: decorator; the wrapper it produces forwards all arguments to
        the wrapped function, returns its result and carries the profile
        name in its ``defaults`` attribute
    """
    import functools

    def decorator(f):
        # wraps() keeps the wrapped function's name and docstring visible.
        @functools.wraps(f)
        def wrapper(*args, **kwargs):
            # Return the result so decorating does not swallow it.
            return f(*args, **kwargs)

        wrapper.defaults = defaults
        if defaults == "development":
            try:
                # None is the "do not truncate" value on current pandas,
                # which no longer accepts -1 for this option.
                pd.set_option('display.max_colwidth', None)
            except ValueError:
                # Older pandas releases only accept an integer here and
                # used -1 as the "do not truncate" value.
                pd.set_option('display.max_colwidth', -1)
            pd.set_option('display.max_columns', 20)
            pd.set_option('display.width', 10000)
        return wrapper
    return decorator
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
A template to load data from a csv and apply the xgboost model.
We first load the Prudential Insurance data into a pandas dataframe
and then pre-process the file, showing how to remove outliers and relabel
categorical variables. Then, we construct features using scikit-learn's
Pipeline class. We then evaluate the fit of one model, to get an overall
sense of the accuracy, and then apply a more rigorous grid search over the
hyperparameters to identify the best model we can find - using any
combination of the hyper-parameters we specify.
For example, after one run, we attained the following accuracy: | |
------------------------ | |
best CV score: 0.5899292691141799 | |
------------------------ | |
best parameters on grid: | |
------------------------ | |
max_depth: 7 | |
n_estimators: 150 | |
------------------------ | |
""" | |
from sklearn.pipeline import Pipeline, FeatureUnion | |
from sklearn.preprocessing import (LabelEncoder, FunctionTransformer) | |
from sklearn.model_selection import train_test_split | |
from sklearn.metrics import confusion_matrix, accuracy_score | |
from sklearn.model_selection import GridSearchCV | |
from xgboost import XGBClassifier | |
import annotations | |
import numpy as np | |
import pandas as pd | |
import sys | |
def load_data(path):
    """
    Read a csv file into a pandas dataframe
    :param path: path/to/csv, handed to pandas.read_csv unchanged
    :return: dataframe with the parsed contents
    """
    frame = pd.read_csv(path)
    return frame
def pre_process(df):
    """
    Relabel the 'Product_Info_2' categories with a 'P_Info_2_' prefix

    No rows are dropped: outlier filtering is not applied here. The input
    frame is left untouched; a new frame is returned. Missing values in
    'Product_Info_2' stay missing instead of raising.
    :param df: dataframe with a string-valued 'Product_Info_2' column
    :return: new dataframe with the relabelled 'Product_Info_2' column
    """
    # Vectorised concatenation; assign() builds a new frame, so the
    # caller's dataframe is not modified.
    return df.assign(Product_Info_2='P_Info_2_' + df['Product_Info_2'])
def feature_pipeline(df):
    """
    Convert Dataframe into NumPy arrays of target and features

    Columns that are not object dtype are passed through as they are; each
    object-dtype column is replaced by the integer codes of a LabelEncoder
    fitted on that column. The encoders are fitted on the frame passed in
    and are not kept, so codes are only consistent within one call.
    :param df: pandas dataframe containing a 'Response' column
    :return: (y, x) with y - numpy array of 'Response', x - numpy array of
        arrays holding the non-object columns followed by the encoded
        object columns
    """
    def select_numeric(frame):
        return frame.select_dtypes(exclude=['object']).values

    def select_categorical(frame):
        return frame.select_dtypes(include=['object']).values

    def encode_columns(X):
        # np.vstack raises on an empty sequence, so a frame without object
        # columns needs an explicit zero-width block of the right height.
        if X.shape[1] == 0:
            return np.empty((X.shape[0], 0), dtype=int)
        # X.T iterates column by column; one encoder per column.
        encoded = [LabelEncoder().fit_transform(column) for column in X.T]
        return np.vstack(encoded).T

    categorical_branch = Pipeline([
        ('categories_select',
         FunctionTransformer(select_categorical, validate=False)),
        ('factorize',
         FunctionTransformer(encode_columns, validate=False)),
    ])
    pipe = Pipeline([
        ('union', FeatureUnion([
            ('numeric_cols',
             FunctionTransformer(select_numeric, validate=False)),
            ('categorical_cols', categorical_branch),
        ])),
    ])

    Y = df['Response'].values.reshape(-1, )
    X = pipe.fit_transform(df.drop(['Response'], axis=1))
    return Y, X
def one_model(model, X, y, X_test, y_test):
    """
    Fit the model once on the train split and report on the test split

    Prints the confusion matrix of the test predictions, then the accuracy
    as a percentage with two decimals.
    :param model: model exposing fit(X, y) and predict(X)
    :param X: train data
    :param y: train outcome
    :param X_test: test data
    :param y_test: test outcome
    :return: None
    """
    model.fit(X, y)
    predicted = list(model.predict(X_test))

    matrix = confusion_matrix(y_test, predicted)
    print(matrix)

    hit_rate = accuracy_score(y_test, predicted)
    print("Accuracy: %.2f%%" % (hit_rate * 100.0))
def grid_search(model, X, y):
    """
    Run a cross-validated grid search over max_depth and n_estimators

    Fits a GridSearchCV (5 folds, accuracy scoring, 2 jobs) on the given
    data and prints the best CV score followed by the best parameters.
    :param model: model
    :param X: train data
    :param y: train outcome
    :return: None
    """
    search_space = {
        'max_depth': [3, 5, 7],
        'n_estimators': [50, 100, 150],
    }
    search = GridSearchCV(
        estimator=model,
        param_grid=search_space,
        scoring='accuracy',
        cv=5,
        n_jobs=2,
        return_train_score=True,
    )
    search.fit(X, y)

    print('best CV score: {}'.format(search.best_score_))
    header = (
        '\n------------------------',
        'best parameters on grid:',
        '------------------------',
    )
    for line in header:
        print(line)
    best = search.best_params_
    for name, value in best.items():
        print('{0}: {1}'.format(name, value))
    print('------------------------')
@annotations.pandas_defaults("development", pd)
def main():
    """
    Script entry point: train and evaluate xgboost on a training csv

    Usage: <script> TRAIN_CSV [TEST_CSV]

    Loads and pre-processes the training csv, holds out 20% of the rows
    (random_state=50), fits a single XGBClassifier and then runs the grid
    search on the training split. A second command-line argument is
    accepted for compatibility with earlier invocations but is not read.
    :return: None
    """
    # Exit with a usage line rather than an unpacking error when the
    # training path is missing.
    if len(sys.argv) < 2:
        sys.exit("usage: {} TRAIN_CSV [TEST_CSV]".format(sys.argv[0]))
    train = sys.argv[1]

    print("[INFO] loading dataset")
    Y, X = feature_pipeline(
        pre_process(
            load_data(train)
        )
    )

    print("[INFO] splitting dataset into train/test")
    X_train, X_test, y_train, y_test = train_test_split(
        X, Y, test_size=0.2, random_state=50)

    model = XGBClassifier()

    print("[INFO] training one xgboost")
    one_model(model, X_train, y_train, X_test, y_test)

    print("[INFO] training xgboost with grid search and cv")
    grid_search(model, X_train, y_train)


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment