XGBoost model on the Prudential life insurance dataset from Kaggle
"""
A simple example of decluttering the settings for pandas so
that when developing the model and testing it, the dataframe
is a little cleaner and more readable.
"""
def pandas_defaults(defaults, pd):
    def decorator(f):
        def wrapper(*args, **kwargs):
            return f(*args, **kwargs)

        setattr(wrapper, "defaults", defaults)
        if defaults == "development":
            # Widen the display so wide frames print readably while developing.
            # Note: pandas >= 1.0 expects None here; older releases used -1.
            pd.set_option('display.max_colwidth', None)
            pd.set_option('display.max_columns', 20)
            pd.set_option('display.width', 10000)
        return wrapper
    return decorator
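
# A minimal usage sketch ("explore" is a hypothetical function name, not
# part of this gist): decorating a function switches pandas into the wide
# "development" display before it runs.
#
# @pandas_defaults("development", pd)
# def explore(df):
#     print(df.head())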
"""
A template to load data from a csv and apply the xgboost model
We first load the Prudential Insurance data into a pandas datafram
and then pre-process the file showing how to remove outliers and relabel
categorical variables. Then, we construct features using scikit learns
Pipeline class. We then evaluate the fit of one model, to get an overall
sense of the accuracy, and then apply a more rigorous grid search over the
hyperparameters to identify the best model we can identify - using any
combination of the hyper-parameters we specify.
For example, after one run, we attained the following accuracy:
------------------------
best CV score: 0.5899292691141799
------------------------
best parameters on grid:
------------------------
max_depth: 7
n_estimators: 150
------------------------
"""
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import LabelEncoder, FunctionTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score
from xgboost import XGBClassifier

import annotations  # the pandas_defaults decorator from annotations.py above
import numpy as np
import pandas as pd
import sys

def load_data(path):
    """
    Load a csv file into a pandas dataframe.

    :param path: path/to/csv
    :return: dataframe
    """
    return pd.read_csv(path)

def pre_process(df):
    """
    Remove outliers and relabel categories.

    :param df: raw dataframe
    :return: pre-processed copy of the dataframe
    """
    def percentile_col(col, ptile):
        # Boolean mask keeping rows below the given percentile of a column.
        return df[col] < df[col].quantile(ptile)

    return (
        df.copy()
        # Example outlier filter, disabled by default:
        # .loc[percentile_col('Medical_History_1', .999), :]
        .assign(Product_Info_2=df['Product_Info_2'].apply(lambda x: 'P_Info_2_' + x)))
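
# For example, pre_process turns a Product_Info_2 value like 'A1' into
# 'P_Info_2_A1', so the category codes stay recognizable after encoding:
#
# >>> pre_process(pd.DataFrame({'Product_Info_2': ['A1', 'D3']}))['Product_Info_2'].tolist()
# ['P_Info_2_A1', 'P_Info_2_D3']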

def feature_pipeline(df):
    """
    Convert a dataframe into NumPy arrays of labels and features.

    :param df: pandas dataframe
    :return: (y, x) with y - numpy array, x - numpy array of arrays
    """
    def numeric_col_select(adf):
        return adf.select_dtypes(exclude=['object']).values

    def categorical_col_select(adf):
        return adf.select_dtypes(include=['object']).values

    def categorical_str2fact(X):
        # Integer-encode each categorical column independently.
        fact_cols = [LabelEncoder().fit_transform(col) for col in X.T]
        fact_mat = np.vstack(fact_cols)
        return fact_mat.T

    pipe = Pipeline([
        ('union', FeatureUnion([
            ('numeric_cols', FunctionTransformer(numeric_col_select, validate=False)),
            ('categorical_cols', Pipeline([
                ('categories_select',
                 FunctionTransformer(categorical_col_select, validate=False)),
                ('factorize',
                 FunctionTransformer(categorical_str2fact, validate=False))
            ]))
        ]))
    ])

    # Note: Response is coded 1-8 in this dataset; newer xgboost releases
    # expect zero-based class labels, so subtract 1 here if fit() complains.
    Y = df['Response'].values.reshape(-1, )
    X = pipe.fit_transform(df.drop(['Response'], axis=1))
    return Y, X
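
# A toy sketch of what feature_pipeline returns (column names are
# illustrative, not the full Prudential schema):
#
# >>> toy = pd.DataFrame({'Response': [1, 2],
# ...                     'Ins_Age': [0.2, 0.5],
# ...                     'Product_Info_2': ['A1', 'B2']})
# >>> y, x = feature_pipeline(toy)
# >>> y.tolist(), x.shape
# ([1, 2], (2, 2))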

def one_model(model, X, y, X_test, y_test):
    """
    Train and test one pass of data split into train/test.

    :param model: model
    :param X: train data
    :param y: train outcome
    :param X_test: test data
    :param y_test: test outcome
    :return: None
    """
    model.fit(X, y)
    y_pred = model.predict(X_test)
    print(confusion_matrix(y_test, y_pred))
    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy: %.2f%%" % (accuracy * 100.0))

def grid_search(model, X, y):
    """
    Train and test multiple models through parameter search
    with cross-validation.

    :param model: model
    :param X: train data
    :param y: train outcome
    :return: None
    """
    params = {
        'max_depth': [3, 5, 7],
        'n_estimators': [50, 100, 150]
    }
    xgb_grid = GridSearchCV(
        model,
        params,
        cv=5,
        scoring='accuracy',
        n_jobs=2,
        return_train_score=True)
    xgb_grid.fit(X, y)

    print('best CV score: {}'.format(xgb_grid.best_score_))
    print('\n------------------------')
    print('best parameters on grid:')
    print('------------------------')
    for k, v in xgb_grid.best_params_.items():
        print('{0}: {1}'.format(k, v))
    print('------------------------')
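
# The full grid can also be inspected as a dataframe via GridSearchCV's
# cv_results_ attribute (available here since return_train_score=True),
# e.g. to compare train vs. validation accuracy per parameter combination:
#
# results = pd.DataFrame(xgb_grid.cv_results_)
# print(results[['param_max_depth', 'param_n_estimators',
#                'mean_train_score', 'mean_test_score']])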

@annotations.pandas_defaults("development", pd)
def main():
    # Only the training csv is used below; the Kaggle test csv has no labels.
    [train, test] = sys.argv[1:3]

    print("[INFO] loading dataset")
    Y, X = feature_pipeline(
        pre_process(
            load_data(train)
        )
    )

    print("[INFO] splitting dataset into train/test")
    X_train, X_test, y_train, y_test = train_test_split(
        X, Y, test_size=0.2, random_state=50)

    model = XGBClassifier()

    print("[INFO] training one xgboost")
    one_model(model, X_train, y_train, X_test, y_test)

    print("[INFO] training xgboost with grid search and cv")
    grid_search(model, X_train, y_train)


if __name__ == '__main__':
    main()