Skip to content

Instantly share code, notes, and snippets.

@micahmelling
Created September 1, 2021 13:54
Show Gist options
  • Save micahmelling/a648ef045e53cade03bcceedd1a63673 to your computer and use it in GitHub Desktop.
Save micahmelling/a648ef045e53cade03bcceedd1a63673 to your computer and use it in GitHub Desktop.
import pandas as pd
import joblib
import os
from sklearn.model_selection import cross_val_score
from hyperopt import fmin, tpe, Trials, space_eval
def train_model(x_train, y_train, get_pipeline_function, model_uid, model, param_space, iterations, cv_strategy,
                cv_scoring, static_param_space):
    """
    Trains a machine learning model, optimizes the hyperparameters, and saves the serialized model.

    The hyperparameters are tuned with hyperopt's TPE algorithm, minimizing 1 - mean cross validation score. The
    per-trial cross validation scores are written to <model_uid>/diagnostics/cv_scores/cv_scores.csv, and the
    pipeline refit on all of x_train with the best parameters is written to <model_uid>/models/model.pkl.

    :param x_train: x_train dataframe
    :param y_train: y_train series
    :param get_pipeline_function: callable that takes model to produce a scikit-learn pipeline
    :param model_uid: model uid; also used as the root directory for the saved output
    :param model: instantiated model
    :param param_space: the distribution of hyperparameters to search over
    :param iterations: number of trials to search for optimal hyperparameters
    :param cv_strategy: cross validation strategy
    :param cv_scoring: scoring method used for cross validation
    :param static_param_space: parameter search space valid for all models (e.g. feature engineering)
    :returns: scikit-learn pipeline
    """
    print(f'training {model_uid}...')
    pipeline = get_pipeline_function(model)

    # Merge into a new dict rather than calling update() so the caller's param_space is not mutated.
    search_space = param_space
    if static_param_space:
        search_space = {**param_space, **static_param_space}

    # One dict per trial; turned into a dataframe once the search has finished.
    cv_score_rows = []

    def _model_objective(params):
        pipeline.set_params(**params)
        score = cross_val_score(pipeline, x_train, y_train, cv=cv_strategy, scoring=cv_scoring, n_jobs=-1)

        # Column order: hyperparameters, then fold_0..fold_k, then the summary statistics.
        row = dict(params)
        row.update({f'fold_{fold}': fold_score for fold, fold_score in enumerate(score)})
        row['mean'] = score.mean()
        # Sample standard deviation (ddof=1, the pandas default) over the fold scores only.
        row['std'] = score.std(ddof=1)
        cv_score_rows.append(row)

        # hyperopt minimizes the objective, so turn the score into a loss.
        return 1 - score.mean()

    trials = Trials()
    best = fmin(_model_objective, search_space, algo=tpe.suggest, max_evals=iterations, trials=trials)
    # fmin reports hp.choice selections as indices; space_eval maps them back to the actual values.
    best_params = space_eval(search_space, best)

    cv_scores_df = pd.DataFrame(cv_score_rows)
    cv_scores_df = cv_scores_df.sort_values(by=['mean'], ascending=False)
    cv_scores_df = cv_scores_df.reset_index(drop=True)
    # After sorting, the positional index is the 0-based ranking of each trial.
    cv_scores_df = cv_scores_df.rename_axis('ranking').reset_index()

    cv_scores_directory = os.path.join(model_uid, 'diagnostics', 'cv_scores')
    os.makedirs(cv_scores_directory, exist_ok=True)
    cv_scores_df.to_csv(os.path.join(cv_scores_directory, 'cv_scores.csv'), index=False)

    pipeline.set_params(**best_params)
    pipeline.fit(x_train, y_train)

    models_directory = os.path.join(model_uid, 'models')
    os.makedirs(models_directory, exist_ok=True)
    # joblib.dump takes the object to serialize first and the destination path second.
    joblib.dump(pipeline, os.path.join(models_directory, 'model.pkl'))
    return pipeline
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment