Created
September 1, 2021 13:54
-
-
Save micahmelling/a648ef045e53cade03bcceedd1a63673 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import joblib | |
import os | |
from sklearn.model_selection import cross_val_score | |
from hyperopt import fmin, tpe, Trials, space_eval | |
def train_model(x_train, y_train, get_pipeline_function, model_uid, model, param_space, iterations, cv_strategy,
                cv_scoring, static_param_space):
    """
    Trains a machine learning model, optimizes the hyperparameters, and saves the serialized model.

    Hyperparameters are searched with hyperopt's TPE algorithm. Every trial's parameters and per-fold
    cross validation scores are written to ``<model_uid>/diagnostics/cv_scores/cv_scores.csv`` (best mean score
    first), and the pipeline refit with the best parameters is written to ``<model_uid>/models/model.pkl``.
    Both directories are created if they do not exist.

    :param x_train: x_train dataframe
    :param y_train: y_train series
    :param get_pipeline_function: callable that takes model to produce a scikit-learn pipeline
    :param model_uid: model uid; also used as the root directory for the saved outputs
    :param model: instantiated model
    :param param_space: the distribution of hyperparameters to search over; not modified by this function
    :param iterations: number of trials to search for optimal hyperparameters
    :param cv_strategy: cross validation strategy
    :param cv_scoring: scoring method used for cross validation; the search minimizes ``1 - mean score``, so this
    should be a metric where higher is better
    :param static_param_space: parameter search space valid for all models (e.g. feature engineering)
    :returns: scikit-learn pipeline fit on the full training data with the best parameters found
    """
    print(f'training {model_uid}...')
    pipeline = get_pipeline_function(model)

    # Merge into a new dict so the caller's param_space is not mutated across calls.
    search_space = param_space
    if static_param_space:
        search_space = {**param_space, **static_param_space}

    # One dict per trial; assembled into a dataframe once the search is finished.
    trial_records = []

    def _model_objective(params):
        pipeline.set_params(**params)
        score = cross_val_score(pipeline, x_train, y_train, cv=cv_strategy, scoring=cv_scoring, n_jobs=-1)

        record = dict(params)
        for fold_number, fold_score in enumerate(score):
            record[f'fold_{fold_number}'] = fold_score
        # Both statistics are computed from the fold scores only, so the mean does not leak into the std.
        record['mean'] = score.mean()
        record['std'] = score.std(ddof=1)
        trial_records.append(record)

        # hyperopt minimizes the objective, so turn the score into a loss.
        return 1 - score.mean()

    trials = Trials()
    best = fmin(_model_objective, search_space, algo=tpe.suggest, max_evals=iterations, trials=trials)
    best_params = space_eval(search_space, best)

    cv_scores_df = pd.DataFrame(trial_records)
    cv_scores_df = cv_scores_df.sort_values(by=['mean'], ascending=False)
    cv_scores_df = cv_scores_df.reset_index(drop=True)
    # Zero-based rank of each trial after sorting by mean score.
    cv_scores_df.insert(0, 'ranking', cv_scores_df.index)

    cv_scores_directory = os.path.join(model_uid, 'diagnostics', 'cv_scores')
    os.makedirs(cv_scores_directory, exist_ok=True)
    cv_scores_df.to_csv(os.path.join(cv_scores_directory, 'cv_scores.csv'), index=False)

    pipeline.set_params(**best_params)
    pipeline.fit(x_train, y_train)

    models_directory = os.path.join(model_uid, 'models')
    os.makedirs(models_directory, exist_ok=True)
    # joblib.dump takes the object to serialize first, then the destination path.
    joblib.dump(pipeline, os.path.join(models_directory, 'model.pkl'))
    return pipeline
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment