Skip to content

Instantly share code, notes, and snippets.

@micahmelling
Created July 13, 2021 02:36
import joblib
import os
from sklearn.calibration import CalibratedClassifierCV
from sklearn.model_selection import cross_val_score
from ultraopt.hdl import layering_config
from ultraopt import fmin
from ultraopt.multi_fidelity import HyperBandIterGenerator
from modeling.config import MODELS_DIRECTORY
def train_model(x_train, y_train, get_pipeline_function, model_name, model, param_space, n_trials, cv_times, scoring):
    """
    Tunes the hyperparameters of a scikit-learn pipeline with UltraOpt, refits the pipeline on the full
    training data using the best configuration found, and saves the serialized pipeline to
    <model_name>/<MODELS_DIRECTORY>/<model_name>.pkl (the directory is created if it does not exist).

    :param x_train: x_train dataframe
    :param y_train: y_train series
    :param get_pipeline_function: callable that takes model to produce a scikit-learn pipeline
    :param model_name: name of the model; used in the log message and in the output path
    :param model: instantiated model
    :param param_space: the hyperparameter search space handed to ultraopt's fmin as config_space
    :param n_trials: number of optimizer iterations (passed to fmin as n_iterations)
    :param cv_times: value passed as the cv argument of cross_val_score (e.g. the number of folds)
    :param scoring: scoring method used for cross validation
    :returns: scikit-learn pipeline fitted on x_train / y_train with the best configuration
    """
    print(f'training {model_name}...')
    pipeline = get_pipeline_function(model)

    def _evaluate(config):
        # set_params mutates and returns the shared pipeline object, so local_pipe is the same
        # instance as pipeline, just re-parameterized for this trial.
        # NOTE(review): this objective takes no budget argument, so the HyperBand budget is not
        # used to reduce the cost of a trial -- confirm this is intended.
        local_pipe = pipeline.set_params(**layering_config(config))
        # The mean cross validation score is turned into a loss (1 - score) for fmin.
        return 1 - float(cross_val_score(local_pipe, x_train, y_train, scoring=scoring, cv=cv_times, n_jobs=-1).mean())

    hb = HyperBandIterGenerator(min_budget=1/4, max_budget=1, eta=2)
    result = fmin(eval_func=_evaluate, config_space=param_space, optimizer="ETPE", n_iterations=n_trials,
                  multi_fidelity_iter_generator=hb)
    best_config = result.best_config
    # Apply the winning configuration the same way trial configurations are applied in _evaluate,
    # so the final fit uses exactly the parameter layout that was scored during the search.
    pipeline.set_params(**layering_config(best_config))
    pipeline.fit(x_train, y_train)

    output_directory = os.path.join(model_name, MODELS_DIRECTORY)
    # joblib.dump does not create missing parent directories.
    os.makedirs(output_directory, exist_ok=True)
    joblib.dump(pipeline, os.path.join(output_directory, f'{model_name}.pkl'), compress=3)
    return pipeline
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment