import os

import joblib
from sklearn.base import clone
from sklearn.calibration import CalibratedClassifierCV
from sklearn.model_selection import cross_val_score
from ultraopt import fmin
from ultraopt.hdl import layering_config
from ultraopt.multi_fidelity import HyperBandIterGenerator

from modeling.config import MODELS_DIRECTORY


def train_model(x_train, y_train, get_pipeline_function, model_name, model, param_space, n_trials, cv_times,
                scoring):
    """
    Trains a machine learning model, optimizes its hyperparameters with UltraOpt, refits the pipeline on the
    full training data using the best configuration found, and saves the serialized pipeline to
    <model_name>/<MODELS_DIRECTORY>/<model_name>.pkl.

    :param x_train: x_train dataframe
    :param y_train: y_train series
    :param get_pipeline_function: callable that takes model to produce a scikit-learn pipeline
    :param model_name: name of the model; used in the log message and in the output path
    :param model: instantiated model
    :param param_space: the distribution of hyperparameters to search over (passed to fmin as config_space)
    :param n_trials: number of optimizer iterations (passed to fmin as n_iterations)
    :param cv_times: cross validation setting passed to cross_val_score as ``cv``
    :param scoring: scoring method used for cross validation
    :returns: scikit-learn pipeline fitted on x_train / y_train with the best configuration
    """
    print(f'training {model_name}...')
    pipeline = get_pipeline_function(model)

    def _evaluate(config):
        # Evaluate on a clone so a trial never leaves its parameters behind on the pipeline that is
        # refit at the end; cross_val_score clones the estimator anyway, so it must already be clonable.
        candidate = clone(pipeline).set_params(**layering_config(config))
        scores = cross_val_score(candidate, x_train, y_train, scoring=scoring, cv=cv_times, n_jobs=-1)
        # fmin minimizes, so turn the mean cross validation score into a loss.
        return 1 - float(scores.mean())

    # NOTE(review): _evaluate only receives the config, so the HyperBand budget does not change how a
    # configuration is evaluated here -- confirm this is intended.
    hb = HyperBandIterGenerator(min_budget=1/4, max_budget=1, eta=2)
    result = fmin(eval_func=_evaluate, config_space=param_space, optimizer="ETPE", n_iterations=n_trials,
                  multi_fidelity_iter_generator=hb)

    # Apply the best configuration through the same layering_config transformation used during the
    # search, so the refit pipeline has the parameters that were actually scored.
    pipeline.set_params(**layering_config(result.best_config))
    pipeline.fit(x_train, y_train)

    # NOTE(review): the target directory is not created here and must already exist; the path order
    # (model_name first, then MODELS_DIRECTORY) is kept as in the original -- confirm the layout.
    joblib.dump(pipeline, os.path.join(model_name, MODELS_DIRECTORY, f'{model_name}.pkl'), compress=3)
    return pipeline