Last active
March 7, 2022 05:32
-
-
Save yudhiesh/346367b6b75fc97f8ab0e15c11101ec7 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import numpy as np | |
import lightgbm as lgb | |
import optuna | |
from optuna.integration import LightGBMPruningCallback | |
from sklearn.metrics import log_loss, roc_auc_score | |
def get_features_labels(df, target_columns):
    """
    Split a dataframe into features and labels.

    Args:
        df: DataFrame holding both the feature columns and the label column(s).
        target_columns: Label of the target column, or a list of labels when
            there is more than one target column.

    Returns:
        A ``(X, y)`` tuple. ``X`` contains every column of ``df`` that is not a
        target column; ``y`` is ``df.loc[:, target_columns]`` (a Series for a
        single label, a DataFrame for a list of labels).
    """
    # Normalise to a list so that a list of targets is excluded from the
    # features as well: comparing a column name to a list with ``!=`` is
    # always True, which would leak the labels into X.
    if isinstance(target_columns, list):
        targets = target_columns
    else:
        targets = [target_columns]
    feature_columns = [
        column for column in df.columns.tolist() if column not in targets
    ]
    X = df.loc[:, feature_columns]
    y = df.loc[:, target_columns]
    return X, y


def get_train_valid(train, valid, target_columns="target"):
    """
    Extract the features and labels from the train and valid datasets.

    Args:
        train: Training DataFrame (features plus label column(s)).
        valid: Validation DataFrame with the same columns as ``train``.
        target_columns: Label column name (or list of names) forwarded to
            ``get_features_labels``. The default ``"target"`` is an assumed
            column name; pass the real label column of your data. A missing
            column makes the ``.loc`` lookup in ``get_features_labels`` fail.

    Returns:
        The tuple ``(X_train, y_train, X_valid, y_valid)``.
    """
    # get_features_labels requires the target column; it has to be forwarded
    # explicitly, otherwise the call fails with a TypeError.
    X_train, y_train = get_features_labels(train, target_columns)
    X_valid, y_valid = get_features_labels(valid, target_columns)
    return (X_train, y_train, X_valid, y_valid)
def get_study_results(study, objective, name):
    """
    Print a summary of an Optuna study and return the best model found.

    Args:
        study: Study object; its ``best_trial.params`` mapping is printed.
        objective: Objective instance whose ``best_booster`` attribute holds
            the best model.
        name: Experiment name used in the printed header.

    Returns:
        The object stored in ``objective.best_booster``.
    """
    print(f"Best trial - {name}:")
    best = study.best_trial
    print(" Params: ")
    for param_name, param_value in best.params.items():
        print(f" {param_name}: {param_value}")
    winner = objective.best_booster
    print(f"Best validation score: {winner.best_score_}")
    return winner
def run_experiment(n_trials, train, valid, name):
    """
    Run a hyperparameter search of ``n_trials`` trials on the given data.

    Args:
        n_trials: Number of Optuna trials to run.
        train: Training dataset handed to ``Objective``.
        valid: Validation dataset handed to ``Objective``.
        name: Experiment name, printed (title-cased) before the search starts.

    Returns:
        A ``(study, objective)`` tuple: the Optuna study and the ``Objective``
        instance that was optimised.
    """
    print(f"Running experiment for : {name.title()}")
    tuner = Objective(train=train, valid=valid)
    # The study maximises the value returned by the objective and uses a
    # median pruner with 10 warm-up steps.
    median_pruner = optuna.pruners.MedianPruner(n_warmup_steps=10)
    search = optuna.create_study(pruner=median_pruner, direction="maximize")
    # The objective's own callback runs after every trial.
    search.optimize(tuner, n_trials=n_trials, callbacks=[tuner.callback])
    return search, tuner
class Objective:
    """
    Optuna objective that tunes a LightGBM binary classifier and remembers the
    model of the best trial.

    Calling the instance with a trial trains one ``LGBMClassifier`` and returns
    its ROC AUC on the validation data. ``callback`` is meant to be passed to
    ``study.optimize(..., callbacks=[objective.callback])`` so that
    ``best_booster`` always holds the model of the study's best trial.
    """

    def __init__(self, train, valid):
        """
        Store the datasets and split them into features and labels.

        Args:
            train: Training dataset passed to ``get_train_valid``.
            valid: Validation dataset passed to ``get_train_valid``.
        """
        # Model of the best trial so far; updated by ``callback``.
        self.best_booster = None
        # Model of the most recently finished trial; set by ``__call__``.
        self._booster = None
        self.train = train
        self.valid = valid
        self.X_train, self.y_train, self.X_valid, self.y_valid = get_train_valid(
            train=self.train,
            valid=self.valid,
        )

    def __call__(self, trial):
        """
        Train one model with hyperparameters suggested by ``trial``.

        Args:
            trial: Optuna trial used to sample the hyperparameters.

        Returns:
            ROC AUC of the trained model on the validation data.
        """
        param_grid = {
            # Fixed, large tree budget; early stopping decides the real count.
            "n_estimators": trial.suggest_categorical("n_estimators", [10000]),
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
            "num_leaves": trial.suggest_int("num_leaves", 20, 3000, step=20),
            "max_depth": trial.suggest_int("max_depth", 3, 12),
            "min_data_in_leaf": trial.suggest_int(
                "min_data_in_leaf", 200, 10000, step=100
            ),
            "lambda_l1": trial.suggest_int("lambda_l1", 0, 100, step=5),
            "lambda_l2": trial.suggest_int("lambda_l2", 0, 100, step=5),
            "min_gain_to_split": trial.suggest_float("min_gain_to_split", 0, 15),
            # Upper bound is 0.9 rather than 0.95: with step=0.1 starting at
            # 0.2 the largest reachable value is 0.9, and a range that is not
            # divisible by the step makes Optuna warn and truncate it anyway.
            "bagging_fraction": trial.suggest_float(
                "bagging_fraction", 0.2, 0.9, step=0.1
            ),
            "bagging_freq": trial.suggest_categorical("bagging_freq", [1]),
            "feature_fraction": trial.suggest_float(
                "feature_fraction", 0.2, 0.9, step=0.1
            ),
        }
        model = lgb.LGBMClassifier(objective="binary", **param_grid)
        model.fit(
            self.X_train,
            self.y_train,
            eval_set=[(self.X_valid, self.y_valid)],
            eval_metric=["auc", "binary_logloss"],
            callbacks=[
                # Early stopping is configured through the callback API; the
                # ``early_stopping_rounds`` fit() keyword is no longer
                # accepted by recent LightGBM releases.
                lgb.early_stopping(stopping_rounds=100),
                LightGBMPruningCallback(trial, "auc"),
            ],
        )
        # Remember this trial's model so ``callback`` can promote it.
        self._booster = model
        preds = model.predict_proba(self.X_valid)[:, 1]
        roc_auc = roc_auc_score(self.y_valid, preds)
        return roc_auc

    def callback(self, study, trial):
        """
        Keep ``best_booster`` in sync with the study's best trial.

        Args:
            study: The running Optuna study.
            trial: The trial that has just finished.
        """
        try:
            best_trial = study.best_trial
        except ValueError:
            # Optuna raises ValueError while no trial has completed yet
            # (e.g. the trial was pruned); there is nothing to record then.
            return
        # Compare by trial number, the stable identifier within a study.
        if best_trial.number == trial.number:
            self.best_booster = self._booster
# Sample use case
# Pass in the train and validation data with the name of the experiment.
# NOTE(review): data1_train and data1_valid are not defined in the visible
# source; they must exist before this line runs. Presumably they are pandas
# DataFrames that include the label column -- confirm against the data loading code.
study1, objective1 = run_experiment(n_trials=100, train=data1_train, valid=data1_valid, name="Data1")
# Get the results of the study and the best model from it.
# This prints the best trial's parameters and returns objective1.best_booster.
best_model1 = get_study_results(study=study1, objective=objective1, name="Data1")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment