Skip to content

Instantly share code, notes, and snippets.

@yudhiesh
Last active November 25, 2021 06:15
Show Gist options
  • Save yudhiesh/0a0c96955ab53ce65fbff7295e7b3055 to your computer and use it in GitHub Desktop.
Save yudhiesh/0a0c96955ab53ce65fbff7295e7b3055 to your computer and use it in GitHub Desktop.
import lightgbm as lgb
import mlflow
import optuna
from optuna.integration import LightGBMPruningCallback
from datetime import datetime
from sklearn.metrics import roc_auc_score, log_loss
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
def get_train_valid(dataset):
    """Split ``dataset`` into train and held-out parts.

    Every column except the last is used as a feature; the last column is
    the target. A quarter of the rows is held out, and the split is seeded
    so repeated calls on the same data give the same partition.

    Args:
        dataset: Object supporting ``.iloc`` positional indexing
            (the script passes a pandas DataFrame).

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``.
    """
    features, target = dataset.iloc[:, :-1], dataset.iloc[:, -1]
    parts = train_test_split(features, target, test_size=0.25, random_state=42)
    # train_test_split hands back a list; callers receive a 4-tuple.
    train_features, held_out_features, train_target, held_out_target = parts
    return train_features, held_out_features, train_target, held_out_target
def get_study_results(study, objective, name):
    """Print a summary of an Optuna study and return the best model.

    Args:
        study: Study whose ``best_trial`` parameters are printed.
        objective: Objective instance that holds the winning model in its
            ``best_booster`` attribute.
        name: Label shown in the printed header.

    Returns:
        The object stored in ``objective.best_booster``.
    """
    print(f"Best trial - {name}:")
    winner = study.best_trial
    print(" Params: ")
    for param_name, param_value in winner.params.items():
        print(f" {param_name}: {param_value}")

    best_model = objective.best_booster
    validation_score = best_model.best_score_
    print(f"Best validation score: {validation_score}")
    return best_model
def run_experiment(n_trials, dataset, name):
    """
    Tune hyperparameters on ``dataset`` with an Optuna study.

    The study maximizes the value returned by an ``Objective`` built from
    ``dataset`` and uses a median pruner with 10 warm-up steps. The
    objective's ``callback`` is registered so it can track the best model.

    n_trials: number of trials to run
    dataset: data handed to ``Objective``
    name: label printed (title-cased) when the experiment starts
    returns: the study and objective instance
    """
    print(f"Running experiment for : {name.title()}")
    tuner = Objective(dataset=dataset)
    median_pruner = optuna.pruners.MedianPruner(n_warmup_steps=10)
    search = optuna.create_study(pruner=median_pruner, direction="maximize")
    search.optimize(tuner, n_trials=n_trials, callbacks=[tuner.callback])
    return search, tuner
class Objective:
    """
    Callable Optuna objective that tunes a LightGBM binary classifier.

    Each call splits ``dataset`` with ``get_train_valid``, fits one
    ``lgb.LGBMClassifier`` with the hyperparameters sampled for the trial,
    logs parameters and metrics to an MLflow run and returns the ROC AUC on
    the held-out part. Pass ``callback`` to ``study.optimize`` so the model of
    the best trial is kept in ``best_booster``.
    """

    def __init__(self, dataset):
        """Store the dataset and reset the model bookkeeping.

        Args:
            dataset: Data forwarded unchanged to ``get_train_valid`` on
                every trial.
        """
        # Model of the best trial seen so far; filled in by ``callback``.
        self.best_booster = None
        # Model fitted by the most recent call to ``__call__``.
        self._booster = None
        self.dataset = dataset

    def __call__(self, trial):
        """Train and score one model for ``trial``.

        Args:
            trial: Optuna trial used to sample the hyperparameters.

        Returns:
            ROC AUC of the fitted model on the held-out split.

        Note:
            The pruning callback may abort ``fit`` by raising Optuna's
            pruned-trial exception; it propagates out of this call.
        """
        X_train, X_test, y_train, y_test = get_train_valid(self.dataset)
        param_grid = {
            # Single-choice categoricals pin a value while still recording it
            # in ``trial.params``.
            "n_estimators": trial.suggest_categorical("n_estimators", [10000]),
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
            "num_leaves": trial.suggest_int("num_leaves", 20, 3000, step=20),
            "max_depth": trial.suggest_int("max_depth", 3, 12),
            # NOTE(review): a floor of 200 rows per leaf presumably targets a
            # large training set - confirm it suits the data being tuned.
            "min_data_in_leaf": trial.suggest_int(
                "min_data_in_leaf", 200, 10000, step=100
            ),
            "lambda_l1": trial.suggest_int("lambda_l1", 0, 100, step=5),
            "lambda_l2": trial.suggest_int("lambda_l2", 0, 100, step=5),
            "min_gain_to_split": trial.suggest_float("min_gain_to_split", 0, 15),
            # Upper bound is 0.9 (not 0.95): it is the largest value reachable
            # from 0.2 in steps of 0.1. Optuna truncates a non-divisible range
            # to the same value but emits a warning on every trial.
            "bagging_fraction": trial.suggest_float(
                "bagging_fraction", 0.2, 0.9, step=0.1
            ),
            "bagging_freq": trial.suggest_categorical("bagging_freq", [1]),
            "feature_fraction": trial.suggest_float(
                "feature_fraction", 0.2, 0.9, step=0.1
            ),
        }
        run_name = f"Test_{datetime.now()}"
        with mlflow.start_run(run_name=run_name):
            # Log the sampled parameters before fitting so that runs of
            # pruned trials still carry them.
            mlflow.log_params(trial.params)
            model = lgb.LGBMClassifier(objective="binary", **param_grid)
            model.fit(
                X_train,
                y_train,
                eval_set=[(X_test, y_test)],
                eval_metric=["auc", "binary_logloss"],
                callbacks=[
                    # Early stopping is passed as a callback: the
                    # ``early_stopping_rounds`` argument of ``fit`` was
                    # deprecated and later removed from LightGBM.
                    lgb.early_stopping(stopping_rounds=100),
                    LightGBMPruningCallback(trial, "auc"),
                ],
            )
            # Remember the fitted model so ``callback`` can promote it.
            self._booster = model
            y_pred = model.predict_proba(X_test)[:, 1]
            roc_auc = roc_auc_score(y_test, y_pred)
            log_loss_ = log_loss(y_test, y_pred)
            mlflow.log_metrics(
                {
                    "log_loss": log_loss_,
                    "roc_auc": roc_auc,
                },
            )
        return roc_auc

    def callback(self, study, trial):
        """Keep the most recently fitted model if ``trial`` is the study's best.

        Args:
            study: Study being optimized.
            trial: The trial that has just finished.
        """
        try:
            best_number = study.best_trial.number
        except ValueError:
            # ``best_trial`` raises while no trial has completed yet
            # (e.g. everything so far was pruned): nothing to promote.
            return
        if best_number == trial.number:
            self.best_booster = self._booster
if __name__ == "__main__":
    # Demo run: tune on scikit-learn's breast-cancer frame for 100 trials,
    # then print the best trial and keep its model.
    experiment_name = "Data1"
    cancer_frame = load_breast_cancer(as_frame=True).frame
    study, objective = run_experiment(
        n_trials=100, dataset=cancer_frame, name=experiment_name
    )
    best_model = get_study_results(
        study=study, objective=objective, name=experiment_name
    )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment