# yudhiesh/optuna_lightgbm_mlflow.py

## optuna_lightgbm_mlflow.py
import lightgbm as lgb
import mlflow
import optuna
from optuna.integration import LightGBMPruningCallback
from datetime import datetime
from sklearn.metrics import roc_auc_score, log_loss
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split


def get_train_valid(dataset):
    """Split a frame into training and validation features/labels.

    The last column of ``dataset`` is used as the target and every column
    before it as a feature.  A quarter of the rows is held out, using a
    fixed ``random_state`` so repeated calls use the same seed.

    Args:
        dataset: Object supporting positional ``.iloc`` indexing whose final
            column holds the label.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``.
    """
    features = dataset.iloc[:, :-1]
    target = dataset.iloc[:, -1]
    partitions = train_test_split(
        features, target, test_size=0.25, random_state=42
    )
    # train_test_split yields the four pieces in train/test order per input.
    return tuple(partitions)


def get_study_results(study, objective, name):
    """Print a summary of the study's best trial and return the best model.

    Args:
        study: Optuna study; its ``best_trial.params`` mapping is printed.
        objective: Objective instance exposing ``best_booster``.
        name: Label shown in the header line of the report.

    Returns:
        The object stored in ``objective.best_booster``.
    """
    print(f"Best trial - {name}:")
    winner = study.best_trial

    print("  Params: ")
    for param_name, param_value in winner.params.items():
        print(f"    {param_name}: {param_value}")

    champion = objective.best_booster
    validation_score = champion.best_score_
    print(f"Best validation score: {validation_score}")
    return champion


def run_experiment(n_trials, dataset, name):
    """Tune a model on ``dataset`` with an Optuna study.

    Builds an ``Objective`` for the dataset, creates a maximizing study that
    uses a median pruner with 10 warm-up steps, and optimizes it for
    ``n_trials`` trials with the objective's callback registered.

    Args:
        n_trials: Number of trials to run.
        dataset: Data handed to ``Objective``.
        name: Experiment label; only used (title-cased) in the banner line.

    Returns:
        Tuple ``(study, objective)``.
    """
    banner = f"Running experiment for : {name.title()}"
    print(banner)

    tuner = Objective(dataset=dataset)
    median_pruner = optuna.pruners.MedianPruner(n_warmup_steps=10)
    search = optuna.create_study(direction="maximize", pruner=median_pruner)
    search.optimize(tuner, callbacks=[tuner.callback], n_trials=n_trials)
    return search, tuner


class Objective:
    """Optuna objective that tunes a LightGBM classifier and keeps the best model.

    Calling an instance with a trial trains one ``LGBMClassifier`` on the
    training split of ``dataset`` inside its own MLflow run, logs the trial
    parameters plus validation log-loss / ROC AUC, and returns the ROC AUC
    as the value for the study.  Register :meth:`callback` with
    ``study.optimize`` so that ``best_booster`` follows the best trial.
    """

    def __init__(self, dataset):
        """Store the dataset; nothing is trained until a trial is run.

        Args:
            dataset: Frame passed to ``get_train_valid`` on every trial.
        """
        # Model belonging to the study's best trial (maintained by ``callback``).
        self.best_booster = None
        # Model of the most recent trial whose ``fit`` call returned.
        self._booster = None
        self.dataset = dataset

    def __call__(self, trial):
        """Train and score one model using hyperparameters drawn from ``trial``.

        Args:
            trial: Optuna trial used to sample hyperparameters; it is also
                handed to the pruning callback, which watches ``auc``.

        Returns:
            ROC AUC of the fitted model on the validation split.
        """
        X_train, X_test, y_train, y_test = get_train_valid(self.dataset)
        # NOTE(review): these ranges (e.g. min_data_in_leaf >= 200) look sized
        # for a large dataset -- confirm they suit the data being tuned.
        param_grid = {
            # Single-choice categoricals record a fixed value with the trial.
            "n_estimators": trial.suggest_categorical("n_estimators", [10000]),
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
            "num_leaves": trial.suggest_int("num_leaves", 20, 3000, step=20),
            "max_depth": trial.suggest_int("max_depth", 3, 12),
            "min_data_in_leaf": trial.suggest_int(
                "min_data_in_leaf", 200, 10000, step=100
            ),
            "lambda_l1": trial.suggest_int("lambda_l1", 0, 100, step=5),
            "lambda_l2": trial.suggest_int("lambda_l2", 0, 100, step=5),
            "min_gain_to_split": trial.suggest_float("min_gain_to_split", 0, 15),
            # Upper bound is 0.9 rather than 0.95: with step=0.1 Optuna
            # truncates a non-divisible range to 0.9 anyway and warns on
            # every trial, so the sampled values are unchanged.
            "bagging_fraction": trial.suggest_float(
                "bagging_fraction", 0.2, 0.9, step=0.1
            ),
            "bagging_freq": trial.suggest_categorical("bagging_freq", [1]),
            "feature_fraction": trial.suggest_float(
                "feature_fraction", 0.2, 0.9, step=0.1
            ),
        }
        run_name = f"Test_{datetime.now()}"
        with mlflow.start_run(run_name=run_name):
            # Log the sampled parameters before training so that runs ended
            # early by the pruning callback still record what was tried.
            mlflow.log_params(trial.params)

            model = lgb.LGBMClassifier(objective="binary", **param_grid)
            model.fit(
                X_train,
                y_train,
                eval_set=[(X_test, y_test)],
                eval_metric=["auc", "binary_logloss"],
                callbacks=[
                    # Early stopping is supplied as a callback because the
                    # ``early_stopping_rounds`` fit argument is deprecated
                    # and no longer accepted by LightGBM 4.x.
                    lgb.early_stopping(stopping_rounds=100),
                    LightGBMPruningCallback(trial, "auc"),
                ],
            )
            self._booster = model

            y_pred = model.predict_proba(X_test)[:, 1]
            roc_auc = roc_auc_score(y_test, y_pred)
            log_loss_ = log_loss(y_test, y_pred)
            mlflow.log_metrics(
                {
                    "log_loss": log_loss_,
                    "roc_auc": roc_auc,
                },
            )
            return roc_auc

    def callback(self, study, trial):
        """Promote the latest model to ``best_booster`` if its trial is the best.

        Intended for the ``callbacks`` argument of ``study.optimize``.

        Args:
            study: Study being optimized.
            trial: The trial that has just finished.
        """
        try:
            best_trial = study.best_trial
        except ValueError:
            # Optuna raises ValueError while no trial has completed yet
            # (e.g. the first trial failed or was pruned); nothing to promote.
            return
        # Trial numbers are unique within a study, so this identifies the trial.
        if best_trial.number == trial.number:
            self.best_booster = self._booster


if __name__ == "__main__":
    # Script entry point: tune on scikit-learn's breast-cancer frame for 100
    # trials, then print the best trial and keep its model in ``best_model``.
    experiment_name = "Data1"
    cancer_frame = load_breast_cancer(as_frame=True).frame

    study, objective = run_experiment(
        n_trials=100, dataset=cancer_frame, name=experiment_name
    )
    best_model = get_study_results(
        study=study, objective=objective, name=experiment_name
    )
	import lightgbm as lgb
	import mlflow
	import optuna
	from optuna.integration import LightGBMPruningCallback
	from datetime import datetime
	from sklearn.metrics import roc_auc_score, log_loss
	from sklearn.datasets import load_breast_cancer
	from sklearn.model_selection import train_test_split


	def get_train_valid(dataset):
	    """Split a frame into training and validation features/labels.

	    The last column of ``dataset`` is used as the target and every column
	    before it as a feature.  A quarter of the rows is held out, using a
	    fixed ``random_state`` so repeated calls use the same seed.

	    Args:
	        dataset: Object supporting positional ``.iloc`` indexing whose
	            final column holds the label.

	    Returns:
	        Tuple ``(X_train, X_test, y_train, y_test)``.
	    """
	    X = dataset.iloc[:, :-1]
	    y = dataset.iloc[:, -1]
	    X_train, X_test, y_train, y_test = train_test_split(
	        X, y, test_size=0.25, random_state=42
	    )
	    return X_train, X_test, y_train, y_test


	def get_study_results(study, objective, name):
	    """Print a summary of the study's best trial and return the best model.

	    Args:
	        study: Optuna study; its ``best_trial.params`` mapping is printed.
	        objective: Objective instance exposing ``best_booster``.
	        name: Label shown in the header line of the report.

	    Returns:
	        The object stored in ``objective.best_booster``.
	    """
	    print(f"Best trial - {name}:")
	    trial = study.best_trial

	    print(" Params: ")
	    for key, value in trial.params.items():
	        print(f" {key}: {value}")
	    best_model = objective.best_booster
	    print(f"Best validation score: {best_model.best_score_}")
	    return best_model


	def run_experiment(n_trials, dataset, name):
	    """Tune a model on ``dataset`` with an Optuna study.

	    Builds an ``Objective`` for the dataset, creates a maximizing study
	    that uses a median pruner with 10 warm-up steps, and optimizes it for
	    ``n_trials`` trials with the objective's callback registered.

	    Args:
	        n_trials: Number of trials to run.
	        dataset: Data handed to ``Objective``.
	        name: Experiment label; only used (title-cased) in the banner line.

	    Returns:
	        Tuple ``(study, objective)``.
	    """
	    print(f"Running experiment for : {name.title()}")
	    objective = Objective(dataset=dataset)
	    study = optuna.create_study(
	        pruner=optuna.pruners.MedianPruner(n_warmup_steps=10),
	        direction="maximize",
	    )
	    study.optimize(objective, n_trials=n_trials, callbacks=[objective.callback])
	    return study, objective


	class Objective:
	    """Optuna objective that tunes a LightGBM classifier and keeps the best model.

	    Calling an instance with a trial trains one ``LGBMClassifier`` on the
	    training split of ``dataset`` inside its own MLflow run, logs the trial
	    parameters plus validation log-loss / ROC AUC, and returns the ROC AUC
	    as the value for the study.  Register :meth:`callback` with
	    ``study.optimize`` so that ``best_booster`` follows the best trial.
	    """

	    def __init__(self, dataset):
	        """Store the dataset; nothing is trained until a trial is run.

	        Args:
	            dataset: Frame passed to ``get_train_valid`` on every trial.
	        """
	        # Model belonging to the study's best trial (maintained by ``callback``).
	        self.best_booster = None
	        # Model of the most recent trial whose ``fit`` call returned.
	        self._booster = None
	        self.dataset = dataset

	    def __call__(self, trial):
	        """Train and score one model using hyperparameters drawn from ``trial``.

	        Args:
	            trial: Optuna trial used to sample hyperparameters; it is also
	                handed to the pruning callback, which watches ``auc``.

	        Returns:
	            ROC AUC of the fitted model on the validation split.
	        """
	        X_train, X_test, y_train, y_test = get_train_valid(self.dataset)
	        # NOTE(review): these ranges (e.g. min_data_in_leaf >= 200) look sized
	        # for a large dataset -- confirm they suit the data being tuned.
	        param_grid = {
	            # Single-choice categoricals record a fixed value with the trial.
	            "n_estimators": trial.suggest_categorical("n_estimators", [10000]),
	            "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
	            "num_leaves": trial.suggest_int("num_leaves", 20, 3000, step=20),
	            "max_depth": trial.suggest_int("max_depth", 3, 12),
	            "min_data_in_leaf": trial.suggest_int(
	                "min_data_in_leaf", 200, 10000, step=100
	            ),
	            "lambda_l1": trial.suggest_int("lambda_l1", 0, 100, step=5),
	            "lambda_l2": trial.suggest_int("lambda_l2", 0, 100, step=5),
	            "min_gain_to_split": trial.suggest_float("min_gain_to_split", 0, 15),
	            # Upper bound is 0.9 rather than 0.95: with step=0.1 Optuna
	            # truncates a non-divisible range to 0.9 anyway and warns on
	            # every trial, so the sampled values are unchanged.
	            "bagging_fraction": trial.suggest_float(
	                "bagging_fraction", 0.2, 0.9, step=0.1
	            ),
	            "bagging_freq": trial.suggest_categorical("bagging_freq", [1]),
	            "feature_fraction": trial.suggest_float(
	                "feature_fraction", 0.2, 0.9, step=0.1
	            ),
	        }
	        run_name = f"Test_{datetime.now()}"
	        with mlflow.start_run(run_name=run_name):
	            # Log the sampled parameters before training so that runs ended
	            # early by the pruning callback still record what was tried.
	            mlflow.log_params(trial.params)

	            model = lgb.LGBMClassifier(objective="binary", **param_grid)
	            model.fit(
	                X_train,
	                y_train,
	                eval_set=[(X_test, y_test)],
	                eval_metric=["auc", "binary_logloss"],
	                callbacks=[
	                    # Early stopping is supplied as a callback because the
	                    # ``early_stopping_rounds`` fit argument is deprecated
	                    # and no longer accepted by LightGBM 4.x.
	                    lgb.early_stopping(stopping_rounds=100),
	                    LightGBMPruningCallback(trial, "auc"),
	                ],
	            )
	            self._booster = model

	            y_pred = model.predict_proba(X_test)[:, 1]
	            roc_auc = roc_auc_score(y_test, y_pred)
	            log_loss_ = log_loss(y_test, y_pred)
	            mlflow.log_metrics(
	                {
	                    "log_loss": log_loss_,
	                    "roc_auc": roc_auc,
	                },
	            )
	            return roc_auc

	    def callback(self, study, trial):
	        """Promote the latest model to ``best_booster`` if its trial is the best.

	        Intended for the ``callbacks`` argument of ``study.optimize``.

	        Args:
	            study: Study being optimized.
	            trial: The trial that has just finished.
	        """
	        try:
	            best_trial = study.best_trial
	        except ValueError:
	            # Optuna raises ValueError while no trial has completed yet
	            # (e.g. the first trial failed or was pruned); nothing to promote.
	            return
	        # Trial numbers are unique within a study, so this identifies the trial.
	        if best_trial.number == trial.number:
	            self.best_booster = self._booster


	if __name__ == "__main__":
	    # Script entry point: tune on scikit-learn's breast-cancer frame for 100
	    # trials, then print the best trial and keep its model in ``best_model``.
	    study, objective = run_experiment(
	        n_trials=100,
	        dataset=load_breast_cancer(as_frame=True).frame,
	        name="Data1",
	    )
	    best_model = get_study_results(
	        study=study,
	        objective=objective,
	        name="Data1",
	    )