yudhiesh/optuna_lightgbm.py

## optuna_lightgbm.py
import pandas as pd
import numpy as np
import lightgbm as lgb
import optuna
from optuna.integration import LightGBMPruningCallback
from sklearn.metrics import log_loss, roc_auc_score

def get_features_labels(df, target_columns):
    """
    Separates a dataframe into its feature columns and its label column

    df: dataframe holding both the features and the label
    target_columns: column label of the target; every column whose label
        differs from it is treated as a feature
    returns: (X, y) - the feature columns, in their original order, and the
        target column
    """
    feature_names = [name for name in df.columns.tolist() if name != target_columns]
    return df.loc[:, feature_names], df.loc[:, target_columns]

def get_train_valid(train, valid, target_columns="target"):
    """
    Extracts the features and labels from the train and valid datasets

    train: dataframe used for training
    valid: dataframe used for validation
    target_columns: label column forwarded to get_features_labels for both
        dataframes. NOTE(review): the default "target" is an assumption -
        pass the real label column name if the data uses a different one.
    returns: the features and labels for train and valid, as the tuple
        (X_train, y_train, X_valid, y_valid)
    """
    # get_features_labels has no default for the label column, so it has to be
    # passed explicitly; calling it with the dataframe alone raises TypeError.
    X_train, y_train = get_features_labels(train, target_columns)
    X_valid, y_valid = get_features_labels(valid, target_columns)
    return (X_train, y_train, X_valid, y_valid)

def get_study_results(study, objective, name):
    """
    Prints a summary of the best trial of an Optuna study and returns the
    model stored for it on the objective instance

    study: study whose best_trial is reported
    objective: objective instance; its best_booster attribute is returned
    name: experiment name used in the printed header
    returns: objective.best_booster
    """
    print(f"Best trial - {name}:")
    winning_trial = study.best_trial

    print("  Params: ")
    for param_name, param_value in winning_trial.params.items():
        print("    {}: {}".format(param_name, param_value))

    model = objective.best_booster
    print(f"Best validation score: {model.best_score_}")
    return model

def run_experiment(n_trials, train, valid, name):
    """
    Tunes hyperparameters on the train & valid dataset for n_trials trials

    n_trials: number of trials passed to study.optimize
    train: training data handed to the Objective
    valid: validation data handed to the Objective
    name: experiment name, only used in the printed banner
    returns: the study and objective instance
    """
    print(f"Running experiment for : {name.title()}")
    objective = Objective(train=train, valid=valid)

    # The objective returns an AUC, so the study maximizes it.
    median_pruner = optuna.pruners.MedianPruner(n_warmup_steps=10)
    study = optuna.create_study(pruner=median_pruner, direction="maximize")

    # objective.callback runs after each trial so the objective can keep the
    # model belonging to the best trial.
    study.optimize(
        objective,
        n_trials=n_trials,
        callbacks=[objective.callback],
    )
    return study, objective

class Objective:
    """
    Objective class to perform hyperparameter tuning using Optuna and keep track of the best model
    using the callback function.
    Takes as input the train and valid dataset to be used and returns the roc_auc score per trial
    which will be passed to an Optuna study

    Attributes set in __init__:
        best_booster: model fitted in the study's best trial so far (None until callback stores one)
        _booster: model fitted by the most recent completed __call__
        train, valid: the datasets as passed in
        X_train, y_train, X_valid, y_valid: features/labels produced by get_train_valid
    """

    def __init__(self, train, valid):
        """Stores the datasets and splits each of them into features and labels."""
        self.best_booster = None
        self._booster = None
        self.train = train
        self.valid = valid
        self.X_train, self.y_train, self.X_valid, self.y_valid = get_train_valid(
            train=self.train,
            valid=self.valid,
        )

    def __call__(self, trial):
        """
        Runs one trial: samples hyperparameters, fits an LGBMClassifier and
        scores it on the validation set.

        trial: Optuna trial used to sample the hyperparameters and to report
            intermediate values for pruning
        returns: ROC AUC of the predicted positive-class probabilities on the
            validation set
        """
        param_grid = {
            # Single-choice categoricals keep the value fixed while still
            # going through the trial's suggest API.
            "n_estimators": trial.suggest_categorical("n_estimators", [10000]),
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
            "num_leaves": trial.suggest_int("num_leaves", 20, 3000, step=20),
            "max_depth": trial.suggest_int("max_depth", 3, 12),
            "min_data_in_leaf": trial.suggest_int(
                "min_data_in_leaf", 200, 10000, step=100
            ),
            "lambda_l1": trial.suggest_int("lambda_l1", 0, 100, step=5),
            "lambda_l2": trial.suggest_int("lambda_l2", 0, 100, step=5),
            "min_gain_to_split": trial.suggest_float("min_gain_to_split", 0, 15),
            "bagging_fraction": trial.suggest_float(
                "bagging_fraction", 0.2, 0.95, step=0.1
            ),
            "bagging_freq": trial.suggest_categorical("bagging_freq", [1]),
            "feature_fraction": trial.suggest_float(
                "feature_fraction", 0.2, 0.95, step=0.1
            ),
        }
        model = lgb.LGBMClassifier(objective="binary", **param_grid)
        model.fit(
            self.X_train,
            self.y_train,
            eval_set=[(self.X_valid, self.y_valid)],
            eval_metric=["auc", "binary_logloss"],
            callbacks=[
                # Early stopping is requested through the callback because the
                # `early_stopping_rounds` keyword of fit() was deprecated in
                # LightGBM 3.3 and removed in 4.0.
                lgb.early_stopping(stopping_rounds=100),
                LightGBMPruningCallback(trial, "auc"),
            ],
        )

        # Remember the fitted model so callback() can promote it if this trial
        # turns out to be the best one.
        self._booster = model

        preds = model.predict_proba(self.X_valid)[:, 1]
        roc_auc = roc_auc_score(self.y_valid, preds)
        return roc_auc

    def callback(self, study, trial):
        """
        Study callback: keeps the most recently fitted model as best_booster
        when the given trial is the study's best trial.
        """
        if study.best_trial == trial:
            self.best_booster = self._booster

# Example usage
# NOTE: data1_train and data1_valid are not defined in this file; the train and
# validation data must already exist before these statements run.
# Run the experiment on the train and validation data under the name "Data1"
study1, objective1 = run_experiment(
    n_trials=100,
    train=data1_train,
    valid=data1_valid,
    name="Data1",
)
# Print the best trial of the study and fetch the model kept for it
best_model1 = get_study_results(
    study=study1,
    objective=objective1,
    name="Data1",
)
	import pandas as pd
	import numpy as np
	import lightgbm as lgb
	import optuna
	from optuna.integration import LightGBMPruningCallback
	from sklearn.metrics import log_loss, roc_auc_score

	def get_features_labels(df, target_columns):
	    """
	    Splits the dataframe into features and labels

	    df: dataframe holding both the features and the label
	    target_columns: column label of the target; every column whose label
	        differs from it is treated as a feature
	    returns: features and labels
	    """
	    feature_columns = [columns for columns in df.columns.tolist() if columns != target_columns]
	    X = df.loc[:, feature_columns]
	    y = df.loc[:, target_columns]
	    return X, y

	def get_train_valid(train, valid, target_columns="target"):
	    """
	    Extracts the features and labels from the train and valid datasets

	    train: dataframe used for training
	    valid: dataframe used for validation
	    target_columns: label column forwarded to get_features_labels for both
	        dataframes. NOTE(review): the default "target" is an assumption -
	        pass the real label column name if the data uses a different one.
	    returns: the features and labels for train and valid, as the tuple
	        (X_train, y_train, X_valid, y_valid)
	    """
	    # get_features_labels has no default for the label column, so it has to be
	    # passed explicitly; calling it with the dataframe alone raises TypeError.
	    X_train, y_train = get_features_labels(train, target_columns)
	    X_valid, y_valid = get_features_labels(valid, target_columns)
	    return (X_train, y_train, X_valid, y_valid)

	def get_study_results(study, objective, name):
	    """
	    Get the results of an Optuna study based on the study name and the objective instance

	    study: study whose best_trial is reported
	    objective: objective instance; its best_booster attribute is returned
	    name: experiment name used in the printed header
	    returns: objective.best_booster
	    """
	    print(f"Best trial - {name}:")
	    trial = study.best_trial

	    print(" Params: ")
	    for key, value in trial.params.items():
	        print(" {}: {}".format(key, value))
	    best_model = objective.best_booster
	    print(f"Best validation score: {best_model.best_score_}")
	    return best_model

	def run_experiment(n_trials, train, valid, name):
	    """
	    Runs an experiment for n_trials using the train & valid dataset

	    n_trials: number of trials passed to study.optimize
	    train: training data handed to the Objective
	    valid: validation data handed to the Objective
	    name: experiment name, only used in the printed banner
	    returns: the study and objective instance
	    """
	    print(f"Running experiment for : {name.title()}")
	    objective = Objective(train=train, valid=valid)
	    # The objective returns an AUC, so the study maximizes it.
	    study = optuna.create_study(
	        pruner=optuna.pruners.MedianPruner(n_warmup_steps=10),
	        direction="maximize",
	    )
	    # objective.callback runs after each trial so the objective can keep the
	    # model belonging to the best trial.
	    study.optimize(objective, n_trials=n_trials, callbacks=[objective.callback])
	    return study, objective

	class Objective:
	    """
	    Objective class to perform hyperparameter tuning using Optuna and keep track of the best model
	    using the callback function.
	    Takes as input the train and valid dataset to be used and returns the roc_auc score per trial
	    which will be passed to an Optuna study

	    Attributes set in __init__:
	        best_booster: model fitted in the study's best trial so far (None until callback stores one)
	        _booster: model fitted by the most recent completed __call__
	        train, valid: the datasets as passed in
	        X_train, y_train, X_valid, y_valid: features/labels produced by get_train_valid
	    """

	    def __init__(self, train, valid):
	        """Stores the datasets and splits each of them into features and labels."""
	        self.best_booster = None
	        self._booster = None
	        self.train = train
	        self.valid = valid
	        self.X_train, self.y_train, self.X_valid, self.y_valid = get_train_valid(
	            train=self.train,
	            valid=self.valid,
	        )

	    def __call__(self, trial):
	        """
	        Runs one trial: samples hyperparameters, fits an LGBMClassifier and
	        scores it on the validation set.

	        trial: Optuna trial used to sample the hyperparameters and to report
	            intermediate values for pruning
	        returns: ROC AUC of the predicted positive-class probabilities on the
	            validation set
	        """
	        param_grid = {
	            # Single-choice categoricals keep the value fixed while still
	            # going through the trial's suggest API.
	            "n_estimators": trial.suggest_categorical("n_estimators", [10000]),
	            "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
	            "num_leaves": trial.suggest_int("num_leaves", 20, 3000, step=20),
	            "max_depth": trial.suggest_int("max_depth", 3, 12),
	            "min_data_in_leaf": trial.suggest_int(
	                "min_data_in_leaf", 200, 10000, step=100
	            ),
	            "lambda_l1": trial.suggest_int("lambda_l1", 0, 100, step=5),
	            "lambda_l2": trial.suggest_int("lambda_l2", 0, 100, step=5),
	            "min_gain_to_split": trial.suggest_float("min_gain_to_split", 0, 15),
	            "bagging_fraction": trial.suggest_float(
	                "bagging_fraction", 0.2, 0.95, step=0.1
	            ),
	            "bagging_freq": trial.suggest_categorical("bagging_freq", [1]),
	            "feature_fraction": trial.suggest_float(
	                "feature_fraction", 0.2, 0.95, step=0.1
	            ),
	        }
	        model = lgb.LGBMClassifier(objective="binary", **param_grid)
	        model.fit(
	            self.X_train,
	            self.y_train,
	            eval_set=[(self.X_valid, self.y_valid)],
	            eval_metric=["auc", "binary_logloss"],
	            callbacks=[
	                # Early stopping is requested through the callback because the
	                # `early_stopping_rounds` keyword of fit() was deprecated in
	                # LightGBM 3.3 and removed in 4.0.
	                lgb.early_stopping(stopping_rounds=100),
	                LightGBMPruningCallback(trial, "auc"),
	            ],
	        )

	        # Remember the fitted model so callback() can promote it if this trial
	        # turns out to be the best one.
	        self._booster = model

	        preds = model.predict_proba(self.X_valid)[:, 1]
	        roc_auc = roc_auc_score(self.y_valid, preds)
	        return roc_auc

	    def callback(self, study, trial):
	        """
	        Study callback: keeps the most recently fitted model as best_booster
	        when the given trial is the study's best trial.
	        """
	        if study.best_trial == trial:
	            self.best_booster = self._booster

	# Example usage
	# NOTE: data1_train and data1_valid are not defined in this file; the train and
	# validation data must already exist before these statements run.
	# Run the experiment on the train and validation data under the name "Data1"
	study1, objective1 = run_experiment(
	    n_trials=100,
	    train=data1_train,
	    valid=data1_valid,
	    name="Data1",
	)
	# Print the best trial of the study and fetch the model kept for it
	best_model1 = get_study_results(
	    study=study1,
	    objective=objective1,
	    name="Data1",
	)