# Importing core libraries
import numpy as np
import pandas as pd
from time import time
import pprint
import joblib
# Suppressing warnings because of skopt verbosity
import warnings
warnings.filterwarnings("ignore")
# Our example dataset (load_boston was removed from recent scikit-learn
# releases, and this script needs a binary target, so a synthetic
# classification dataset is used as a stand-in)
from sklearn.datasets import make_classification
# Classifiers
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier
# Hyperparameters distributions
from scipy.stats import randint
from scipy.stats import uniform
# Model selection
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import cross_val_score
# Metrics
from sklearn.metrics import average_precision_score
from sklearn.metrics import make_scorer
# Skopt functions
from skopt import BayesSearchCV
from skopt import gp_minimize # Bayesian optimization using Gaussian Processes
from skopt.space import Real, Categorical, Integer
from skopt.utils import use_named_args # decorator to convert a list of parameters to named arguments
from skopt.callbacks import DeadlineStopper # Stop the optimization before running out of a fixed budget of time.
from skopt.callbacks import VerboseCallback # Callback to control the verbosity
from skopt.callbacks import DeltaXStopper # Stop the optimization if the distance between the last two points at which the objective has been evaluated is less than delta
# Reporting util for different optimizers
def report_perf(optimizer, X, y, title, callbacks=None):
    """
    A wrapper for measuring time and performance of different optimizers.

    optimizer = a sklearn or a skopt optimizer
    X = the training set
    y = our target
    title = a string label for the experiment
    """
    start = time()
    if callbacks:
        optimizer.fit(X, y, callback=callbacks)
    else:
        optimizer.fit(X, y)
    best_score = optimizer.best_score_
    best_score_std = optimizer.cv_results_['std_test_score'][optimizer.best_index_]
    best_params = optimizer.best_params_
    print((title + " took %.2f seconds, candidates checked: %d, best CV score: %.3f "
           + u"\u00B1" + " %.3f") % (time() - start,
                                     len(optimizer.cv_results_['params']),
                                     best_score,
                                     best_score_std))
    print('Best parameters:')
    pprint.pprint(best_params)
    print()
    return best_params
# Setting a 5-fold stratified cross-validation (note: shuffle=True)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
# Converting average precision score into a scorer suitable for model selection
# (newer scikit-learn deprecates needs_proba in favor of response_method='predict_proba')
avg_prec = make_scorer(average_precision_score, greater_is_better=True, needs_proba=True)
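# The original gist assumes X (features) and y_bin (a binary target) are
# already defined; as a stand-in, a synthetic imbalanced binary dataset:
X, y_bin = make_classification(n_samples=5000, n_features=20, n_informative=10,
                               weights=[0.9, 0.1], random_state=0)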
# LightGBM
clf = lgb.LGBMClassifier(boosting_type='gbdt',
                         class_weight='balanced',
                         objective='binary',
                         n_jobs=1,
                         verbose=0)
search_spaces = {
    'learning_rate': Real(0.01, 1.0, 'log-uniform'),
    'num_leaves': Integer(2, 500),
    'max_depth': Integer(0, 500),
    'min_child_samples': Integer(0, 200),  # minimal number of data in one leaf
    'max_bin': Integer(100, 100000),  # max number of bins that feature values will be bucketed into
    'subsample': Real(0.01, 1.0, 'uniform'),  # bagging fraction
    'subsample_freq': Integer(0, 10),  # frequency for bagging; 0 disables bagging
    'colsample_bytree': Real(0.01, 1.0, 'uniform'),  # fraction of features considered per tree
    'min_child_weight': Integer(0, 10),  # minimal sum of hessian in one leaf
    'subsample_for_bin': Integer(100000, 500000),  # number of samples used to construct histogram bins
    'reg_lambda': Real(1e-9, 1000, 'log-uniform'),  # L2 regularization
    'reg_alpha': Real(1e-9, 1.0, 'log-uniform'),  # L1 regularization
    'scale_pos_weight': Real(1e-6, 500, 'log-uniform'),
    'n_estimators': Integer(10, 10000)
}
opt = BayesSearchCV(clf,
                    search_spaces,
                    scoring=avg_prec,
                    cv=skf,
                    n_iter=40,
                    n_jobs=-1,
                    return_train_score=False,
                    refit=True,
                    optimizer_kwargs={'base_estimator': 'GP'},
                    random_state=22)
best_params = report_perf(opt, X, y_bin, 'LightGBM',
                          callbacks=[DeltaXStopper(0.001),
                                     DeadlineStopper(60 * 5)])
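# With refit=True, the search refits the best configuration on the full
# training data and exposes it as opt.best_estimator_ (a usage sketch;
# score held-out data in practice):
lgb_scores = opt.best_estimator_.predict_proba(X)[:, 1]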
# XGBoost
clf = xgb.XGBClassifier(n_jobs=1,
                        objective='binary:logistic',
                        verbosity=0,  # 'silent' was deprecated in favor of 'verbosity'
                        tree_method='approx')
search_spaces = {'learning_rate': Real(0.01, 1.0, 'log-uniform'),
                 'min_child_weight': Integer(0, 10),  # minimum sum of instance weight (hessian) in a child
                 'max_depth': Integer(0, 50),
                 'max_delta_step': Integer(0, 20),  # maximum delta step we allow each leaf output to be
                 'subsample': Real(0.01, 1.0, 'uniform'),
                 'colsample_bytree': Real(0.01, 1.0, 'uniform'),  # subsample ratio of columns per tree
                 'colsample_bylevel': Real(0.01, 1.0, 'uniform'),  # subsample ratio of columns per level
                 'reg_lambda': Real(1e-9, 1000, 'log-uniform'),  # L2 regularization
                 'reg_alpha': Real(1e-9, 1.0, 'log-uniform'),  # L1 regularization
                 'gamma': Real(1e-9, 0.5, 'log-uniform'),  # minimum loss reduction required for a split
                 'n_estimators': Integer(50, 100),
                 'scale_pos_weight': Real(1e-6, 500, 'log-uniform')}
opt = BayesSearchCV(clf,
                    search_spaces,
                    scoring=avg_prec,
                    cv=skf,
                    n_iter=40,
                    n_jobs=-1,
                    return_train_score=False,
                    refit=True,
                    optimizer_kwargs={'base_estimator': 'GP'},
                    random_state=22)
best_params = report_perf(opt, X, y_bin, 'XGBoost',
                          callbacks=[DeltaXStopper(0.001),
                                     DeadlineStopper(60 * 5)])
# CatBoost
clf = CatBoostClassifier(loss_function='Logloss',
                         verbose=False)
search_spaces = {'iterations': Integer(10, 100),
                 'depth': Integer(1, 8),
                 'learning_rate': Real(0.01, 1.0, 'log-uniform'),
                 'random_strength': Real(1e-9, 10, 'log-uniform'),  # randomness for scoring splits
                 'bagging_temperature': Real(0.0, 1.0),  # intensity of the Bayesian bootstrap
                 'border_count': Integer(1, 255),  # number of splits for numerical features
                 'l2_leaf_reg': Integer(2, 30),  # L2 regularization
                 'scale_pos_weight': Real(0.01, 10.0, 'uniform')}
opt = BayesSearchCV(clf,
                    search_spaces,
                    scoring=avg_prec,
                    cv=skf,
                    n_iter=40,
                    n_jobs=1,  # use just 1 job with CatBoost in order to avoid segmentation fault
                    return_train_score=False,
                    refit=True,
                    optimizer_kwargs={'base_estimator': 'GP'},
                    random_state=22)
best_params = report_perf(opt, X, y_bin, 'CatBoost',
                          callbacks=[DeltaXStopper(0.001),
                                     DeadlineStopper(60 * 5)])
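# A closing sketch: rebuild a model with its tuned parameters and re-check it
# under the same CV scheme with cross_val_score (imported above). Shown for
# CatBoost; the same pattern applies to the LightGBM and XGBoost results.
final_clf = CatBoostClassifier(loss_function='Logloss', verbose=False,
                               **best_params)
cv_scores = cross_val_score(final_clf, X, y_bin, scoring=avg_prec, cv=skf)
print(("CatBoost tuned CV average precision: %.3f " + u"\u00B1" + " %.3f")
      % (cv_scores.mean(), cv_scores.std()))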