# Importing core libraries
import numpy as np
import pandas as pd
from time import time
import pprint
import joblib
# Suppressing warnings because of skopt verbosity
import warnings
warnings.filterwarnings("ignore")
# Our example dataset (load_boston was removed from recent scikit-learn
# releases, and this script needs a binary target, so a synthetic
# classification dataset is used as a stand-in)
from sklearn.datasets import make_classification
# Classifiers
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier
# Hyperparameters distributions
from scipy.stats import randint
from scipy.stats import uniform
# Model selection
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import cross_val_score
# Metrics
from sklearn.metrics import average_precision_score
from sklearn.metrics import make_scorer
# Skopt functions
from skopt import BayesSearchCV
from skopt import gp_minimize # Bayesian optimization using Gaussian Processes
from skopt.space import Real, Categorical, Integer
from skopt.utils import use_named_args # decorator to convert a list of parameters to named arguments
from skopt.callbacks import DeadlineStopper # Stop the optimization before running out of a fixed budget of time.
from skopt.callbacks import VerboseCallback # Callback to control the verbosity
from skopt.callbacks import DeltaXStopper # Stop the optimization if the distance between the last two points at which the objective has been evaluated is less than delta
# Reporting util for different optimizers
def report_perf(optimizer, X, y, title, callbacks=None):
    """
    A wrapper for measuring time and performance of different optimizers.

    optimizer = a sklearn or a skopt optimizer
    X = the training set
    y = our target
    title = a string label for the experiment
    """
    start = time()
    if callbacks:
        optimizer.fit(X, y, callback=callbacks)
    else:
        optimizer.fit(X, y)
    best_score = optimizer.best_score_
    best_score_std = optimizer.cv_results_['std_test_score'][optimizer.best_index_]
    best_params = optimizer.best_params_
    print((title + " took %.2f seconds, candidates checked: %d, best CV score: %.3f "
           + u"\u00B1" + " %.3f") % (time() - start,
                                     len(optimizer.cv_results_['params']),
                                     best_score,
                                     best_score_std))
    print('Best parameters:')
    pprint.pprint(best_params)
    print()
    return best_params
# Setting a 5-fold stratified cross-validation (note: shuffle=True)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
# Converting average precision score into a scorer suitable for model selection
# (newer scikit-learn deprecates needs_proba in favor of response_method='predict_proba')
avg_prec = make_scorer(average_precision_score, greater_is_better=True, needs_proba=True)
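# The original gist assumes X (features) and y_bin (a binary target) are
# already defined; as a stand-in, a synthetic imbalanced binary dataset:
X, y_bin = make_classification(n_samples=5000, n_features=20, n_informative=10,
                               weights=[0.9, 0.1], random_state=0)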
# LightGBM
clf = lgb.LGBMClassifier(boosting_type='gbdt',
                         class_weight='balanced',
                         objective='binary',
                         n_jobs=1,
                         verbose=0)
search_spaces = {
    'learning_rate': Real(0.01, 1.0, 'log-uniform'),
    'num_leaves': Integer(2, 500),
    'max_depth': Integer(0, 500),
    'min_child_samples': Integer(0, 200),  # minimal number of data in one leaf
    'max_bin': Integer(100, 100000),  # max number of bins that feature values will be bucketed into
    'subsample': Real(0.01, 1.0, 'uniform'),  # bagging fraction
    'subsample_freq': Integer(0, 10),  # frequency for bagging; 0 disables bagging
    'colsample_bytree': Real(0.01, 1.0, 'uniform'),  # fraction of features considered per tree
    'min_child_weight': Integer(0, 10),  # minimal sum of hessian in one leaf
    'subsample_for_bin': Integer(100000, 500000),  # number of samples used to construct histogram bins
    'reg_lambda': Real(1e-9, 1000, 'log-uniform'),  # L2 regularization
    'reg_alpha': Real(1e-9, 1.0, 'log-uniform'),  # L1 regularization
    'scale_pos_weight': Real(1e-6, 500, 'log-uniform'),
    'n_estimators': Integer(10, 10000)
}
opt = BayesSearchCV(clf,
                    search_spaces,
                    scoring=avg_prec,
                    cv=skf,
                    n_iter=40,
                    n_jobs=-1,
                    return_train_score=False,
                    refit=True,
                    optimizer_kwargs={'base_estimator': 'GP'},
                    random_state=22)
best_params = report_perf(opt, X, y_bin, 'LightGBM',
                          callbacks=[DeltaXStopper(0.001),
                                     DeadlineStopper(60 * 5)])
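# With refit=True, the search refits the best configuration on the full
# training data and exposes it as opt.best_estimator_ (a usage sketch;
# score held-out data in practice):
lgb_scores = opt.best_estimator_.predict_proba(X)[:, 1]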
# XGBoost
clf = xgb.XGBClassifier(n_jobs=1,
                        objective='binary:logistic',
                        verbosity=0,  # 'silent' was deprecated in favor of 'verbosity'
                        tree_method='approx')
search_spaces = {'learning_rate': Real(0.01, 1.0, 'log-uniform'),
                 'min_child_weight': Integer(0, 10),  # minimum sum of instance weight (hessian) in a child
                 'max_depth': Integer(0, 50),
                 'max_delta_step': Integer(0, 20),  # maximum delta step we allow each leaf output to be
                 'subsample': Real(0.01, 1.0, 'uniform'),
                 'colsample_bytree': Real(0.01, 1.0, 'uniform'),  # subsample ratio of columns per tree
                 'colsample_bylevel': Real(0.01, 1.0, 'uniform'),  # subsample ratio of columns per level
                 'reg_lambda': Real(1e-9, 1000, 'log-uniform'),  # L2 regularization
                 'reg_alpha': Real(1e-9, 1.0, 'log-uniform'),  # L1 regularization
                 'gamma': Real(1e-9, 0.5, 'log-uniform'),  # minimum loss reduction required for a split
                 'n_estimators': Integer(50, 100),
                 'scale_pos_weight': Real(1e-6, 500, 'log-uniform')}
opt = BayesSearchCV(clf,
                    search_spaces,
                    scoring=avg_prec,
                    cv=skf,
                    n_iter=40,
                    n_jobs=-1,
                    return_train_score=False,
                    refit=True,
                    optimizer_kwargs={'base_estimator': 'GP'},
                    random_state=22)
best_params = report_perf(opt, X, y_bin, 'XGBoost',
                          callbacks=[DeltaXStopper(0.001),
                                     DeadlineStopper(60 * 5)])
# CatBoost
clf = CatBoostClassifier(loss_function='Logloss',
                         verbose=False)
search_spaces = {'iterations': Integer(10, 100),
                 'depth': Integer(1, 8),
                 'learning_rate': Real(0.01, 1.0, 'log-uniform'),
                 'random_strength': Real(1e-9, 10, 'log-uniform'),  # randomness for scoring splits
                 'bagging_temperature': Real(0.0, 1.0),  # intensity of the Bayesian bootstrap
                 'border_count': Integer(1, 255),  # number of splits for numerical features
                 'l2_leaf_reg': Integer(2, 30),  # L2 regularization
                 'scale_pos_weight': Real(0.01, 10.0, 'uniform')}
opt = BayesSearchCV(clf,
                    search_spaces,
                    scoring=avg_prec,
                    cv=skf,
                    n_iter=40,
                    n_jobs=1,  # use just 1 job with CatBoost in order to avoid segmentation fault
                    return_train_score=False,
                    refit=True,
                    optimizer_kwargs={'base_estimator': 'GP'},
                    random_state=22)
best_params = report_perf(opt, X, y_bin, 'CatBoost',
                          callbacks=[DeltaXStopper(0.001),
                                     DeadlineStopper(60 * 5)])
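# A closing sketch: rebuild a model with its tuned parameters and re-check it
# under the same CV scheme with cross_val_score (imported above). Shown for
# CatBoost; the same pattern applies to the LightGBM and XGBoost results.
final_clf = CatBoostClassifier(loss_function='Logloss', verbose=False,
                               **best_params)
cv_scores = cross_val_score(final_clf, X, y_bin, scoring=avg_prec, cv=skf)
print(("CatBoost tuned CV average precision: %.3f " + u"\u00B1" + " %.3f")
      % (cv_scores.mean(), cv_scores.std()))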