# Hyper-parameter optimization of a LightGBM model for the Santander competition
from warnings import filterwarnings
filterwarnings('ignore')
import pandas as pd
import numpy as np
from lightgbm import LGBMClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from skopt import gp_minimize, dump
from skopt.space import Real, Integer, Categorical
from skopt.plots import plot_convergence
from skopt.utils import use_named_args
# Load the competition data; the test set has no target and is only used
# for the final submission.
train = pd.read_csv('train.csv')
submissions = pd.read_csv('test.csv')
id_submissions = submissions['ID_code']
submissions.drop('ID_code', axis=1, inplace=True)

y = train['target'].values
train.drop(['ID_code', 'target'], axis=1, inplace=True)
X = train.values

# Fit the scaler on the training features only, then apply the same
# transform to the submission features. Refitting on the test set would
# apply a different scaling to the data we predict on.
scaler = MinMaxScaler()
X = scaler.fit_transform(X)
submissions = scaler.transform(submissions)

# The Santander target is heavily imbalanced, so stratify the hold-out split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y)
dim_boosting_type = Categorical(categories=['gbdt', 'dart', 'goss'], name='boosting_type')
dim_num_leaves = Integer(low=30, high=150, name='num_leaves')
dim_learning_rate = Real(low=1e-4, high=2e-1, prior='log-uniform', name='learning_rate')
dim_subsample_for_bin = Integer(low=20000, high=300000, name='subsample_for_bin')
dim_min_child_samples = Integer(low=20, high=500, name='min_child_samples')
dim_reg_alpha = Real(low=0, high=1, prior='uniform', name='reg_alpha')
dim_reg_lambda = Real(low=0, high=1, prior='uniform', name='reg_lambda')
dim_colsample_bytree = Real(low=0.6, high=1, prior='uniform', name='colsample_bytree')
dim_n_estimators = Integer(low=20, high=1000, name='n_estimators')
dimensions = [dim_boosting_type,
              dim_num_leaves,
              dim_learning_rate,
              dim_subsample_for_bin,
              dim_min_child_samples,
              dim_reg_alpha,
              dim_reg_lambda,
              dim_colsample_bytree,
              dim_n_estimators]
# Starting point for the search, in the same order as `dimensions`.
default_parameters = ['gbdt', 42, 1e-1, 180000, 40, 0.4, 0.5, 0.8, 200]

validation_data = [(X_test, y_test)]

# Track the best AUC and parameters seen across all objective calls.
best_auc = 0.0
best_parameters = [{}]
@use_named_args(dimensions=dimensions)
def fitness(boosting_type, num_leaves, learning_rate, subsample_for_bin,
            min_child_samples, reg_alpha, reg_lambda, colsample_bytree,
            n_estimators):
    print("boosting type:", boosting_type)
    print("num leaves:", num_leaves)
    print("learning rate: {:.7f}".format(learning_rate))
    print("subsample for bin:", subsample_for_bin)
    print("min child samples:", min_child_samples)
    print("reg alpha: {:.7f}".format(reg_alpha))
    print("reg lambda: {:.7f}".format(reg_lambda))
    print("colsample bytree: {:.7f}".format(colsample_bytree))
    print("n estimators:", n_estimators)
    print()

    model = LGBMClassifier(boosting_type=boosting_type, num_leaves=num_leaves,
                           learning_rate=learning_rate, subsample_for_bin=subsample_for_bin,
                           min_child_samples=min_child_samples, reg_alpha=reg_alpha,
                           reg_lambda=reg_lambda, colsample_bytree=colsample_bytree,
                           n_estimators=n_estimators)
    model.fit(X_train, y_train, eval_set=validation_data, eval_metric='logloss', verbose=40)

    # Score on the hold-out set; the competition metric is ROC AUC.
    pred = model.predict_proba(X_test)[:, 1]
    score = roc_auc_score(y_test, pred)

    print()
    print("AUC score: {:.8f}".format(score))
    print()

    global best_auc
    global best_parameters
    if score > best_auc:
        best_parameters[0] = model.get_params()
        best_auc = score

    del model
    # gp_minimize minimizes, so return the negated AUC.
    return -score
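# Optional sanity check: @use_named_args lets the objective be called with a
# plain list of values in the same order as `dimensions`, so the defaults can
# be smoke-tested before launching the full search, e.g.:
#
#     fitness(default_parameters)   # returns the negated hold-out AUC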
# Let's do this
search_result = gp_minimize(func=fitness,
                            dimensions=dimensions,
                            acq_func='EI',
                            n_calls=40,
                            x0=default_parameters)
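# The skopt helpers imported above can persist the result and visualize how the
# search converged; a minimal sketch (the output filename is arbitrary, not
# from the original gist):
import matplotlib.pyplot as plt

plot_convergence(search_result)
plt.show()

# store_objective=False avoids pickling the objective function's closures.
dump(search_result, 'gp_minimize_result.pkl', store_objective=False)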
print()
print("Best AUC:", best_auc)
print()
print("Best parameters:", best_parameters[0])
print()
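# The scaled submission features loaded at the top are never used by the
# search itself. A minimal sketch of producing a submission file from the
# best parameters found (`final_model`, `submission_df`, and the output
# filename are illustrative assumptions):
final_model = LGBMClassifier(**best_parameters[0])
final_model.fit(X, y)  # retrain on the full training set

submission_df = pd.DataFrame({'ID_code': id_submissions,
                              'target': final_model.predict_proba(submissions)[:, 1]})
submission_df.to_csv('submission.csv', index=False)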