Created
February 20, 2019 23:59
-
-
Save matheushent/2087112ee7776ecfd7a00746e2f30891 to your computer and use it in GitHub Desktop.
Hyper-parameter optimization of a LightGBM model for the Santander competition
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from warnings import filterwarnings | |
filterwarnings('ignore') | |
import pandas as pd | |
import numpy as np | |
from lightgbm import LGBMClassifier | |
from sklearn.preprocessing import MinMaxScaler | |
from sklearn.model_selection import train_test_split | |
from sklearn.metrics import roc_auc_score | |
import skopt | |
from skopt import gp_minimize, forest_minimize | |
from skopt.space import Real, Integer, Categorical | |
from skopt.plots import plot_convergence | |
from skopt.utils import use_named_args | |
from skopt import dump, load | |
# Load the labelled data (`train.csv`) and the rows kept as `submissions`
# (`test.csv`).
train = pd.read_csv('train.csv')
submissions = pd.read_csv('test.csv')

# Keep the submission identifiers aside; they are not used as a feature.
id_submissions = submissions['ID_code']
submissions.drop('ID_code', axis=1, inplace=True)

# Separate the label vector from the feature matrix.
y = train['target'].values
train.drop('ID_code', axis=1, inplace=True)
train.drop('target', axis=1, inplace=True)
X = train.values

# Fit the scaler on the training features only and reuse that fitted min/max
# for the submission rows. Re-fitting on the submission data (as
# `fit_transform` would) maps the two sets onto different scales, so a model
# trained on `X` would see mis-scaled inputs at prediction time.
scaler = MinMaxScaler()
X = scaler.fit_transform(X)
submissions = scaler.transform(submissions.values)

# Hold out 30% of the labelled rows for scoring each candidate configuration.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
# Search space explored by the optimiser. Each dimension's `name` is the
# keyword argument of `fitness` that receives the sampled value.
dim_boosting_type = Categorical(
    name='boosting_type',
    categories=['gbdt', 'dart', 'goss'],
)
dim_num_leaves = Integer(name='num_leaves', low=30, high=150)
dim_learning_rate = Real(
    name='learning_rate',
    low=1e-4,
    high=2e-1,
    prior='log-uniform',  # sample the learning rate on a log scale
)
dim_subsample_for_bin = Integer(name='subsample_for_bin', low=20000, high=300000)
dim_min_child_samples = Integer(name='min_child_samples', low=20, high=500)
dim_reg_alpha = Real(name='reg_alpha', low=0, high=1, prior='uniform')
dim_reg_lambda = Real(name='reg_lambda', low=0, high=1, prior='uniform')
dim_colsample_bytree = Real(name='colsample_bytree', low=0.6, high=1, prior='uniform')
dim_n_estimators = Integer(name='n_estimators', low=20, high=1000)

# The order here fixes the order of values in every parameter vector,
# including `default_parameters` below.
dimensions = [
    dim_boosting_type,
    dim_num_leaves,
    dim_learning_rate,
    dim_subsample_for_bin,
    dim_min_child_samples,
    dim_reg_alpha,
    dim_reg_lambda,
    dim_colsample_bytree,
    dim_n_estimators,
]

# Starting point for the search, listed in the same order as `dimensions`.
default_parameters = [
    'gbdt',    # boosting_type
    42,        # num_leaves
    1e-1,      # learning_rate
    180000,    # subsample_for_bin
    40,        # min_child_samples
    0.4,       # reg_alpha
    0.5,       # reg_lambda
    0.8,       # colsample_bytree
    200,       # n_estimators
]

# Evaluation set handed to LightGBM during fitting.
validation_data = [(X_test, y_test)]

# Best score seen so far and the parameters that produced it; both are
# updated by `fitness`. The one-element list starts with a placeholder entry.
best_accuracy = 0.0
best_parameters = [{'teste': 1}]
@use_named_args(dimensions=dimensions)
def fitness(boosting_type, num_leaves, learning_rate, subsample_for_bin,
            min_child_samples, reg_alpha, reg_lambda, colsample_bytree,
            n_estimators):
    """Train one LightGBM configuration and return its negated hold-out AUC.

    The arguments are the hyper-parameters sampled from `dimensions`; they
    are forwarded unchanged to `LGBMClassifier`. The model is fitted on
    `X_train`/`y_train` and scored with ROC AUC on `X_test`/`y_test`.

    Side effects: prints the configuration and its score, and updates the
    module-level `best_accuracy` / `best_parameters` when the score improves
    on the best seen so far.

    Returns:
        The negative AUC, so that a minimiser maximises the AUC.
    """
    global best_accuracy
    global best_parameters

    # Log the configuration under evaluation.
    print("boosting type:", boosting_type)
    print("num leaves:", num_leaves)
    print("learning rate: {:.7f}".format(learning_rate))
    print("subsample for bin:", subsample_for_bin)
    print("min child samples:", min_child_samples)
    print("reg alpha: {:.7f}".format(reg_alpha))
    print("reg lambda: {:.7f}".format(reg_lambda))
    print("colsample bytree: {:.7f}".format(colsample_bytree))
    print("n estimators:", n_estimators)
    print()

    hyper_params = dict(
        boosting_type=boosting_type,
        num_leaves=num_leaves,
        learning_rate=learning_rate,
        subsample_for_bin=subsample_for_bin,
        min_child_samples=min_child_samples,
        reg_alpha=reg_alpha,
        reg_lambda=reg_lambda,
        colsample_bytree=colsample_bytree,
        n_estimators=n_estimators,
    )
    model = LGBMClassifier(**hyper_params)
    model.fit(
        X_train,
        y_train,
        eval_set=validation_data,
        eval_metric='logloss',
        verbose=40,
    )

    # Score with the second column of `predict_proba`.
    class_one_proba = model.predict_proba(X_test)[:, 1]
    score = roc_auc_score(y_test, class_one_proba)

    print()
    print("AUC score: {:.8f}".format(score))
    print()

    # Remember the best configuration seen across all evaluations.
    if score > best_accuracy:
        best_parameters[0] = model.get_params()
        best_accuracy = score

    return -score
# Run the search: 40 evaluations of `fitness` over `dimensions`, starting
# from `default_parameters`, with 'EI' as the acquisition function.
search_result = gp_minimize(
    fitness,
    dimensions,
    n_calls=40,
    x0=default_parameters,
    acq_func='EI',
)

# Report the best AUC and the parameters that achieved it, as recorded by
# `fitness` in the module-level trackers.
print()
print(best_accuracy)
print()
print()
print(best_parameters[0])
print()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment