Last active
February 2, 2020 18:44
-
-
Save aialenti/ad42b552524ad2440245f1831915a813 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Imports | |
import numpy as np | |
import pandas as pd | |
import xgboost as xgb | |
from interruptingcow import timeout | |
from sklearn.model_selection import KFold # import KFold | |
from sklearn.metrics import r2_score | |
import json | |
from preprocessing import preprocessing, xgb_r2_score # The preprocessing and the r2 evaluation | |
from generate_grid import get_grid_iterable # The grid | |
def grid_search(timeout_seconds, cv_splits, boosting_rounds): | |
# Read input data | |
X, y = preprocessing() | |
# Create dataframe to collect the results | |
tests_columns = ["test_nr", "cv_mean", "cv_min", "cv_max", "cv_median", "params"] | |
test_id = 0 | |
tests = pd.DataFrame(columns=tests_columns) | |
# Cross validation number of splits | |
kf = KFold(n_splits=cv_splits) | |
# Execute until timeout occurs | |
with timeout(timeout_seconds, exception=RuntimeError): | |
# Get the grid | |
grid_iter, keys, length = get_grid_iterable() | |
try: | |
# For every element of the grid | |
for df_grid in grid_iter: | |
# Prepare a list to collect the scores | |
score = [] | |
params = dict(zip(keys, df_grid)) | |
# The objective function | |
params["objective"] = "reg:squarederror" | |
# For each fold, train XGBoost and spit out the results | |
for train_index, test_index in kf.split(X.values): | |
# Get X train and X test | |
X_train, X_test = X.iloc[train_index], X.iloc[test_index] | |
# Get y train and y test | |
y_train, y_test = y.iloc[train_index], y.iloc[test_index] | |
# Convert into DMatrix | |
d_train = xgb.DMatrix(X_train, label=y_train) | |
d_valid = xgb.DMatrix(X_test, label=y_test) | |
d_test = xgb.DMatrix(X_test) | |
watchlist = [(d_train, 'train'), (d_valid, 'valid')] | |
# Create the classifier using the current grid params. Apply early stopping of 50 rounds | |
clf = xgb.train(params, d_train, boosting_rounds, watchlist, early_stopping_rounds=50, | |
feval=xgb_r2_score, maximize=True, verbose_eval=10) | |
y_hat = clf.predict(d_test) | |
# Append Scores on the fold kept out | |
score.append(r2_score(y_test, y_hat)) | |
# Store the result into a dataframe | |
score_df = pd.DataFrame(columns=tests_columns, data=[ | |
[test_id, np.mean(score), np.min(score), np.max(score), np.median(score), | |
json.dumps(dict(zip(keys, [str(g) for g in df_grid])))]]) | |
test_id += 1 | |
tests = pd.concat([tests, score_df]) | |
except RuntimeError: | |
# When timeout occurs an exception is raised and the main cycle is broken | |
pass | |
# Spit out the results | |
tests.to_csv("grid-search.csv", index=False) | |
print(tests) | |
if __name__ == "__main__": | |
grid_search(timeout_seconds=3600, cv_splits=4, boosting_rounds=500) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment