from bayes_opt import BayesianOptimization
# NOTE(review): `sklearn.cross_validation` was removed in scikit-learn 0.20.
# Moving to `sklearn.model_selection.KFold` also requires changing the call in
# kFoldValidation to `KFold(n_splits=..., shuffle=True).split(train)`; left
# as-is here so the file keeps working on the library versions it targets.
from sklearn.cross_validation import KFold
import numpy as np
import xgboost as xgb


def xgbCv(train, features, numRounds, eta, gamma, maxDepth, minChildWeight,
          subsample, colSample):
    """Objective function for the Bayesian optimizer.

    Builds an XGBoost parameter dict from the proposed hyper-parameters, runs
    a 3-fold cross validation via ``kFoldValidation`` and prints the score.

    Args:
        train: Training data; indexed by column name in ``kFoldValidation``.
        features: Column names used as model inputs.
        numRounds: Maximum number of boosting rounds (cast to ``int``).
        eta: Learning rate.
        gamma: Value passed as XGBoost ``gamma``.
        maxDepth: Maximum tree depth (cast to ``int``).
        minChildWeight: Value passed as XGBoost ``min_child_weight``.
        subsample: Value passed as XGBoost ``subsample``.
        colSample: Value passed as XGBoost ``colsample_bytree``.

    Returns:
        The negated mean CV score, so that maximizing the return value
        minimizes the MAE.
    """
    # prepare xgb parameters
    params = {
        "objective": "reg:linear",
        "booster": "gbtree",
        "eval_metric": "mae",
        "tree_method": 'auto',
        "silent": 1,
        "eta": eta,
        # The optimizer proposes floats; depth and round count must be ints.
        "max_depth": int(maxDepth),
        "min_child_weight": minChildWeight,
        "subsample": subsample,
        "colsample_bytree": colSample,
        "gamma": gamma,
    }

    cvScore = kFoldValidation(train, features, params, int(numRounds),
                              nFolds=3)
    print('CV score: {:.6f}'.format(cvScore))

    # invert the cv score to let bayopt maximize
    return -1.0 * cvScore


def bayesOpt(train, features):
    """Search XGBoost hyper-parameters with Bayesian optimization.

    Runs ``BayesianOptimization`` over fixed ranges using ``xgbCv`` as the
    objective, then prints the best MAE found and the matching parameters.

    Args:
        train: Training data forwarded to ``xgbCv``.
        features: Column names forwarded to ``xgbCv``.
    """
    ranges = {
        'numRounds': (1000, 5000),
        'eta': (0.001, 0.3),
        'gamma': (0, 25),
        'maxDepth': (1, 10),
        'minChildWeight': (0, 10),
        'subsample': (0, 1),
        'colSample': (0, 1),
    }

    # Proxy through a closure to be able to pass train and features; the
    # optimizer only supplies the hyper-parameters named in `ranges`.
    def optFunc(numRounds, eta, gamma, maxDepth, minChildWeight, subsample,
                colSample):
        return xgbCv(train, features, numRounds, eta, gamma, maxDepth,
                     minChildWeight, subsample, colSample)

    bo = BayesianOptimization(optFunc, ranges)
    bo.maximize(init_points=50, n_iter=5, kappa=2, acq="ei", xi=0.0)

    # xgbCv returns the negated score, so flip the sign back for reporting.
    bestMAE = round((-1.0 * bo.res['max']['max_val']), 6)
    print("\n Best MAE found: %f" % bestMAE)
    print("\n Parameters: %s" % bo.res['max']['max_params'])


def kFoldValidation(train, features, xgbParams, numRounds, nFolds,
                    target='loss'):
    """Return the mean early-stopped XGBoost score over shuffled K folds.

    Args:
        train: Training data supporting ``train[features]`` and
            ``train[target]`` column selection (assumed to be a pandas
            DataFrame -- TODO confirm against callers).
        features: Column names used as model inputs.
        xgbParams: Parameter dict passed to ``xgb.train``.
        numRounds: Maximum number of boosting rounds per fold.
        nFolds: Number of folds.
        target: Name of the label column.

    Returns:
        The mean of ``best_score`` across all folds.
    """
    kf = KFold(len(train), n_folds=nFolds, shuffle=True)

    # Convert once instead of four times per fold. `.values` replaces the
    # `as_matrix()` accessor, which was removed in pandas 1.0.
    X = train[features].values
    y = train[target].values

    fold_score = []
    for train_index, cv_index in kf:
        # split train/validation
        dtrain = xgb.DMatrix(X[train_index], y[train_index])
        dvalid = xgb.DMatrix(X[cv_index], y[cv_index])
        watchlist = [(dtrain, 'train'), (dvalid, 'eval')]

        gbm = xgb.train(xgbParams, dtrain, numRounds, evals=watchlist,
                        early_stopping_rounds=100)
        fold_score.append(gbm.best_score)

    return np.mean(fold_score)