Basically the same as the LightGBM version.
The parameters here are written for xgb, so find the equivalent lgb parameter names as needed.
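For comparison, here is a minimal sketch of what the equivalent native LightGBM parameter dict might look like for the XGBoost settings used below. The mapping is illustrative, not exhaustive; check each library's docs for the authoritative names.

# Hedged sketch: LightGBM equivalents of the XGBoost parameters used below.
lgb_params = {
    'objective': 'regression',   # xgb: 'reg:linear'
    'metric': 'rmse',            # xgb: 'eval_metric'
    'max_depth': 15,
    'learning_rate': 0.16,       # xgb: 'eta'
    'bagging_fraction': 0.8,     # xgb: 'subsample'
    'feature_fraction': 0.7,     # xgb: 'colsample_bytree'
    'verbosity': -1,
}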
from math import sqrt

import numpy as np
import xgboost as xgb
from sklearn.metrics import mean_squared_error


def get_oof(clf, x_train, y, x_test):
    # clf is unused here; the argument is kept so the signature matches the
    # other get_oof helpers. ntrain, ntest, NFOLDS, kf, and vectorizer are
    # assumed to be defined earlier in the notebook.
    oof_train = np.zeros((ntrain,))
    oof_test = np.zeros((ntest,))
    oof_test_skf = np.empty((NFOLDS, ntest))

    # lgb-only keys from the original dict ('task', 'boosting_type',
    # 'bagging_freq') have no direct xgb equivalent and were dropped.
    xgb_params = {
        'objective': 'reg:linear',   # 'reg:squarederror' on newer xgboost
        'eval_metric': 'rmse',       # lgb: 'metric'
        'max_depth': 15,
        'eta': 0.16,                 # = 0.016 * 10; lgb: 'learning_rate'
        'subsample': 0.8,            # lgb: 'bagging_fraction'
        'colsample_bytree': 0.7,     # lgb: 'feature_fraction'
        #'max_leaves': 270,          # lgb: 'num_leaves'; needs grow_policy='lossguide'
        #'max_bin': 1023,            # only used by the 'hist' tree method
        'verbosity': 0,              # 'silent': 1 on older xgboost versions
    }

    x_test = xgb.DMatrix(x_test, feature_names=vectorizer.get_feature_names())

    for i, (train_index, test_index) in enumerate(kf):
        print('\nFold {}'.format(i))
        x_tr = x_train[train_index]
        y_tr = y[train_index]
        x_te = x_train[test_index]
        y_te = y[test_index]

        xgtrain = xgb.DMatrix(x_tr, y_tr,
                              feature_names=vectorizer.get_feature_names())
        xgvalid = xgb.DMatrix(x_te, y_te,
                              feature_names=vectorizer.get_feature_names())

        # lgb-only kwargs from the original call (valid_names,
        # categorical_feature) were dropped; xgb takes names via `evals`.
        xgb_clf = xgb.train(
            xgb_params,
            xgtrain,
            num_boost_round=20000,
            evals=[(xgtrain, 'train'), (xgvalid, 'valid')],
            early_stopping_rounds=50,
            verbose_eval=50
        )

        # Out-of-fold predictions on this fold's validation split, plus this
        # fold's predictions on the test set.
        oof_train[test_index] = xgb_clf.predict(xgvalid)
        oof_test_skf[i, :] = xgb_clf.predict(x_test)

    # Average the per-fold test predictions once all folds are finished.
    oof_test[:] = oof_test_skf.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)
#OOF method adapted from Faron's kernel (originally used with Ridge).
#I was using this to analyze my vectorization, but figured it would be interesting to add the results back into the dataset.
#It doesn't add much to the score, but it does help LightGBM converge faster (a sketch of that stacking step follows after the export below).
oof_train, oof_test = get_oof(None, ready_df[:ntrain], y, ready_df[ntrain:])
rms = sqrt(mean_squared_error(y, oof_train))
print('XGB OOF RMSE: {}'.format(rms))
print("Modeling Stage")
preds = np.concatenate([oof_train, oof_test]).ravel()  # flatten (n, 1) -> (n,)
df['xgb_preds'] = preds
df['xgb_preds'].to_csv('tmp/xgb_preds_all.csv', index=False)
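To make the stacking step concrete: a minimal sketch, assuming ready_df is the sparse vectorizer output used above, of feeding the OOF predictions back into LightGBM as one extra feature. The LightGBM settings shown are illustrative, not tuned.

import lightgbm as lgb
from scipy.sparse import csr_matrix, hstack

# Append the XGB OOF predictions as a single dense column next to the
# sparse feature matrix, then train LightGBM on the stacked features.
X_train_stacked = hstack([ready_df[:ntrain], csr_matrix(oof_train)]).tocsr()
lgb_train = lgb.Dataset(X_train_stacked, label=y)
lgb_model = lgb.train({'objective': 'regression', 'metric': 'rmse'},
                      lgb_train, num_boost_round=100)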