Basically the same as the LightGBM version.
The parameters here are written for xgb, so find the equivalent lgb parameter names as needed.
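For comparison, here is a minimal sketch of what the equivalent native LightGBM parameter dict might look like for the XGBoost settings used below. The mapping is illustrative, not exhaustive; check each library's docs for the authoritative names.

# Hedged sketch: LightGBM equivalents of the XGBoost parameters used below.
lgb_params = {
    'objective': 'regression',   # xgb: 'reg:linear'
    'metric': 'rmse',            # xgb: 'eval_metric'
    'max_depth': 15,
    'learning_rate': 0.16,       # xgb: 'eta'
    'bagging_fraction': 0.8,     # xgb: 'subsample'
    'feature_fraction': 0.7,     # xgb: 'colsample_bytree'
    'verbosity': -1,
}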
from math import sqrt

import numpy as np
import xgboost as xgb
from sklearn.metrics import mean_squared_error


def get_oof(clf, x_train, y, x_test):
    # clf is unused here; the argument is kept so the signature matches the
    # other get_oof helpers. ntrain, ntest, NFOLDS, kf, and vectorizer are
    # assumed to be defined earlier in the notebook.
    oof_train = np.zeros((ntrain,))
    oof_test = np.zeros((ntest,))
    oof_test_skf = np.empty((NFOLDS, ntest))

    # lgb-only keys from the original dict ('task', 'boosting_type',
    # 'bagging_freq') have no direct xgb equivalent and were dropped.
    xgb_params = {
        'objective': 'reg:linear',   # 'reg:squarederror' on newer xgboost
        'eval_metric': 'rmse',       # lgb: 'metric'
        'max_depth': 15,
        'eta': 0.16,                 # = 0.016 * 10; lgb: 'learning_rate'
        'subsample': 0.8,            # lgb: 'bagging_fraction'
        'colsample_bytree': 0.7,     # lgb: 'feature_fraction'
        #'max_leaves': 270,          # lgb: 'num_leaves'; needs grow_policy='lossguide'
        #'max_bin': 1023,            # only used by the 'hist' tree method
        'verbosity': 0,              # 'silent': 1 on older xgboost versions
    }

    x_test = xgb.DMatrix(x_test, feature_names=vectorizer.get_feature_names())

    for i, (train_index, test_index) in enumerate(kf):
        print('\nFold {}'.format(i))
        x_tr = x_train[train_index]
        y_tr = y[train_index]
        x_te = x_train[test_index]
        y_te = y[test_index]

        xgtrain = xgb.DMatrix(x_tr, y_tr,
                              feature_names=vectorizer.get_feature_names())
        xgvalid = xgb.DMatrix(x_te, y_te,
                              feature_names=vectorizer.get_feature_names())

        # lgb-only kwargs from the original call (valid_names,
        # categorical_feature) were dropped; xgb takes names via `evals`.
        xgb_clf = xgb.train(
            xgb_params,
            xgtrain,
            num_boost_round=20000,
            evals=[(xgtrain, 'train'), (xgvalid, 'valid')],
            early_stopping_rounds=50,
            verbose_eval=50
        )

        # Out-of-fold predictions on this fold's validation split, plus this
        # fold's predictions on the test set.
        oof_train[test_index] = xgb_clf.predict(xgvalid)
        oof_test_skf[i, :] = xgb_clf.predict(x_test)

    # Average the per-fold test predictions once all folds are finished.
    oof_test[:] = oof_test_skf.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)
#OOF method adapted from Faron's kernel (originally used with Ridge).
#I was using this to analyze my vectorization, but figured it would be interesting to add the results back into the dataset.
#It doesn't add much to the score, but it does help LightGBM converge faster (a sketch of that stacking step follows after the export below).
oof_train, oof_test = get_oof(None, ready_df[:ntrain], y, ready_df[ntrain:])
rms = sqrt(mean_squared_error(y, oof_train))
print('XGB OOF RMSE: {}'.format(rms))
print("Modeling Stage")
preds = np.concatenate([oof_train, oof_test]).ravel()  # flatten (n, 1) -> (n,)
df['xgb_preds'] = preds
df['xgb_preds'].to_csv('tmp/xgb_preds_all.csv', index=False)
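To make the stacking step concrete: a minimal sketch, assuming ready_df is the sparse vectorizer output used above, of feeding the OOF predictions back into LightGBM as one extra feature. The LightGBM settings shown are illustrative, not tuned.

import lightgbm as lgb
from scipy.sparse import csr_matrix, hstack

# Append the XGB OOF predictions as a single dense column next to the
# sparse feature matrix, then train LightGBM on the stacked features.
X_train_stacked = hstack([ready_df[:ntrain], csr_matrix(oof_train)]).tocsr()
lgb_train = lgb.Dataset(X_train_stacked, label=y)
lgb_model = lgb.train({'objective': 'regression', 'metric': 'rmse'},
                      lgb_train, num_boost_round=100)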