lightgbm-kfold-nlp

Build the feature matrix with scipy.sparse's lil format, convert it to csr, and cast to np.float32; LightGBM can then train on it directly.

Reference: https://github.com/GINK03/ai_news/blob/master/cold_start_modeling/train.ipynb
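A hedged sketch of how lil.pkl might be assembled, assuming a plain bag-of-words layout; the sizes and indices below are made up for illustration and are not part of the gist:

import pickle
import numpy as np
from scipy.sparse import lil_matrix

n_docs, n_vocab = 1000, 50000            # hypothetical sizes
lil = lil_matrix((n_docs, n_vocab), dtype=np.float32)
lil[0, 123] = 2.0                        # e.g. term 123 appears twice in doc 0
lil[1, 456] = 1.0                        # lil supports cheap incremental writes
pickle.dump(lil, open('lil.pkl', 'wb'))  # the file the snippet below loads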

import pickle
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix

# Load the lil feature matrix, convert to csr, and cast to float32 for LightGBM.
lil = pickle.load(open('lil.pkl', 'rb'))
csr = csr_matrix(lil).astype(np.float32)
target = pd.read_csv('target.csv')['target']

print(lil.shape)  # (n_samples, n_features)
print(lil.nnz)    # number of non-zero entries

from math import sqrt
import lightgbm as lgb
from sklearn.model_selection import KFold  # sklearn.cross_validation was removed in scikit-learn 0.20
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

def get_oof(clf, x_train, y, x_test):
    # clf is unused here (the booster is built inside the fold loop);
    # the argument is kept only for signature compatibility.
    train_h, train_w = x_train.shape
    test_h, test_w = x_test.shape
    NFOLDS = 5
    SEED = 71
    kf = KFold(n_splits=NFOLDS, shuffle=True, random_state=SEED)
    oof_train = np.zeros((train_h,))           # out-of-fold predictions on the train set
    oof_test = np.zeros((test_h,))             # fold-averaged predictions on the test set
    oof_test_skf = np.empty((NFOLDS, test_h))  # per-fold predictions on the test set
    lgbm_params = {
        'task': 'train',
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'metric': 'rmse',
        # 'max_depth': 15,
        'num_leaves': 100,
        'feature_fraction': 0.9,
        'bagging_fraction': 0.75,
        'bagging_freq': 4,
        'learning_rate': 0.016,
        #'max_bin':1023,
        'verbose': 0
    }
    for i, (train_index, test_index) in enumerate(kf.split(x_train)):
        print('\nFold {}'.format(i))
        x_tr = x_train[train_index]
        y_tr = y[train_index]
        x_te = x_train[test_index]
        y_te = y[test_index]
        lgtrain = lgb.Dataset(x_tr, y_tr)
        lgvalid = lgb.Dataset(x_te, y_te)
        lgb_clf = lgb.train(
            lgbm_params,
            lgtrain,
            num_boost_round=2000,
            valid_sets=[lgtrain, lgvalid],
            valid_names=['train', 'valid'],
            # LightGBM >= 4 takes early stopping and eval logging as callbacks
            callbacks=[lgb.early_stopping(50), lgb.log_evaluation(50)],
        )
        # predict with the best iteration found by early stopping
        oof_train[test_index] = lgb_clf.predict(x_te, num_iteration=lgb_clf.best_iteration)
        oof_test_skf[i, :] = lgb_clf.predict(x_test, num_iteration=lgb_clf.best_iteration)

    oof_test[:] = oof_test_skf.mean(axis=0)
    return oof_train, oof_test
# OOF scheme adapted from Faron's Ridge-OOF kernel, here run with LightGBM.
# I was using this to analyze my vectorization, but figured it would be interesting
# to add the results back into the dataset (see the sketch after the metrics below).
# It doesn't really add much to the score, but it does help LightGBM converge faster.
oof_train, oof_test = get_oof(None, csr, target, csr)
rms = sqrt(mean_squared_error(target, oof_train))
print('LGB OOF RMSE: {}'.format(rms))
mae = mean_absolute_error(target, oof_train)  # MAE, unlike RMSE, takes no square root
print('LGB OOF MAE: {}'.format(mae))
print("Modeling Stage")