Build the feature matrix in scipy.sparse LIL format, convert it to CSR, and cast it to np.float32; in that form it can be used for training (here with LightGBM).
Reference: https://github.com/GINK03/ai_news/blob/master/cold_start_modeling/train.ipynb
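For reference, a minimal sketch of how a lil.pkl like the one loaded below could be built; the shape and the (row, col, value) entries are placeholders, not the actual data from the notebook.

import pickle
import numpy as np
from scipy.sparse import lil_matrix

# Hypothetical shape: rows are samples, columns are sparse features.
n_samples, n_features = 10000, 300000
lil = lil_matrix((n_samples, n_features), dtype=np.float32)

# LIL is cheap to fill incrementally, e.g. from (row, col, value) triples.
for row, col, value in [(0, 12, 1.0), (0, 40531, 0.5), (1, 7, 2.0)]:
    lil[row, col] = value

# Persist it; the training code below reloads this pickle and converts to CSR.
with open('lil.pkl', 'wb') as fp:
    pickle.dump(lil, fp)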
import pickle
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix

# Load the prebuilt LIL matrix, convert to CSR, and cast to float32 for training.
lil = pickle.load(open('lil.pkl', 'rb'))
csr = csr_matrix(lil).astype(np.float32)
target = pd.read_csv('target.csv')['target']

print(lil.shape)
print(lil.nnz)
from math import sqrt

import lightgbm as lgb
from sklearn.model_selection import KFold  # sklearn.cross_validation has been removed
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
def get_oof(clf, x_train, y, x_test):
    # clf is unused; the LightGBM booster is created inside with lgb.train.
    train_h, train_w = x_train.shape
    test_h, test_w = x_test.shape

    NFOLDS = 5
    SEED = 71
    kf = KFold(n_splits=NFOLDS, shuffle=True, random_state=SEED)

    oof_train = np.zeros((train_h,))
    oof_test = np.zeros((test_h,))
    oof_test_skf = np.empty((NFOLDS, test_h))
    lgbm_params = {
        'task': 'train',
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'metric': 'rmse',
        # 'max_depth': 15,
        'num_leaves': 100,
        'feature_fraction': 0.9,
        'bagging_fraction': 0.75,
        'bagging_freq': 4,
        'learning_rate': 0.016,
        # 'max_bin': 1023,
        'verbose': 0
    }
    for i, (train_index, test_index) in enumerate(kf.split(x_train)):
        print('\nFold {}'.format(i))
        x_tr = x_train[train_index]
        y_tr = y[train_index]
        x_te = x_train[test_index]
        y_te = y[test_index]

        # feature_name / categorical_feature could be passed here when training
        # on a DataFrame; they do not apply to a bare CSR matrix.
        lgtrain = lgb.Dataset(x_tr, y_tr)
        lgvalid = lgb.Dataset(x_te, y_te)

        lgb_clf = lgb.train(
            lgbm_params,
            lgtrain,
            num_boost_round=2000,
            valid_sets=[lgtrain, lgvalid],
            valid_names=['train', 'valid'],
            early_stopping_rounds=50,
            verbose_eval=50
        )

        # Out-of-fold prediction for this fold's validation slice, plus a test prediction.
        oof_train[test_index] = lgb_clf.predict(x_te)
        oof_test_skf[i, :] = lgb_clf.predict(x_test)

    # Average the per-fold test predictions.
    oof_test[:] = oof_test_skf.mean(axis=0)
    return oof_train, oof_test
# OOF method adapted from Faron's kernel (originally written around Ridge).
# I was using this to analyze my vectorization, but figured it would be interesting
# to add the results back into the dataset. It doesn't really add much to the score,
# but it does help LightGBM converge faster.
# Here the same CSR matrix is passed as both the train and "test" input, so the
# function simply returns out-of-fold predictions over the training data.
oof_train, oof_test = get_oof(None, csr, target, csr)

rms = sqrt(mean_squared_error(target, oof_train))
print('LGB OOF RMSE: {}'.format(rms))
mae = mean_absolute_error(target, oof_train)  # MAE needs no square root
print('LGB OOF MAE: {}'.format(mae))

print("Modeling Stage")