Created
August 21, 2018 11:13
-
-
Save zaburo-ch/a410538ab271e68331565ea06731ee61 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from joblib import Parallel, delayed
from sklearn import model_selection
import lightgbm as lgb

import base

if __name__ == '__main__':
    # ----------------------------------------------------------------------
    # General settings
    # ----------------------------------------------------------------------
    seed = 1024
    eps = 1e-8
    kfold_seeds = [2017, 2016, 2015, 2014, 2013]  # one KFold split per bag
    om = base.OutputManager([seed, eps, kfold_seeds])
    np.random.seed(seed)

    # train.h5 and test.h5 are the raw input data saved in HDF5 format.
    train = base.load_df('data/working/train.h5')
    test = base.load_df('data/working/test.h5')

    # `groups_40` is a list of lists of column groups,
    # such like [['f190486d6', '58e2e02e6', 'eeb9cd3aa', ...], ...].
    # 88 groups were found; each group is assumed to hold 40 columns that
    # form an ordered sequence (see `seq_aggs` below) — TODO confirm.
    groups_40 = base.load_pickle('groups_col_40_88.pkl')

    # `fake` is a bool array based on discussion/61288, length == len(test),
    # flagging synthetic test rows that should not be used for training.
    fake = base.load_series('data/working/fake.h5').values

    # `leak_x_100` is an int64 array containing "leaked target value * 100",
    # or -1 where no leak was found. Stored as int (x100) to avoid treating
    # the value as float. Length == len(train) + len(test).
    leak_x_100 = base.load_series('data/working/leak_x_100_int_2.h5').values

    use_groups = list(range(len(groups_40)))  # use all groups
    groups_40 = [groups_40[i] for i in use_groups]

    # Stack train + test features, log-transform, and mask (near-)zero
    # entries as NaN so that the aggregations below skip missing values.
    X = [train.drop(['ID', 'target'], axis=1), test.drop(['ID'], axis=1)]
    X = pd.concat(X, axis=0, ignore_index=True, sort=False)
    mask = X < eps
    X = np.log1p(X)
    X[mask] = np.nan

    # Aggregation functions for an (unordered) set of features.
    set_aggs = {
        'mean': lambda x: x.mean(axis=1),
        'var': lambda x: x.var(axis=1),
        'max': lambda x: x.max(axis=1),
        'quantile_90': lambda x: x.quantile(q=0.9, axis=1),
        'median': lambda x: x.median(axis=1),
        'quantile_10': lambda x: x.quantile(q=0.1, axis=1),
        'min': lambda x: x.min(axis=1),
        'nan': lambda x: x.isnull().mean(axis=1),
    }

    # NOTE: lag features (latest lag / mean lag over non-null entries) were
    # tried and removed because they did not improve CV.

    # Exponential moving-average weights over the 40 "time step" columns.
    ema_weight1 = 0.98 ** np.arange(40)
    ema_weight2 = 0.9 ** np.arange(40)
    # Columns assumed to fall on the same weekday (every 7th, offset 5).
    weekday_pos = np.where(np.arange(40) % 7 == 5)[0]
    wema_weight = 0.98 ** np.arange(len(weekday_pos))

    # Aggregation functions for an (ordered) sequence of 40 features.
    # Each EMA is normalized by the weight mass of the non-null entries so
    # NaNs neither contribute to the numerator nor the denominator.
    seq_aggs = {
        'ema1': lambda x: (x * ema_weight1).sum(axis=1)
                          / ((~x.isnull()) * ema_weight1).sum(axis=1),
        'ema2': lambda x: (x * ema_weight2).sum(axis=1)
                          / ((~x.isnull()) * ema_weight2).sum(axis=1),
        'wema': lambda x: (x.iloc[:, weekday_pos] * wema_weight).sum(axis=1)
                          / ((~x.iloc[:, weekday_pos].isnull()) * wema_weight).sum(axis=1),
    }

    all_aggs = dict(set_aggs)
    all_aggs.update(seq_aggs)

    def agg_func(data):
        """Return [(name, Series)] of aggregations over `data`'s columns.

        Sequence aggregations (`seq_aggs`) are only applicable when the
        frame has exactly 40 ordered columns; otherwise only the
        order-independent `set_aggs` are used.
        """
        if data.shape[1] == 40:
            return [(name, f(data)) for name, f in all_aggs.items()]
        return [(name, f(data)) for name, f in set_aggs.items()]

    # Per-group features: aggregate each 40-column group across its columns.
    feats = {}
    res = Parallel(n_jobs=-1)([delayed(agg_func)(X[group]) for group in groups_40])
    for i, results in zip(use_groups, res):
        for (name, s) in results:
            feats[f'group{i}_{name}'] = s
    feats = pd.DataFrame(feats)

    # Second-level features: aggregate the same statistic across all groups
    # (e.g. the mean over groups of each group's 'ema1').
    over_group_feats = {}
    for func_name, f in set_aggs.items():
        for name in all_aggs.keys():
            sub = feats[[col for col in feats.columns if col.endswith('_' + name)]]
            over_group_feats[f'over_group_{func_name}_{name}'] = f(sub)
    over_group_feats = pd.DataFrame(over_group_feats)

    # Per-"day" features: for each position j in 0..39, aggregate the j-th
    # column of every group (a cross-section at one time step).
    day_feats = {}
    tasks = []
    for j in range(40):
        tasks.append(delayed(agg_func)(X[[group[j] for group in groups_40]]))
    res = Parallel(n_jobs=-1)(tasks)
    for i, results in enumerate(res):
        for (name, s) in results:
            day_feats[f'day{i}_{name}'] = s
    day_feats = pd.DataFrame(day_feats)

    # Second-level day features: treat the 40 per-day statistics as an
    # ordered sequence and aggregate over it (so seq_aggs apply here too).
    over_day_feats = {}
    for func_name, f in all_aggs.items():
        for name in set_aggs.keys():
            over_day_feats[f'over_day_{func_name}_{name}'] = f(
                day_feats[[f'day{j}_{name}' for j in range(40)]])
    over_day_feats = pd.DataFrame(over_day_feats)

    # Final design matrix: engineered features + raw columns of group 0.
    X = pd.concat([feats, over_group_feats, over_day_feats, X[groups_40[0]]], axis=1)
    base.save_df(X, 'data/working/X.h5')
    # X = base.load_df('data/working/X.h5')

    # Training rows = all real train rows + test rows with a known leaked
    # target (excluding fake test rows).
    train_flag = leak_x_100 != -1
    train_flag[:len(train)] = True
    train_flag[len(train):] = train_flag[len(train):] & (~fake)

    # Note: X_test must be sliced BEFORE X is overwritten with the filter.
    X_test = X[len(train):].astype(np.float32)
    X = X[train_flag].astype(np.float32)

    # Target: leaked values (de-scaled from x100 ints) for leaked test rows,
    # true targets for train rows; modeled in log1p space (RMSLE metric).
    y = leak_x_100[train_flag].astype(np.float64) / 100.
    y[:len(train)] = train['target'].values.astype(np.float64)
    y = np.log1p(y)

    # ----------------------------------------------------------------------
    # Training
    # ----------------------------------------------------------------------
    params = {
        'objective': 'regression',
        'metric': 'rmse',
        'learning_rate': 0.01,
        'num_leaves': 90,
        'max_depth': -1,
        'max_bin': 256,
        'colsample_bytree': 0.6,
        'subsample_freq': 1,
        'subsample': 0.75,
        'bagging_seed': 2018,
        'verbosity': -1,
    }
    pred_valid = np.zeros((X.shape[0], len(kfold_seeds)))
    pred_test = np.zeros((X_test.shape[0], len(kfold_seeds)))
    importance_sum = np.zeros(len(groups_40))

    # Bagging over differently-seeded 5-fold splits.
    for bag_idx, kfold_seed in enumerate(kfold_seeds):
        kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=kfold_seed)
        for fold_idx, (train_idx, valid_idx) in enumerate(kf.split(X)):
            X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
            y_train, y_valid = y[train_idx], y[valid_idx]
            lg_train = lgb.Dataset(X_train, label=y_train)
            lg_valid = lgb.Dataset(X_valid, label=y_valid)
            evals_result = {}
            # NOTE(review): `early_stopping_rounds`/`verbose_eval` kwargs were
            # removed in LightGBM 4.x (use callbacks there) — kept as-is for
            # the LightGBM version this script was written against.
            model = lgb.train(params, lg_train, 1000, valid_sets=[lg_valid],
                              early_stopping_rounds=100, verbose_eval=200,
                              evals_result=evals_result)

            # Save a feature-importance plot for this bag/fold.
            fig, ax = plt.subplots(figsize=(12, 18))
            lgb.plot_importance(model, max_num_features=50, height=0.8, ax=ax)
            ax.grid(False)
            plt.title("LightGBM - Feature Importance", fontsize=15)
            plt.savefig(om.get_path() + f'importance_{bag_idx}_{fold_idx}.png')
            plt.close()

            # Accumulate importance per group (columns are named 'group{i}_...').
            for name, v in zip(X.columns, model.feature_importance()):
                for j, i in enumerate(use_groups):
                    if name.startswith(f'group{i}_'):
                        importance_sum[j] += v

            pred_valid[valid_idx, bag_idx] = model.predict(
                X_valid, num_iteration=model.best_iteration)
            rmse = np.mean((y_valid - pred_valid[valid_idx, bag_idx]) ** 2) ** 0.5
            om.print(f'[{fold_idx}-fold] rmse: {rmse:.6f}')
            # Average the 5 fold models' test predictions within this bag.
            pred_test[:, bag_idx] += model.predict(
                X_test, num_iteration=model.best_iteration) / 5
        rmse = np.mean((y - pred_valid[:, bag_idx]) ** 2) ** 0.5
        om.print(f'[{bag_idx}-bag] rmse: {rmse:.6f}')
    rmse = np.mean((y - pred_valid.mean(axis=1)) ** 2) ** 0.5
    om.print(f'[Total] rmse: {rmse:.6f}')

    base.save_df(pred_valid, om.get_path() + 'pred_valid.h5')
    base.save_df(pred_test, om.get_path() + 'pred_test.h5')

    # ----------------------------------------------------------------------
    # Submissions
    # ----------------------------------------------------------------------
    # Back-transform from log1p space to the original target scale.
    pred_test = np.expm1(pred_test.mean(axis=1))
    sub_df = pd.DataFrame({'ID': test['ID'].values})
    sub_df['target'] = pred_test
    sub_df.to_csv(om.get_path() + 'submission.csv', index=False)

    # Second submission: overwrite predictions with known leaked values.
    leak_pos = leak_x_100[len(train):] != -1
    pred_test[leak_pos] = leak_x_100[len(train):][leak_pos].astype(np.float64) / 100.
    sub_df['target'] = pred_test
    sub_df.to_csv(om.get_path() + 'submission_with_leak.csv', index=False)

    # Estimate train RMSE with leaks substituted in (still in log1p space).
    leaked_valid = pred_valid.mean(axis=1)[:len(train)]
    leak_pos = leak_x_100[:len(train)] != -1
    leaked_valid[leak_pos] = np.log1p(
        leak_x_100[:len(train)][leak_pos].astype(np.float64) / 100.)
    rmse = np.mean((y[:len(train)] - leaked_valid) ** 2) ** 0.5
    om.print(f'Leaked train rmse: {rmse:.6f}')

    base.save_series(importance_sum, om.get_path() + 'importance_sum.h5')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment