@zaburo-ch
Created August 21, 2018 11:13
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from joblib import Parallel, delayed
from sklearn import model_selection
import lightgbm as lgb
import base
if __name__ == '__main__':
    # general settings
    seed = 1024
    eps = 1e-8
    kfold_seeds = [2017, 2016, 2015, 2014, 2013]
    om = base.OutputManager([seed, eps, kfold_seeds])
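    # NOTE (assumption): `base` is the author's private helper module and is
    # not included in this gist. Judging only from the calls below,
    # `OutputManager` presumably creates a per-run output directory
    # (`get_path()`) and logs messages (`print()`), while `load_df`/`save_df`,
    # `load_series`/`save_series`, and `load_pickle` wrap HDF5 and pickle I/O.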
    np.random.seed(seed)

    # train.h5 and test.h5 are the raw input data saved in HDF5 format.
    train = base.load_df('data/working/train.h5')
    test = base.load_df('data/working/test.h5')

    # `groups_40` is a list of column groups (each a list of 40 column names),
    # e.g. [['f190486d6', '58e2e02e6', 'eeb9cd3aa', ...], ...].
    # I found 88 groups.
    groups_40 = base.load_pickle('groups_col_40_88.pkl')
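    # Illustrative sanity check (not part of the original pipeline): every
    # group is expected to hold exactly 40 ordered columns that exist in both
    # frames.
    # for g in groups_40:
    #     assert len(g) == 40 and all(c in train.columns for c in g)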
    # `fake` is a bool array of length `len(test)` based on discussion/61288,
    # flagging the test rows identified as fake.
    fake = base.load_series('data/working/fake.h5').values

    # `leak_x_100` is an int64 array that contains "leaked value x 100", or -1
    # where no leak was found. Because I don't want to treat the value as a
    # float, I multiply by 100 and store it as an int.
    # Its length is `len(train) + len(test)`.
    leak_x_100 = base.load_series('data/working/leak_x_100_int_2.h5').values
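    # Illustrative example of the encoding: a leaked target of 1234.56 is
    # stored as 123456, and a row with no recovered leak is stored as -1;
    # decoding is simply `leak_x_100 / 100.` wherever `leak_x_100 != -1`.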
    use_groups = list(range(len(groups_40)))  # use all groups
    groups_40 = [groups_40[i] for i in use_groups]

    X = [train.drop(['ID', 'target'], axis=1), test.drop(['ID'], axis=1)]
    X = pd.concat(X, axis=0, ignore_index=True, sort=False)

    # Treat (near-)zero entries as missing: log-transform, then mask.
    mask = X < eps
    X = np.log1p(X)
    X[mask] = np.nan
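    # Worked example (illustrative): a raw row [0.0, 100.0] becomes
    # [NaN, log1p(100.0)] ~ [NaN, 4.615]. Zeros here mean "no value", so they
    # are masked to NaN instead of being mapped to log1p(0) = 0, which would
    # otherwise look like a genuine small observation.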
    # Aggregation functions for a set of features.
    set_aggs = {
        'mean': lambda x: x.mean(axis=1),
        'var': lambda x: x.var(axis=1),
        'max': lambda x: x.max(axis=1),
        'quantile_90': lambda x: x.quantile(q=0.9, axis=1),
        'median': lambda x: x.median(axis=1),
        'quantile_10': lambda x: x.quantile(q=0.1, axis=1),
        'min': lambda x: x.min(axis=1),
        'nan': lambda x: x.isnull().mean(axis=1)
    }
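    # Illustrative usage (assumed shapes): applied to one group frame
    # X[groups_40[0]] of shape (n_rows, 40), each function returns one value
    # per row, e.g.
    # row_means = set_aggs['mean'](X[groups_40[0]])  # pd.Series of length len(X)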
    # In the end, I did not use lag features because they did not improve CV.
    # def calc_lag(x):
    #     mask = ~x.isnull().values
    #     x = x.values
    #     non_zeros = [x[i][mask[i]] for i in range(x.shape[0])]
    #     lag = [s[0] - s[1] if len(s) >= 2 else np.nan for s in non_zeros]
    #     return lag
    # def calc_lag_mean(x):
    #     mask = ~x.isnull().values
    #     x = x.values
    #     non_zeros = [x[i][mask[i]] for i in range(x.shape[0])]
    #     lag = [np.mean(s[:-1] - s[1:]) if len(s) >= 2 else np.nan for s in non_zeros]
    #     return lag

    # Exponential-moving-average weights over the 40 time steps, plus weights
    # restricted to the positions assumed to fall on the same weekday.
    ema_weight1 = 0.98 ** np.arange(40)
    ema_weight2 = 0.9 ** np.arange(40)
    weekday_pos = np.where(np.arange(40) % 7 == 5)[0]
    wema_weight = 0.98 ** np.arange(len(weekday_pos))

    # Aggregation functions for a sequence (time-ordered group) of features.
    seq_aggs = {
        'ema1': lambda x: (x * ema_weight1).sum(axis=1) / ((~x.isnull()) * ema_weight1).sum(axis=1),
        'ema2': lambda x: (x * ema_weight2).sum(axis=1) / ((~x.isnull()) * ema_weight2).sum(axis=1),
        'wema': lambda x: (x.iloc[:, weekday_pos] * wema_weight).sum(axis=1) / ((~x.iloc[:, weekday_pos].isnull()) * wema_weight).sum(axis=1),
        # 'latest_lag': calc_lag,
        # 'mean_lag': calc_lag_mean
    }
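    # How the EMA lambdas handle NaN (worked example, illustrative): pandas'
    # .sum() skips NaN, so for a row [3.0, NaN, 1.0] with weights
    # [1.0, 0.98, 0.9604], the numerator is 3.0*1.0 + 1.0*0.9604 = 3.9604 and
    # the denominator is 1.0 + 0.9604 = 1.9604 (the NaN position contributes
    # False * 0.98 = 0), giving a missing-aware weighted mean of ~2.0202.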
    all_aggs = dict(set_aggs)
    all_aggs.update(seq_aggs)

    def agg_func(data):
        if data.shape[1] == 40:
            # A full 40-column group is time-ordered, so `seq_aggs` also apply.
            return [(name, f(data)) for name, f in all_aggs.items()]
        else:
            return [(name, f(data)) for name, f in set_aggs.items()]

    # Per-group features: apply the aggregations to each column group.
    feats = {}
    res = Parallel(n_jobs=-1)([delayed(agg_func)(X[group]) for group in groups_40])
    for i, results in zip(use_groups, res):
        for (name, s) in results:
            feats[f'group{i}_{name}'] = s
    feats = pd.DataFrame(feats)
    # Over-group features: aggregate each per-group statistic across all groups.
    over_group_feats = {}
    for func_name, f in set_aggs.items():
        for name in all_aggs.keys():
            sub = feats[[col for col in feats.columns if col.endswith('_' + name)]]
            over_group_feats[f'over_group_{func_name}_{name}'] = f(sub)
    over_group_feats = pd.DataFrame(over_group_feats)
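    # Feature count so far (illustrative arithmetic): 88 groups x 11
    # aggregations (8 set_aggs + 3 seq_aggs) = 968 per-group features, plus
    # 8 set_aggs x 11 statistic names = 88 over-group features.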
    # Day features: the j-th column of every group corresponds to the same
    # time step, so slicing column j across all 88 groups gives one "day".
    day_feats = {}
    tasks = []
    for j in range(40):
        tasks.append(delayed(agg_func)(X[[group[j] for group in groups_40]]))
    res = Parallel(n_jobs=-1)(tasks)
    for i, results in enumerate(res):
        for (name, s) in results:
            day_feats[f'day{i}_{name}'] = s
    day_feats = pd.DataFrame(day_feats)
    # Over-day features: treat the 40 per-day statistics as a sequence and
    # aggregate over it (its frames have 40 columns, so `seq_aggs` also apply).
    over_day_feats = {}
    for func_name, f in all_aggs.items():
        for name in set_aggs.keys():
            over_day_feats[f'over_day_{func_name}_{name}'] = f(day_feats[[f'day{j}_{name}' for j in range(40)]])
    over_day_feats = pd.DataFrame(over_day_feats)
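    # More illustrative arithmetic: 40 days x 8 set_aggs = 320 day features
    # (each day frame has 88 columns, one per group, so only `set_aggs`
    # apply), and 11 all_aggs x 8 statistic names = 88 over-day features.
    # Note that `day_feats` itself is not concatenated into X below; only
    # the over-day aggregates are kept.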
    X = pd.concat([feats, over_group_feats, over_day_feats, X[groups_40[0]]], axis=1)
    base.save_df(X, 'data/working/X.h5')
    # X = base.load_df('data/working/X.h5')

    # Build the training mask: all original train rows, plus non-fake test
    # rows whose target was recovered by the leak (pseudo-labels).
    train_flag = leak_x_100 != -1
    train_flag[:len(train)] = True
    train_flag[len(train):] = train_flag[len(train):] & (~fake)

    X_test = X[len(train):].astype(np.float32)
    X = X[train_flag].astype(np.float32)
    y = leak_x_100[train_flag].astype(np.float64) / 100.
    y[:len(train)] = train['target'].values.astype(np.float64)
    y = np.log1p(y)
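    # Illustrative recap of the label assembly: for the first len(train) rows,
    # y is the true `target`; for the appended test rows, y is the leaked
    # value (leak_x_100 / 100). Both are trained on in log1p space, which
    # makes LightGBM's RMSE behave like RMSLE on the original scale.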
    # prepare for training
    params = {
        'objective': 'regression',
        'metric': 'rmse',
        'learning_rate': 0.01,
        'num_leaves': 90,
        'max_depth': -1,
        'max_bin': 256,
        'colsample_bytree': 0.6,
        'subsample_freq': 1,
        'subsample': 0.75,
        'bagging_seed': 2018,
        'verbosity': -1
    }
    pred_valid = np.zeros((X.shape[0], len(kfold_seeds)))
    pred_test = np.zeros((X_test.shape[0], len(kfold_seeds)))
    importance_sum = np.zeros(len(groups_40))

    # train: 5 bags (different KFold seeds) x 5 folds each
    for bag_idx, kfold_seed in enumerate(kfold_seeds):
        kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=kfold_seed)
        for fold_idx, (train_idx, valid_idx) in enumerate(kf.split(X)):
            X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
            y_train, y_valid = y[train_idx], y[valid_idx]
            lg_train = lgb.Dataset(X_train, label=y_train)
            lg_valid = lgb.Dataset(X_valid, label=y_valid)
            evals_result = {}
            model = lgb.train(params, lg_train, 1000, valid_sets=[lg_valid],
                              early_stopping_rounds=100, verbose_eval=200,
                              evals_result=evals_result)

            fig, ax = plt.subplots(figsize=(12, 18))
            lgb.plot_importance(model, max_num_features=50, height=0.8, ax=ax)
            ax.grid(False)
            plt.title("LightGBM - Feature Importance", fontsize=15)
            plt.savefig(om.get_path() + f'importance_{bag_idx}_{fold_idx}.png')
            plt.close()

            # Accumulate feature importance per column group.
            for name, v in zip(X.columns, model.feature_importance()):
                for j, i in enumerate(use_groups):
                    if f'group{i}_' in name:
                        importance_sum[j] += v

            pred_valid[valid_idx, bag_idx] = model.predict(X_valid, num_iteration=model.best_iteration)
            rmse = np.mean((y_valid - pred_valid[valid_idx, bag_idx]) ** 2) ** 0.5
            om.print(f'[{fold_idx}-fold] rmse: {rmse:.6f}')
            pred_test[:, bag_idx] += model.predict(X_test, num_iteration=model.best_iteration) / 5

        rmse = np.mean((y - pred_valid[:, bag_idx]) ** 2) ** 0.5
        om.print(f'[{bag_idx}-bag] rmse: {rmse:.6f}')

    rmse = np.mean((y - pred_valid.mean(axis=1)) ** 2) ** 0.5
    om.print(f'[Total] rmse: {rmse:.6f}')
    base.save_df(pred_valid, om.get_path() + 'pred_valid.h5')
    base.save_df(pred_test, om.get_path() + 'pred_test.h5')

    # Submission without the leak: average the bags and undo the log1p.
    pred_test = np.expm1(pred_test.mean(axis=1))
    sub_df = pd.DataFrame({'ID': test['ID'].values})
    sub_df['target'] = pred_test
    sub_df.to_csv(om.get_path() + 'submission.csv', index=False)

    # Submission with the leak: overwrite the prediction wherever a leaked
    # target is available for a test row.
    leak_pos = leak_x_100[len(train):] != -1
    pred_test[leak_pos] = leak_x_100[len(train):][leak_pos].astype(np.float64) / 100.
    sub_df['target'] = pred_test
    sub_df.to_csv(om.get_path() + 'submission_with_leak.csv', index=False)

    # Estimate the leak-aware train RMSE by substituting leaked values into
    # the out-of-fold predictions.
    leaked_valid = pred_valid.mean(axis=1)[:len(train)]
    leak_pos = leak_x_100[:len(train)] != -1
    leaked_valid[leak_pos] = np.log1p(leak_x_100[:len(train)][leak_pos].astype(np.float64) / 100.)
    rmse = np.mean((y[:len(train)] - leaked_valid) ** 2) ** 0.5
    om.print(f'Leaked train rmse: {rmse:.6f}')

    base.save_series(importance_sum, om.get_path() + 'importance_sum.h5')