Created November 16, 2017 14:14
Solution for the Kaggle web-traffic-time-series-forecasting competition
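Reading the code below: the script trains a small MLP that, for each (page, target day) pair, outputs softmax weights over 17 candidate predictions (per-weekday medians over several lookback windows, the value from one year earlier, and weekend/weekday medians over golden-ratio-sized windows). The training loss is the expected SMAPE of those candidates under the predicted weights, so the network effectively learns which median-style predictor to trust for each page and day, and the final forecast is decoded from the weighted candidates. This summary is inferred from the code itself, not taken from an author write-up.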
import numpy as np
np.random.seed(1024)
import pandas as pd
import chainer
from chainer import serializers
from chainer.optimizers import Adam
import chainer.functions as F
import chainer.links as L
from chainer import reporter
from chainer.datasets import TupleDataset
from chainer import training
from chainer.training import extensions
from chainer.dataset import concat_examples
import base
import argparse
from collections import OrderedDict
import re
import six

mu = 4.4126225
sigma = 2.4928892
eps = 1e-5
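# mu and sigma look like the mean and standard deviation of log1p(traffic),
# presumably precomputed on the training data (the script does not derive
# them); they are used in get_features to standardize the task features.
# eps guards the SMAPE denominator against division by zero.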


class MLP(chainer.Chain):
    def __init__(self, in_size, hidden_size, out_size, large=False):
        super(MLP, self).__init__()
        self.large = large
        with self.init_scope():
            if large:
                self.l1 = L.Linear(in_size, hidden_size)
                self.bn1 = L.BatchNormalization(hidden_size)
                self.l2 = L.Linear(hidden_size, hidden_size)
                self.bn2 = L.BatchNormalization(hidden_size)
                self.l3 = L.Linear(hidden_size, hidden_size)
                self.bn3 = L.BatchNormalization(hidden_size)
                self.l4 = L.Linear(hidden_size, out_size)
            else:
                self.l1 = L.Linear(in_size, hidden_size)
                self.l2 = L.Linear(hidden_size, hidden_size)
                self.l3 = L.Linear(hidden_size, out_size)

    def predict(self, x):
        if self.large:
            x = self.l1(x)
            x = self.bn1(x)
            x = F.relu(x)
            x = self.l2(x)
            x = self.bn2(x)
            x = F.relu(x)
            x = self.l3(x)
            x = self.bn3(x)
            x = F.relu(x)
            x = self.l4(x)
        else:
            x = self.l1(x)
            x = F.relu(x)
            x = self.l2(x)
            x = F.relu(x)
            x = self.l3(x)
        return F.softmax(x)
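
    # d holds the precomputed SMAPE of each candidate prediction against the
    # true value, so sum(softmax_weights * d) is the expected SMAPE of picking
    # a candidate according to the predicted distribution; minimizing it
    # teaches the net which candidates to weight for each example.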
    def __call__(self, x, d):
        y_pred = self.predict(x)
        approx_smape = F.sum(y_pred * d, axis=1)
        loss = F.mean(approx_smape)
        reporter.report({'loss': loss, 'approx_smape': approx_smape}, self)
        return loss
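

# PageViewDataset flattens the (page, future day) grid into one example per
# page-day pair: the input is per-page task features plus two date features
# (day of week, position within the 62-day horizon), and the target is the
# SMAPE of each of the 17 candidate predictions for that day.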
class PageViewDataset:
    def __init__(self, purpose='valid', data_type='train', gap=0):
        gap = 2  # NOTE: the constructor argument is ignored and overwritten here
        need_y = (purpose == 'valid' or data_type == 'train')
        if purpose == 'valid':
            data = pd.read_hdf(base.WORKING_DIR + 'train_1.h5', 'tables')
            y_start = 440
        elif purpose == 'test':
            data = pd.read_hdf(base.WORKING_DIR + 'train_2.h5', 'tables')
            y_start = 805
        else:
            raise NotImplementedError
        if data_type == 'train':
            y_start -= 62 + gap
        self.date_str = pd.Series(data.columns[1:])
        self.date = pd.to_datetime(pd.Series(data.columns[1:]))
        self.page = data['Page']
        data = data.drop('Page', axis=1)
        data = data.values.astype(np.float32)
        if need_y:
            task_feats, date_feats, y = self.get_features(data, purpose, y_start, gap, True)

            def get_smape_each_feat(y_pred, y_true):
                y_true = y_true[..., None]
                smape = 2 * np.abs(y_pred - y_true) / (np.abs(y_pred) + y_true + eps)
                return smape

            y = y.reshape(-1)
            self.y = y
            date_feats = date_feats.reshape(-1, date_feats.shape[2])
            smape = get_smape_each_feat(date_feats, y)
        else:
            task_feats, date_feats = self.get_features(data, purpose, y_start, gap, False)
            date_feats = date_feats.reshape(-1, date_feats.shape[2])
        self.date_feats = date_feats
        task_feats = np.tile(task_feats[:, None, :], (1, 62, 1))
        task_date_feats = np.zeros((task_feats.shape[0], 62, 2))
        task_date_feats[:, :, 0] = self.week_of_y / 6
        task_date_feats[:, :, 1] = np.arange(62) / 61
        total_feats = np.concatenate([task_feats, task_date_feats], axis=2)
        total_feats = total_feats.reshape(-1, total_feats.shape[2])
        total_feats = total_feats.astype(np.float32)
        if need_y:
            self._datasets = (total_feats, smape)
        else:
            self._datasets = (total_feats,)
        self._length = len(self._datasets[0])

    def get_features(self, data, purpose, y_start, gap, return_y=True):
        n = data.shape[0]
        x_stop = y_start - gap
        nan_count = np.mean(~np.isfinite(data[:, x_stop - 7:x_stop]), axis=1)
        data[~np.isfinite(data)] = 0  # destructive: overwrites the caller's array in place
        zero_count = np.mean(data[:, x_stop - 7:x_stop] == 0, axis=1)
        if return_y:
            y = data[:, y_start:y_start + 62]
        date_feat_num = 17
        date_feats = np.empty((n, 62, date_feat_num), dtype=np.float32)

        # weekly median
        def weekly_median(week_num):
            term = data[:, x_stop - (7 * week_num):x_stop]
            med = np.median(term.reshape(n, week_num, 7), axis=1)
            return np.tile(med, 10)[:, gap % 7:gap % 7 + 62]
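
        # Columns 0-3: per-weekday medians over the last 1, 2, 4, and 8 weeks.
        # The 7-value weekly profile is tiled out to 70 days and sliced with an
        # offset of gap % 7 so each of the 62 target days gets the median of
        # its own day of week.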
        date_feats[:, :, 0] = weekly_median(1)
        date_feats[:, :, 1] = weekly_median(2)
        date_feats[:, :, 2] = weekly_median(4)
        date_feats[:, :, 3] = weekly_median(8)
        # median of the weekly medians
        date_feats[:, :, 4] = np.median(date_feats[:, :, 0:2], axis=2)
        date_feats[:, :, 5] = np.median(date_feats[:, :, 0:4], axis=2)
        # # auto reg
        # date_feats[:, :, 4] = date_feats[:, :, 0] - date_feats[:, :, 1]
        # date_feats[:, :, 5] = date_feats[:, :, 0] - date_feats[:, :, 3]
        # same day last year
        one_year_back = 366 if purpose == 'valid' else 365
        date_feats[:, :, 6] = data[:, y_start - one_year_back:y_start - one_year_back + 62]
        # the day of week of self.date[y_start - 77:y_start + 62 - 77] equals
        # that of self.date[y_start:y_start + 62] (77 is a multiple of 7)
        self.week_of_y = self.date[y_start - 77:y_start + 62 - 77].dt.dayofweek

        # weekend or weekday
        def assign_weekend_or_weekday(i, term_length):
            term = data[:, x_stop - term_length:x_stop]
            week_of_term = self.date[x_stop - term_length:x_stop].dt.dayofweek
            date_feats[:, self.week_of_y >= 5, i] = np.median(term[:, np.where(week_of_term >= 5)[0]], axis=1)[:, None]
            date_feats[:, self.week_of_y < 5, i] = np.median(term[:, np.where(week_of_term < 5)[0]], axis=1)[:, None]

        # define the windows according to Ehsan's kernel
        r = 1.61803398875
        windows = np.round(r ** np.arange(0, 9) * 7).astype(int)
        for i, w in enumerate(windows):
            assign_weekend_or_weekday(i + 7, w)
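        # With r the golden ratio, the window lengths come out to roughly
        # 7, 11, 18, 30, 48, 78, 126, 203, and 329 days, filling columns 7-15
        # with weekend/weekday medians at geometrically growing horizons.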
        # median of those medians
        date_feats[:, :, 16] = np.median(date_feats[:, :, 7:16], axis=2)
        # standardize for the task feats
        data = (np.log1p(data) - mu) / sigma
        task_feat_num = 16
        task_feats = np.empty((n, task_feat_num), dtype=np.float32)
        # count feats
        task_feats[:, 0] = nan_count
        task_feats[:, 1] = zero_count
        # short-term volatility
        task_feats[:, 2] = np.std(data[:, x_stop - 7:x_stop], axis=1)
        # latest diff
        task_feats[:, 3] = data[:, x_stop - 1] - data[:, x_stop - 2]
        # medians
        task_feats[:, 4] = np.median(data[:, x_stop - 7:x_stop], axis=1)
        task_feats[:, 5] = np.median(data[:, x_stop - 30:x_stop], axis=1)
        task_feats[:, 6] = np.median(data[:, x_stop - 60:x_stop], axis=1)
        # 90th percentile
        task_feats[:, 7] = np.percentile(data[:, x_stop - 7:x_stop], 90, axis=1)
        task_feats[:, 8] = np.percentile(data[:, x_stop - 30:x_stop], 90, axis=1)
        task_feats[:, 9] = np.percentile(data[:, x_stop - 60:x_stop], 90, axis=1)
        # auto reg
        task_feats[:, 10] = task_feats[:, 4] - task_feats[:, 5]
        task_feats[:, 11] = task_feats[:, 4] - task_feats[:, 6]
        # argmax position
        task_feats[:, 12] = np.argmax(data[:, x_stop - 30:x_stop], axis=1) / 29
        task_feats[:, 13] = np.argmax(data[:, x_stop - 60:x_stop], axis=1) / 59
        # diff between max and latest
        task_feats[:, 14] = data[:, x_stop - 1] - np.max(data[:, x_stop - 30:x_stop], axis=1)
        task_feats[:, 15] = data[:, x_stop - 1] - np.max(data[:, x_stop - 60:x_stop], axis=1)
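        # Page strings in this competition follow a
        # 'name_lang.project.org_access_agent' pattern, so the regex below
        # splits out one-hot features for the language subdomain (called
        # 'country' here), access type, and agent.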
        task_dummy_feats = {}
        pat = re.compile(r'(.*)_([^.]+)\.[^.]+.org_(.*)_(.*)')
        splits = self.page.map(lambda x: pat.match(x).groups()).tolist()
        splits = pd.DataFrame(splits, columns=['name', 'country', 'access', 'agent'])

        def add_dummies(prefix):
            df = pd.get_dummies(splits[prefix], prefix=prefix)
            for col in df.columns:
                task_dummy_feats[col] = df[col]

        add_dummies('country')
        add_dummies('access')
        add_dummies('agent')
        task_feats = np.concatenate([task_feats, pd.DataFrame(task_dummy_feats).values], axis=1)
        if return_y:
            return task_feats, date_feats, y
        else:
            return task_feats, date_feats

    def __getitem__(self, index):
        batches = [dataset[index] for dataset in self._datasets]
        if isinstance(index, slice):
            length = len(batches[0])
            return [tuple([batch[i] for batch in batches])
                    for i in six.moves.range(length)]
        else:
            return tuple(batches)

    def __len__(self):
        return self._length


if __name__ == '__main__':
    chainer.set_debug(True)
    chainer.config.meta_train = True
    # TODO: write argparse descriptions
    parser = argparse.ArgumentParser()
    parser.add_argument('-batch_size', default=256, type=int)
    parser.add_argument('-n_iter', default=100, type=int)
    parser.add_argument('-valid_interval', default=1, type=int)
    parser.add_argument('-valid_batch_size', default=1024, type=int)
    parser.add_argument('-save_interval', default=1, type=int)
    parser.add_argument('-gpu', default=-1, type=int)
    parser.add_argument('-large', action='store_true')
    parser.add_argument('-description', default='no description')
    parser.add_argument('-purpose', default='valid')
    args = parser.parse_args()
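    # Hypothetical invocation (the gist does not ship the `base` helper
    # module, which must provide WORKING_DIR, OutputManager, and SMAPE):
    #   python solution.py -purpose valid -gpu 0 -large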
    om = base.OutputManager(vars(args))
    train = PageViewDataset(args.purpose, 'train')
    valid = PageViewDataset(args.purpose, 'test')
    # out_size equals the number of candidate predictors (the 17 date features)
    model = MLP(train._datasets[0].shape[1], 256, train._datasets[1].shape[1], args.large)
    # transfer the model to the GPU
    if args.gpu >= 0:
        chainer.cuda.get_device_from_id(args.gpu).use()
        model.to_gpu(args.gpu)
        # chainer.cuda.to_gpu(train._datasets[0], args.gpu)
        # chainer.cuda.to_gpu(train._datasets[1], args.gpu)
    optimizer = Adam()
    optimizer.setup(model)
    train_iter = chainer.iterators.SerialIterator(train, args.batch_size, repeat=True, shuffle=True)
    valid_iter = chainer.iterators.SerialIterator(valid, args.valid_batch_size, repeat=False, shuffle=False)
    updater = training.StandardUpdater(train_iter, optimizer, device=args.gpu)
    trainer = training.Trainer(updater, (args.n_iter, 'epoch'), out=om.get_path())
    trainer.extend(extensions.dump_graph('main/loss'))
    trainer.extend(extensions.snapshot(), trigger=(args.save_interval, 'epoch'))
    trainer.extend(extensions.LogReport())
    trainer.extend(extensions.ProgressBar())
    if args.purpose == 'valid':
        trainer.extend(extensions.Evaluator(valid_iter, model, device=args.gpu),
                       trigger=(args.valid_interval, 'epoch'))
        trainer.extend(extensions.PrintReport(
            ['epoch', 'main/loss', 'validation/main/loss', 'elapsed_time']))

        @training.make_extension(trigger=training.triggers.MinValueTrigger(
            'validation/main/loss', trigger=(args.valid_interval, 'epoch')), priority=-100)
        def save_best_model(trainer):
            print('save best')
            serializers.save_npz(om.get_path() + 'best.model', model)

        trainer.extend(save_best_model)
    else:
        trainer.extend(extensions.PrintReport(
            ['epoch', 'main/loss', 'elapsed_time']))
    trainer.run()
    if args.purpose == 'valid':
        # evaluate the model from the final epoch
        valid_iter.reset()
        pred_valid = []
        with chainer.no_backprop_mode():
            for batch in valid_iter:
                x, _ = concat_examples(batch, args.gpu)
                pred_valid.append(chainer.cuda.to_cpu(model.predict(x).data))
        pred_valid = np.concatenate(pred_valid, axis=0)
        y = valid.y
        date_feats = valid.date_feats
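        # Three ways to decode the softmax weights into a point forecast:
        # pred1 averages the candidates by their weights, pred2 rounds that
        # average to an integer, and pred3 takes the single candidate with
        # the largest weight.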
        pred1 = (date_feats * pred_valid).sum(axis=1)
        pred2 = np.round(pred1)
        pred3 = np.zeros(pred_valid.shape[0])
        for i in range(pred_valid.shape[0]):
            pred3[i] = date_feats[i, np.argmax(pred_valid[i])]
        print(1, base.SMAPE(pred1, y))
        print(2, base.SMAPE(pred2, y))
        print(3, base.SMAPE(pred3, y))
        # reload the best snapshot and score it the same way
        serializers.load_npz(om.get_path() + 'best.model', model)
        valid_iter.reset()
        pred_valid = []
        with chainer.no_backprop_mode():
            for batch in valid_iter:
                x, _ = concat_examples(batch, args.gpu)
                pred_valid.append(chainer.cuda.to_cpu(model.predict(x).data))
        pred_valid = np.concatenate(pred_valid, axis=0)
        y = valid.y
        date_feats = valid.date_feats
        pred1 = (date_feats * pred_valid).sum(axis=1)
        pred2 = np.round(pred1)
        pred3 = np.zeros(pred_valid.shape[0])
        for i in range(pred_valid.shape[0]):
            pred3[i] = date_feats[i, np.argmax(pred_valid[i])]
        print(1, base.SMAPE(pred1, y))
        print(2, base.SMAPE(pred2, y))
        print(3, base.SMAPE(pred3, y))
    else:
        valid_iter.reset()
        pred_valid = []
        with chainer.no_backprop_mode():
            for batch in valid_iter:
                x, _ = concat_examples(batch, args.gpu)
                pred_valid.append(chainer.cuda.to_cpu(model.predict(x).data))
        pred_valid = np.concatenate(pred_valid, axis=0)
        date_feats = valid.date_feats
        pred = (date_feats * pred_valid).sum(axis=1)
        pred = np.round(pred)
        # pred = np.zeros(pred_valid.shape[0])
        # for i in range(pred_valid.shape[0]):
        #     pred[i] = date_feats[i, np.argmax(pred_valid[i])]
        # reshape back to one row per page over the 62-day horizon
        pred = pred.reshape((-1, 62))
        assert len(pred) == len(valid.page)
        pred_df = pd.DataFrame(pred, columns=pd.date_range('2017-09-13', '2017-11-13'), index=valid.page)
        pred_df = pred_df.reset_index()
        pred_df.to_hdf(om.get_path() + 'pred_df.h5', 'tables', complevel=9, complib='blosc')