@zaburo-ch · Created November 16, 2017
Solution for the Kaggle web-traffic-time-series-forecasting competition
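The script imports a local helper module named base that is not included in the gist. From its usage below it has to provide a WORKING_DIR string pointing at the directory that holds train_1.h5 and train_2.h5, an OutputManager class whose get_path() returns a per-run output directory (ending in a path separator, since filenames are appended with +), and a SMAPE metric. A minimal stand-in might look like the following sketch; every name, default and detail here is inferred from the script, not taken from the author's actual base module.

# base.py -- hypothetical stand-in for the author's local helper module,
# reconstructed only from how it is used in the script below.
import json
import os
from datetime import datetime

import numpy as np

WORKING_DIR = './data/'  # assumed: directory containing train_1.h5 and train_2.h5


def SMAPE(y_pred, y_true, eps=1e-5):
    # symmetric mean absolute percentage error in [0, 2], matching the
    # per-candidate SMAPE used as the training signal in the script
    return np.mean(2 * np.abs(y_pred - y_true) / (np.abs(y_pred) + np.abs(y_true) + eps))


class OutputManager(object):
    # creates a fresh output directory per run and dumps the argparse arguments
    def __init__(self, params, root='./output/'):
        self._path = os.path.join(root, datetime.now().strftime('%Y%m%d_%H%M%S')) + '/'
        os.makedirs(self._path, exist_ok=True)
        with open(os.path.join(self._path, 'params.json'), 'w') as f:
            json.dump(params, f, indent=2)

    def get_path(self):
        return self._path

The timestamped directory is just one convenient way to keep runs separate; any get_path() that returns an existing, slash-terminated directory would work with the script below.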
import numpy as np
np.random.seed(1024)
import pandas as pd
import chainer
from chainer import serializers
from chainer.optimizers import Adam
import chainer.functions as F
import chainer.links as L
from chainer import reporter
from chainer.datasets import TupleDataset
from chainer import training
from chainer.training import extensions
from chainer.dataset import concat_examples
import base  # local helper module: provides WORKING_DIR, OutputManager and SMAPE
import argparse
from collections import OrderedDict
import re
import six
# mu / sigma standardize log1p(page views) in get_features; eps guards the SMAPE denominator
mu = 4.4126225
sigma = 2.4928892
eps = 1e-5
class MLP(chainer.Chain):
    """MLP that outputs softmax weights over the candidate date features."""

    def __init__(self, in_size, hidden_size, out_size, large=False):
        super(MLP, self).__init__()
        self.large = large
        with self.init_scope():
            if large:
                self.l1 = L.Linear(in_size, hidden_size)
                self.bn1 = L.BatchNormalization(hidden_size)
                self.l2 = L.Linear(hidden_size, hidden_size)
                self.bn2 = L.BatchNormalization(hidden_size)
                self.l3 = L.Linear(hidden_size, hidden_size)
                self.bn3 = L.BatchNormalization(hidden_size)
                self.l4 = L.Linear(hidden_size, out_size)
            else:
                self.l1 = L.Linear(in_size, hidden_size)
                self.l2 = L.Linear(hidden_size, hidden_size)
                self.l3 = L.Linear(hidden_size, out_size)

    def predict(self, x):
        if self.large:
            x = self.l1(x)
            x = self.bn1(x)
            x = F.relu(x)
            x = self.l2(x)
            x = self.bn2(x)
            x = F.relu(x)
            x = self.l3(x)
            x = self.bn3(x)
            x = F.relu(x)
            x = self.l4(x)
        else:
            x = self.l1(x)
            x = F.relu(x)
            x = self.l2(x)
            x = F.relu(x)
            x = self.l3(x)
        return F.softmax(x)

    def __call__(self, x, d):
        # d holds the SMAPE of each candidate forecast; the loss is the
        # expected SMAPE under the predicted softmax weights.
        y_pred = self.predict(x)
        approx_smape = F.sum(y_pred * d, axis=1)
        loss = F.mean(approx_smape)
        reporter.report({'loss': loss, 'approx_smape': approx_smape}, self)
        return loss


class PageViewDataset:
    """Pairs of (features, per-candidate SMAPE), one example per page and forecast day."""

    def __init__(self, purpose='valid', data_type='train', gap=0):
        gap = 2  # NOTE: hard-coded, overriding the gap argument
        need_y = (purpose == 'valid' or data_type == 'train')
        if purpose == 'valid':
            data = pd.read_hdf(base.WORKING_DIR + 'train_1.h5', 'tables')
            y_start = 440
        elif purpose == 'test':
            data = pd.read_hdf(base.WORKING_DIR + 'train_2.h5', 'tables')
            y_start = 805
        else:
            raise NotImplementedError
        if data_type == 'train':
            y_start -= 62 + gap
        self.date_str = pd.Series(data.columns[1:])
        self.date = pd.to_datetime(pd.Series(data.columns[1:]))
        self.page = data['Page']
        data = data.drop('Page', axis=1)
        data = data.values.astype(np.float32)
        if need_y:
            task_feats, date_feats, y = self.get_features(data, purpose, y_start, gap, True)

            def get_smape_each_feat(y_pred, y_true):
                y_true = y_true[..., None]
                smape = 2 * np.abs(y_pred - y_true) / (np.abs(y_pred) + y_true + eps)
                return smape

            y = y.reshape(-1)
            self.y = y
            date_feats = date_feats.reshape(-1, date_feats.shape[2])
            smape = get_smape_each_feat(date_feats, y)
        else:
            task_feats, date_feats = self.get_features(data, purpose, y_start, gap, False)
            date_feats = date_feats.reshape(-1, date_feats.shape[2])
        self.date_feats = date_feats
        task_feats = np.tile(task_feats[:, None, :], (1, 62, 1))
        task_date_feats = np.zeros((task_feats.shape[0], 62, 2))
        task_date_feats[:, :, 0] = self.week_of_y / 6
        task_date_feats[:, :, 1] = np.arange(62) / 61
        total_feats = np.concatenate([task_feats, task_date_feats], axis=2)
        total_feats = total_feats.reshape(-1, total_feats.shape[2])
        total_feats = total_feats.astype(np.float32)
        if need_y:
            self._datasets = (total_feats, smape)
        else:
            self._datasets = (total_feats,)
        self._length = len(self._datasets[0])

    def get_features(self, data, purpose, y_start, gap, return_y=True):
        n = data.shape[0]
        x_stop = y_start - gap
        nan_count = np.mean(~np.isfinite(data[:, x_stop - 7:x_stop]), axis=1)
        data[~np.isfinite(data)] = 0  # destructive assignment !!!!
        zero_count = np.mean(data[:, x_stop - 7:x_stop] == 0, axis=1)
        if return_y:
            y = data[:, y_start:y_start + 62]
        date_feat_num = 17
        date_feats = np.empty((n, 62, date_feat_num), dtype=np.float32)

        # weekly median
        def weekly_median(week_num):
            term = data[:, x_stop - (7 * week_num):x_stop]
            med = np.median(term.reshape(n, week_num, 7), axis=1)
            return np.tile(med, 10)[:, gap % 7:gap % 7 + 62]

        date_feats[:, :, 0] = weekly_median(1)
        date_feats[:, :, 1] = weekly_median(2)
        date_feats[:, :, 2] = weekly_median(4)
        date_feats[:, :, 3] = weekly_median(8)
        # median of the weekly medians
        date_feats[:, :, 4] = np.median(date_feats[:, :, 0:2], axis=2)
        date_feats[:, :, 5] = np.median(date_feats[:, :, 0:4], axis=2)
        # # auto reg
        # date_feats[:, :, 4] = date_feats[:, :, 0] - date_feats[:, :, 1]
        # date_feats[:, :, 5] = date_feats[:, :, 0] - date_feats[:, :, 3]
        # same period last year
        one_year_back = 366 if purpose == 'valid' else 365
        date_feats[:, :, 6] = data[:, y_start - one_year_back:y_start - one_year_back + 62]
        # dayofweek of self.date[y_start - 77:y_start + 62 - 77] equals
        # that of self.date[y_start:y_start + 62]
        self.week_of_y = self.date[y_start - 77:y_start + 62 - 77].dt.dayofweek

        # weekend / weekday medians over windows of several lengths
        def assign_weekend_or_weekday(i, term_length):
            term = data[:, x_stop - term_length:x_stop]
            week_of_term = self.date[x_stop - term_length:x_stop].dt.dayofweek
            date_feats[:, self.week_of_y >= 5, i] = np.median(term[:, np.where(week_of_term >= 5)[0]], axis=1)[:, None]
            date_feats[:, self.week_of_y < 5, i] = np.median(term[:, np.where(week_of_term < 5)[0]], axis=1)[:, None]

        # define the windows according to Ehsan's kernel
        r = 1.61803398875
        windows = np.round(r ** np.arange(0, 9) * 7).astype(int)
        for i, w in enumerate(windows):
            assign_weekend_or_weekday(i + 7, w)
        # median of the window medians
        date_feats[:, :, 16] = np.median(date_feats[:, :, 7:16], axis=2)

        # standardize for the task feats
        data = (np.log1p(data) - mu) / sigma
        task_feat_num = 16
        task_feats = np.empty((n, task_feat_num), dtype=np.float32)
        # count feats
        task_feats[:, 0] = nan_count
        task_feats[:, 1] = zero_count
        # short term volatility
        task_feats[:, 2] = np.std(data[:, x_stop - 7:x_stop], axis=1)
        # latest diff
        task_feats[:, 3] = data[:, x_stop - 1] - data[:, x_stop - 2]
        # median
        task_feats[:, 4] = np.median(data[:, x_stop - 7:x_stop], axis=1)
        task_feats[:, 5] = np.median(data[:, x_stop - 30:x_stop], axis=1)
        task_feats[:, 6] = np.median(data[:, x_stop - 60:x_stop], axis=1)
        # 90th percentile
        task_feats[:, 7] = np.percentile(data[:, x_stop - 7:x_stop], 90, axis=1)
        task_feats[:, 8] = np.percentile(data[:, x_stop - 30:x_stop], 90, axis=1)
        task_feats[:, 9] = np.percentile(data[:, x_stop - 60:x_stop], 90, axis=1)
        # auto reg
        task_feats[:, 10] = task_feats[:, 4] - task_feats[:, 5]
        task_feats[:, 11] = task_feats[:, 4] - task_feats[:, 6]
        # argmax pos
        task_feats[:, 12] = np.argmax(data[:, x_stop - 30:x_stop], axis=1) / 29
        task_feats[:, 13] = np.argmax(data[:, x_stop - 60:x_stop], axis=1) / 59
        # diff between max and latest
        task_feats[:, 14] = data[:, x_stop - 1] - np.max(data[:, x_stop - 30:x_stop], axis=1)
        task_feats[:, 15] = data[:, x_stop - 1] - np.max(data[:, x_stop - 60:x_stop], axis=1)

        # one-hot features parsed from the Page string (project / access / agent)
        task_dummy_feats = {}
        pat = re.compile(r'(.*)_([^.]+)\.[^.]+.org_(.*)_(.*)')
        splits = self.page.map(lambda x: pat.match(x).groups()).tolist()
        splits = pd.DataFrame(splits, columns=['name', 'country', 'access', 'agent'])

        def add_dummies(prefix):
            df = pd.get_dummies(splits[prefix], prefix=prefix)
            for col in df.columns:
                task_dummy_feats[col] = df[col]

        add_dummies('country')
        add_dummies('access')
        add_dummies('agent')
        task_feats = np.concatenate([task_feats, pd.DataFrame(task_dummy_feats).values], axis=1)
        if return_y:
            return task_feats, date_feats, y
        else:
            return task_feats, date_feats

    def __getitem__(self, index):
        batches = [dataset[index] for dataset in self._datasets]
        if isinstance(index, slice):
            length = len(batches[0])
            return [tuple([batch[i] for batch in batches])
                    for i in six.moves.range(length)]
        else:
            return tuple(batches)

    def __len__(self):
        return self._length


if __name__ == '__main__':
    chainer.set_debug(True)
    chainer.config.meta_train = True
    # TODO : write argparse description
    parser = argparse.ArgumentParser()
    parser.add_argument('-batch_size', default=256, type=int)
    parser.add_argument('-n_iter', default=100, type=int)
    parser.add_argument('-valid_interval', default=1, type=int)
    parser.add_argument('-valid_batch_size', default=1024, type=int)
    parser.add_argument('-save_interval', default=1, type=int)
    parser.add_argument('-gpu', default=-1, type=int)
    parser.add_argument('-large', action='store_true')
    parser.add_argument('-description', default='no description')
    parser.add_argument('-purpose', default='valid')
    args = parser.parse_args()
    om = base.OutputManager(vars(args))

    train = PageViewDataset(args.purpose, 'train')
    valid = PageViewDataset(args.purpose, 'test')
    model = MLP(train._datasets[0].shape[1], 256, train._datasets[1].shape[1], args.large)
    # transfer the model to the GPU
    if args.gpu >= 0:
        chainer.cuda.get_device_from_id(args.gpu).use()
        model.to_gpu(args.gpu)
        # chainer.cuda.to_gpu(train._datasets[0], args.gpu)
        # chainer.cuda.to_gpu(train._datasets[1], args.gpu)
    optimizer = Adam()
    optimizer.setup(model)

    train_iter = chainer.iterators.SerialIterator(train, args.batch_size, repeat=True, shuffle=True)
    valid_iter = chainer.iterators.SerialIterator(valid, args.valid_batch_size, repeat=False, shuffle=False)
    updater = training.StandardUpdater(train_iter, optimizer, device=args.gpu)
    trainer = training.Trainer(updater, (args.n_iter, 'epoch'), out=om.get_path())
    trainer.extend(extensions.dump_graph('main/loss'))
    trainer.extend(extensions.snapshot(), trigger=(args.save_interval, 'epoch'))
    trainer.extend(extensions.LogReport())
    trainer.extend(extensions.ProgressBar())
    if args.purpose == 'valid':
        trainer.extend(extensions.Evaluator(valid_iter, model, device=args.gpu),
                       trigger=(args.valid_interval, 'epoch'))
        trainer.extend(extensions.PrintReport(
            ['epoch', 'main/loss', 'validation/main/loss', 'elapsed_time']))

        @training.make_extension(trigger=training.triggers.MinValueTrigger(
            'validation/main/loss', trigger=(args.valid_interval, 'epoch')), priority=-100)
        def save_base_model(trainer):
            print('save best')
            serializers.save_npz(om.get_path() + 'best.model', model)

        trainer.extend(save_base_model)
    else:
        trainer.extend(extensions.PrintReport(
            ['epoch', 'main/loss', 'elapsed_time']))
    trainer.run()
    if args.purpose == 'valid':
        # evaluate on the held-out 62-day window, first with the last-epoch
        # model and then with the best snapshot saved during training
        valid_iter.reset()
        pred_valid = []
        with chainer.no_backprop_mode():
            for batch in valid_iter:
                x, _ = concat_examples(batch, args.gpu)
                pred_valid.append(chainer.cuda.to_cpu(model.predict(x).data))
        pred_valid = np.concatenate(pred_valid, axis=0)
        y = valid.y
        date_feats = valid.date_feats
        # 1: weighted sum of candidates, 2: rounded, 3: single best candidate
        pred1 = (date_feats * pred_valid).sum(axis=1)
        pred2 = np.round(pred1)
        pred3 = np.zeros(pred_valid.shape[0])
        for i in range(pred_valid.shape[0]):
            pred3[i] = date_feats[i, np.argmax(pred_valid[i])]
        print(1, base.SMAPE(pred1, y))
        print(2, base.SMAPE(pred2, y))
        print(3, base.SMAPE(pred3, y))

        serializers.load_npz(om.get_path() + 'best.model', model)
        valid_iter.reset()
        pred_valid = []
        with chainer.no_backprop_mode():
            for batch in valid_iter:
                x, _ = concat_examples(batch, args.gpu)
                pred_valid.append(chainer.cuda.to_cpu(model.predict(x).data))
        pred_valid = np.concatenate(pred_valid, axis=0)
        y = valid.y
        date_feats = valid.date_feats
        pred1 = (date_feats * pred_valid).sum(axis=1)
        pred2 = np.round(pred1)
        pred3 = np.zeros(pred_valid.shape[0])
        for i in range(pred_valid.shape[0]):
            pred3[i] = date_feats[i, np.argmax(pred_valid[i])]
        print(1, base.SMAPE(pred1, y))
        print(2, base.SMAPE(pred2, y))
        print(3, base.SMAPE(pred3, y))
    else:
        # purpose == 'test': predict the 62-day submission window and save it
        valid_iter.reset()
        pred_valid = []
        with chainer.no_backprop_mode():
            for batch in valid_iter:
                x, = concat_examples(batch, args.gpu)  # the test dataset yields features only
                pred_valid.append(chainer.cuda.to_cpu(model.predict(x).data))
        pred_valid = np.concatenate(pred_valid, axis=0)
        date_feats = valid.date_feats
        pred = (date_feats * pred_valid).sum(axis=1)
        pred = np.round(pred)
        # pred = np.zeros(pred_valid.shape[0])
        # for i in range(pred_valid.shape[0]):
        #     pred[i] = date_feats[i, np.argmax(pred_valid[i])]
        pred = pred.reshape((-1, 62))
        assert len(pred) == len(valid.page)
        pred_df = pd.DataFrame(pred, columns=pd.date_range('2017-09-13', '2017-11-13'), index=valid.page)
        pred_df = pred_df.reset_index()
        pred_df.to_hdf(om.get_path() + 'pred_df.h5', 'tables', complevel=9, complib='blosc')
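
Usage: assuming the file is saved as, say, solution.py (the gist does not record a filename) and train_1.h5 / train_2.h5 sit under base.WORKING_DIR, a local validation run would be python solution.py -purpose valid -gpu 0 -large, and a submission run python solution.py -purpose test -gpu 0 -large; the test branch writes the 62-day forecast per page to pred_df.h5 inside the run's output directory.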