Skip to content

Instantly share code, notes, and snippets.

View StevenReitsma's full-sized avatar

Steven Reitsma StevenReitsma

View GitHub Profile
@StevenReitsma
StevenReitsma / Blogpost-Heineken2.py
Created February 23, 2018 09:47
Blogpost-Heineken2
# Load the three input tables and normalise every column name to lower case.
train = pd.read_csv('train.csv', parse_dates=['Date']).rename(columns=str.lower)

features = pd.read_csv('features.csv', parse_dates=['Date']).rename(columns=str.lower)
# Missing markdown values are replaced by 0 in all five markdown columns.
markdown_cols = ['markdown%d' % i for i in range(1, 6)]
features[markdown_cols] = features[markdown_cols].fillna(0)

stores = pd.read_csv('stores.csv').rename(columns=str.lower)
df_base = (
train
.merge(features, how='inner')
@StevenReitsma
StevenReitsma / Blogpost-Heineken3.py
Created February 23, 2018 11:41
Blogpost-Heineken3
class WeekNumberCreator(object):
    """Transformer that adds a ``weeknumber`` column taken from a date column."""

    def __init__(self, datecol):
        """Remember the name of the datetime column to read week numbers from.

        Args:
            datecol: Name of a datetime-typed column in the frames passed to
                :meth:`transform`.
        """
        self.datecol = datecol

    def transform(self, X):
        """Return a copy of ``X`` with an added ``weeknumber`` column.

        Args:
            X: DataFrame containing the datetime column ``self.datecol``.

        Returns:
            A new DataFrame; ``X`` itself is not modified.
        """
        dates = X[self.datecol].dt
        try:
            # ``DatetimeIndex.week`` / ``Series.dt.week`` were deprecated in
            # pandas 1.1 and removed in 2.0; ``isocalendar()`` replaces them.
            weeks = dates.isocalendar().week
        except AttributeError:
            # Older pandas without ``isocalendar()``.
            weeks = dates.week
        # ``isocalendar()`` yields a nullable UInt32 column; convert back to
        # the dtype the removed accessor produced (int64, or float when some
        # dates are missing).
        weeks = weeks.astype('float64' if weeks.isna().any() else 'int64')
        return X.assign(weeknumber=weeks)
@StevenReitsma
StevenReitsma / Blogpost-Heineken4.py
Created February 23, 2018 11:42
Blogpost-Heineken4
class SmoothedVarCreator(object):
def __init__(self, var, groupby_keys, alpha_list):
self.var = var
self.groupby_keys = groupby_keys
self.alpha_list = alpha_list
def transform(self, X):
for alpha in self.alpha_list:
func = {
'%s_sm%s' % (self.var, int(alpha * 10)): lambda df:
X.groupby(self.groupby_keys)[self.var].apply(lambda x: x.ewm(alpha=alpha, min_periods=0).mean())
@StevenReitsma
StevenReitsma / Blogpost-Heineken5.py
Created February 23, 2018 11:45
Blogpost-Heineken5
class LagCreator(object):
def __init__(self, groupby_keys, lagdict, drop_cols=True, suffix=''):
self.groupby_keys = groupby_keys
self.lagdict = lagdict
self.drop_cols = drop_cols
self.suffix = suffix
def transform(self, X):
cols_to_drop = list(set([item for sublist in
self.lagdict.values() for item in sublist]))
for week in self.lagdict:
@StevenReitsma
StevenReitsma / Blogpost-Heineken6.py
Created February 23, 2018 11:47
Blogpost-Heineken6
from bdranalytics.pandaspipeline.transformers import PdFeatureChain
# Feature-engineering chain; the steps are listed in the order given to
# PdFeatureChain. `lagdict_yearly`, `lagdict` and `df_base` are defined
# elsewhere.
pipeline_pd = PdFeatureChain([
    # The week-number transformer defined for this pipeline is
    # WeekNumberCreator; it adds the 'weeknumber' column that 'lc1' groups on.
    ('wna', WeekNumberCreator(datecol='date')),
    ('svc', SmoothedVarCreator(var='weekly_sales', groupby_keys=['store', 'dept'], alpha_list=[0.1, 0.3])),
    ('lc1', LagCreator(groupby_keys=['store', 'dept', 'weeknumber'], lagdict=lagdict_yearly, drop_cols=False, suffix='ly')),
    ('lc2', LagCreator(groupby_keys=['store', 'dept'], lagdict=lagdict))
])
# Rows that contain any missing value after the transformation are dropped.
df_features = pipeline_pd.transform(df_base).dropna()
@StevenReitsma
StevenReitsma / Blogpost-Heineken7.py
Created February 23, 2018 11:51
Blogpost-Heineken7
from bdranalytics.model_selection.growingwindow import IntervalGrowingWindow
# Cross-validation splitter configured with a test start date of 2011-03-01
# and a test size of '7 days', reading timestamps from the frame's index.
igw = IntervalGrowingWindow(
    timestamps='index',
    test_start_date='2011-03-01',
    test_size='7 days',
)
# The splitter is told to use the index, so 'date' is moved there first.
features_by_date = df_features.set_index('date')
cv = igw.split(features_by_date)
# Materialise the splits into a list so they can be reused.
multiple_ts_split = list(cv)
@StevenReitsma
StevenReitsma / Blogpost-Heineken8.py
Created February 23, 2018 11:52
Blogpost-Heineken8
def force_float_df(df):
    """Return a copy of ``df`` in which every column is numeric.

    Each column is cast to float when possible. A column that cannot be cast
    is replaced by the integer codes produced by ``pd.factorize``.

    Args:
        df: DataFrame to convert.

    Returns:
        A new DataFrame with the same index and column labels as ``df``.
    """
    def force_float_col(x):
        try:
            return x.astype(float)
        except (ValueError, TypeError):
            # Only conversion failures trigger the fallback; anything else
            # (e.g. KeyboardInterrupt) propagates instead of being swallowed.
            codes = pd.factorize(x)[0]
            # Wrap the codes in a Series so the original index is kept even
            # when no column of the frame is convertible to float.
            return pd.Series(codes, index=x.index, name=x.name)
    return df.apply(force_float_col)
# Column-name lists: the two forward sales columns, the date column, and
# the concatenation of both (forward sales first, then the date).
forward_sale_cols = ['weekly_sales%d' % step for step in (1, 2)]
drop_cols = ['date']
all_drop_cols = [*forward_sale_cols, *drop_cols]
@StevenReitsma
StevenReitsma / Blogpost-Heineken9.py
Created February 23, 2018 11:54
Blogpost-Heineken9
Index(['store', 'dept', 'weeknumber', 'weekly_sales_2', 'weekly_sales_1',
'weekly_sales_sm1_1', 'weekly_sales_sm3_1', 'cpi0', 'fuel_price0',
'isholiday0', 'markdown10', 'markdown20', 'markdown30', 'markdown40',
'markdown50', 'temperature0', 'unemployment0', 'weekly_sales0',
'isholiday1', 'temperature1', 'weekly_sales_1ly1', 'isholiday2',
'temperature2', 'weekly_sales_1ly2'],
dtype='object')
@StevenReitsma
StevenReitsma / Blogpost-Heineken10.py
Created February 23, 2018 11:56
Blogpost-Heineken10
def neg_sq_loss(labels, preds):
    """Return the gradient and hessian of an asymmetric squared-error loss.

    The squared error is scaled by ``(sign(preds - labels) + a) ** 2`` with
    ``a = -0.1``, so errors where ``preds < labels`` get the larger factor
    (1.21) and errors where ``preds > labels`` the smaller one (0.81).

    Args:
        labels: Array of target values.
        preds: Array of predicted values.

    Returns:
        Tuple ``(grad, hess)`` of arrays.
    """
    a = -0.1
    residual = preds - labels
    # Per-sample weight shared by the gradient and the hessian.
    weight = (np.sign(residual) + a) ** 2
    return 2 * residual * weight, 2 * weight
@StevenReitsma
StevenReitsma / Blogpost-Heineken11.py
Created February 23, 2018 11:57
Blogpost-Heineken11
from sklearn.multioutput import MultiOutputRegressor
import lightgbm as lgb
# Settings for the LightGBM regressor, which uses the custom objective
# `neg_sq_loss` in place of a built-in loss.
gbm_params = {
    'objective': neg_sq_loss,
    'num_leaves': 51,
    'learning_rate': 0.02,
    'n_estimators': 500,
    'n_jobs': -1,
    # Feature columns at positions 0, 1 and 2 are passed as categorical.
    'categorical_feature': [0, 1, 2],
}
gbm = lgb.LGBMRegressor(**gbm_params)