This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Load the raw Walmart sales data; lower-case all column names so the rest
# of the pipeline can use a single naming convention.
train = pd.read_csv('train.csv', parse_dates=['Date'])
train.columns = train.columns.str.lower()

features = pd.read_csv('features.csv', parse_dates=['Date'])
features.columns = features.columns.str.lower()
# The markdown promo columns are mostly NaN; treat "no markdown" as 0.
markdown_cols = ['markdown1', 'markdown2', 'markdown3', 'markdown4', 'markdown5']
features[markdown_cols] = features[markdown_cols].fillna(0)

stores = pd.read_csv('stores.csv')
stores.columns = stores.columns.str.lower()
# Base modeling frame: train rows enriched with the features table.
# NOTE(review): this merge chain is truncated in this view (extraction
# artifact) — presumably it continues with a merge on `stores` and a closing
# paren; confirm against the full file.
df_base = ( | |
train | |
.merge(features, how='inner') |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
class WeekNumberCreator(object):
    """Transformer that adds an ISO week-number column to a DataFrame.

    ``transform`` returns a copy of ``X`` with a ``weeknumber`` column
    (1-53) derived from the datetime column named by ``datecol``.
    """

    def __init__(self, datecol):
        # Name of the datetime64 column to derive the week number from.
        self.datecol = datecol

    def transform(self, X):
        # `.dt.isocalendar().week` replaces the `DatetimeIndex.week`
        # attribute that was removed in pandas 2.0; cast back to int64 to
        # match the old attribute's dtype.
        return X.assign(
            weeknumber=lambda df: df[self.datecol].dt.isocalendar().week.astype('int64')
        )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Transformer that adds exponentially-smoothed versions of one column,
# one new column per alpha in `alpha_list`, smoothed within each group
# defined by `groupby_keys`.
# NOTE(review): transform() is truncated in this view — the `func` dict and
# the remainder of the loop body (presumably an assign and a return) are cut
# off; review against the full file.
class SmoothedVarCreator(object): | |
def __init__(self, var, groupby_keys, alpha_list): | |
# Column to smooth.
self.var = var | |
# Keys defining the per-series grouping (e.g. ['store', 'dept']).
self.groupby_keys = groupby_keys | |
# One smoothed output column is created per alpha; column name encodes
# the alpha as an integer, e.g. weekly_sales_sm1 for alpha=0.1.
self.alpha_list = alpha_list | |
def transform(self, X): | |
for alpha in self.alpha_list: | |
# NOTE(review): the lambda closes over the loop variable `alpha`
# (late-binding pitfall) and over `X` rather than its `df` argument —
# safe only if it is evaluated inside this same iteration (e.g. by an
# immediate X.assign(**func)); confirm intent in the full file.
func = { | |
'%s_sm%s' % (self.var, int(alpha * 10)): lambda df: | |
X.groupby(self.groupby_keys)[self.var].apply(lambda x: x.ewm(alpha=alpha, min_periods=0).mean()) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Transformer that creates lagged copies of columns within each group.
# `lagdict` maps a lag (in weeks) to the list of columns to shift by that
# amount; `suffix` is appended to the generated column names.
# NOTE(review): transform() is truncated in this view — the loop body after
# `for week in self.lagdict:` is cut off; review against the full file.
class LagCreator(object): | |
def __init__(self, groupby_keys, lagdict, drop_cols=True, suffix=''): | |
# Keys defining the per-series grouping for the shift.
self.groupby_keys = groupby_keys | |
# Mapping {lag_in_weeks: [columns to lag]}.
self.lagdict = lagdict | |
# Whether to drop the original (un-lagged) columns afterwards.
self.drop_cols = drop_cols | |
# Suffix appended to generated column names (e.g. 'ly' for last-year lags).
self.suffix = suffix | |
def transform(self, X): | |
# Union of all columns mentioned in lagdict; candidates for dropping.
cols_to_drop = list(set([item for sublist in | |
self.lagdict.values() for item in sublist])) | |
for week in self.lagdict: |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Feature-engineering pipeline: each step's transform() feeds the next.
from bdranalytics.pandaspipeline.transformers import PdFeatureChain

pipeline_pd = PdFeatureChain([
    # Derive the ISO week number from the date column.
    # (Fixed: the class defined above is WeekNumberCreator, not WeekNumberAdder.)
    ('wna', WeekNumberCreator(datecol='date')),
    # Exponentially-smoothed sales per (store, dept), one column per alpha.
    ('svc', SmoothedVarCreator(var='weekly_sales',
                               groupby_keys=['store', 'dept'],
                               alpha_list=[0.1, 0.3])),
    # Year-over-year lags: grouping on weeknumber matches the same calendar
    # week across years.
    ('lc1', LagCreator(groupby_keys=['store', 'dept', 'weeknumber'],
                       lagdict=lagdict_yearly, drop_cols=False, suffix='ly')),
    # Short-horizon lags within each (store, dept) series.
    ('lc2', LagCreator(groupby_keys=['store', 'dept'], lagdict=lagdict)),
])

# Lagging/smoothing warm-up produces NaNs at series starts; drop those rows.
df_features = pipeline_pd.transform(df_base).dropna()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Time-series cross-validation with a growing training window: each fold
# trains on everything before a 7-day test interval, starting 2011-03-01.
# timestamps='index' means the split is computed from the DataFrame index,
# hence the set_index('date') below.
from bdranalytics.model_selection.growingwindow import IntervalGrowingWindow

igw = IntervalGrowingWindow(test_start_date='2011-03-01',
                            timestamps='index',
                            test_size='7 days')
cv = igw.split(df_features.set_index('date'))
# Materialize the generator so the splits can be reused across models.
multiple_ts_split = list(cv)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def force_float_df(df):
    """Coerce every column of ``df`` to a numeric representation.

    Columns castable to float are cast; anything else (e.g. string columns)
    is label-encoded with ``pd.factorize`` so the frame becomes fully
    numeric for the model. Returns a new DataFrame.
    """
    def force_float_col(col):
        try:
            return col.astype(float)
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # are not swallowed; astype raises ValueError/TypeError on
        # non-numeric data.
        except (ValueError, TypeError):
            # Non-numeric column: replace values with integer codes.
            return pd.factorize(col)[0]
    return df.apply(force_float_col)

# Forward-looking sales targets (the prediction horizons).
forward_sale_cols = ['weekly_sales1', 'weekly_sales2']
# Non-feature columns to drop before training.
drop_cols = ['date']
all_drop_cols = forward_sale_cols + drop_cols
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Index(['store', 'dept', 'weeknumber', 'weekly_sales_2', 'weekly_sales_1',
       'weekly_sales_sm1_1', 'weekly_sales_sm3_1', 'cpi0', 'fuel_price0',
       'isholiday0', 'markdown10', 'markdown20', 'markdown30', 'markdown40',
       'markdown50', 'temperature0', 'unemployment0', 'weekly_sales0',
       'isholiday1', 'temperature1', 'weekly_sales_1ly1', 'isholiday2',
       'temperature2', 'weekly_sales_1ly2'],
      dtype='object')
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def neg_sq_loss(labels, preds):
    """Custom LightGBM objective: asymmetric squared loss.

    Gradient and hessian of ``L(err) = (err * (sign(err) + a))**2`` with
    ``err = preds - labels`` and ``a = -0.1``: over-predictions (err > 0,
    factor 0.9**2) are penalized slightly less than under-predictions
    (err < 0, factor 1.1**2). LightGBM expects ``(grad, hess)`` arrays.
    """
    a = -0.1  # asymmetry: shrinks the penalty on the positive-error side
    err = preds - labels
    grad = 2 * err * (np.sign(err) + a) ** 2
    hess = 2 * (np.sign(err) + a) ** 2
    return grad, hess
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from sklearn.multioutput import MultiOutputRegressor
import lightgbm as lgb

# Gradient-boosted trees using the custom asymmetric objective defined above.
gbm = lgb.LGBMRegressor(
    objective=neg_sq_loss,   # custom (grad, hess) objective
    n_jobs=-1,               # use all cores
    num_leaves=51,
    learning_rate=0.02,
    n_estimators=500,
    # NOTE(review): categorical_feature is normally a fit() parameter;
    # passing it to the constructor relies on LGBMRegressor forwarding
    # unknown kwargs — confirm with the installed lightgbm version.
    categorical_feature=[0, 1, 2]
)