import pandas as pd
from pathlib import Path
path = Path('rossmann')
train_df = pd.read_pickle('/mnt/rossman-fastai-sample/train_clean').drop(['index', 'Date'], axis='columns')
test_df = pd.read_pickle('/mnt/rossman-fastai-sample/test_clean')
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.pipeline import FeatureUnion, Pipeline
import numpy as np
cat_vars = [
    'Store', 'DayOfWeek', 'Year', 'Month', 'Day', 'StateHoliday', 'CompetitionMonthsOpen',
    'Promo2Weeks', 'StoreType', 'Assortment', 'PromoInterval', 'CompetitionOpenSinceYear', 'Promo2SinceYear',
    'State', 'Week', 'Events', 'Promo_fw', 'Promo_bw', 'StateHoliday_fw', 'StateHoliday_bw',
    'SchoolHoliday_fw', 'SchoolHoliday_bw'
]
cont_vars = [
    'CompetitionDistance', 'Max_TemperatureC', 'Mean_TemperatureC', 'Min_TemperatureC',
    'Max_Humidity', 'Mean_Humidity', 'Min_Humidity', 'Max_Wind_SpeedKm_h',
    'Mean_Wind_SpeedKm_h', 'CloudCover', 'trend', 'trend_DE',
    'AfterStateHoliday', 'BeforeStateHoliday', 'Promo', 'SchoolHoliday'
]
target_var = 'Sales'
class ColumnFilter:
    # Selects the modeled columns: the categorical variables followed by the continuous ones.
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X.loc[:, cat_vars + cont_vars]
class GroupLabelEncoder:
    # Integer-encodes every categorical column with its own LabelEncoder,
    # treating missing values as an explicit 'N/A' level.
    def fit(self, X, y=None):
        self.encoders = {col: None for col in X.columns if col in cat_vars}
        for col in self.encoders:
            self.encoders[col] = LabelEncoder().fit(
                X[col].fillna(value='N/A').values
            )
        return self
    def transform(self, X):
        categorical_part = np.hstack([
            self.encoders[col].transform(X[col].fillna(value='N/A').values)[:, np.newaxis]
            for col in cat_vars
        ])
        return pd.DataFrame(categorical_part, columns=cat_vars)
class GroupNullImputer:
    # Zero-fills missing values in the continuous columns.
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X.loc[:, cont_vars].fillna(0)
class Preprocessor:
    # Chains the transformers above. The output frame holds the label-encoded
    # categorical columns (cat_vars) followed by the zero-imputed continuous
    # columns (cont_vars), which is the column order the model expects.
    def __init__(self):
        self.cf = ColumnFilter()
        self.gne = GroupNullImputer()
    def fit(self, X, y=None):
        self.gle = GroupLabelEncoder().fit(X, y=None)
        return self
    def transform(self, X):
        X_out = self.cf.transform(X)
        X_out = np.hstack((self.gle.transform(X_out).values, self.gne.transform(X_out).values))
        X_out = pd.DataFrame(X_out, columns=cat_vars + cont_vars)
        return X_out
X_train_sample = Preprocessor().fit(train_df).transform(train_df)
y_train_sample = train_df[target_var]
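# Sanity check (not in the original gist): the model below assumes that the
# first len(cat_vars) columns are the label-encoded categoricals and the
# remaining len(cont_vars) columns are the continuous features, in exactly
# this order. These assertions should hold given the transformers above.
assert list(X_train_sample.columns) == cat_vars + cont_vars
assert X_train_sample.shape[1] == len(cat_vars) + len(cont_vars)  # 22 + 16 = 38
assert not X_train_sample.isnull().any().any()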
import torch
from torch import nn
import torch.utils.data
# ^ https://discuss.pytorch.org/t/attributeerror-module-torch-utils-has-no-attribute-data/1666
class FeedforwardTabularModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.batch_size = 512
        self.base_lr, self.max_lr = 0.001, 0.003
        self.n_epochs = 20
        self.cat_vars_embedding_vector_lengths = [
            (1115, 80), (7, 4), (3, 3), (12, 6), (31, 10), (2, 2), (25, 10), (26, 10), (4, 3),
            (3, 3), (4, 3), (23, 9), (8, 4), (12, 6), (52, 15), (22, 9), (6, 4), (6, 4), (3, 3),
            (3, 3), (8, 4), (8, 4)
        ]
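        # Note (not in the original gist): each pair above is
        # (category cardinality, embedding width) for the corresponding column
        # in cat_vars, in the same order. The cardinality must be at least the
        # number of distinct label-encoded values in that column, or
        # nn.Embedding will raise an index error at lookup time. The widths
        # sum to 199, which together with the 16 continuous variables gives
        # the 215 input features of the first Linear layer below.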
        self.loss_fn = torch.nn.MSELoss()
        self.score_fn = torch.nn.MSELoss()
        # Layer 1: embeddings, one per categorical variable. nn.ModuleList
        # registers each table so .cuda() and .parameters() pick them up.
        self.embeddings = nn.ModuleList([
            nn.Embedding(in_size, out_size)
            for in_size, out_size in self.cat_vars_embedding_vector_lengths
        ])
        # Layer 1: dropout.
        self.embedding_dropout = nn.Dropout(0.04)
        # Layer 1: batch normalization (of the continuous variables).
        self.cont_batch_norm = nn.BatchNorm1d(16, eps=1e-05, momentum=0.1)
        # Layers 2 through 9: sequential feedforward model.
        self.seq_model = nn.Sequential(
            nn.Linear(in_features=215, out_features=1000, bias=True),
            nn.ReLU(),
            nn.BatchNorm1d(1000, eps=1e-05, momentum=0.1),
            nn.Dropout(p=0.001),
            nn.Linear(in_features=1000, out_features=500, bias=True),
            nn.ReLU(),
            nn.BatchNorm1d(500, eps=1e-05, momentum=0.1),
            nn.Dropout(p=0.01),
            nn.Linear(in_features=500, out_features=1, bias=True)
        )
    def forward(self, x):
        # Layer 1: embeddings. The first len(cat_vars) columns of x hold the
        # label-encoded categorical variables, one column per embedding table.
        inp_offset = 0
        embedding_subvectors = []
        for emb in self.embeddings:
            inp = x[:, inp_offset].long()
            embedding_subvectors.append(emb(inp))  # (batch_size, embedding_dim)
            inp_offset += 1
        out_cat = torch.cat(embedding_subvectors, dim=1)  # (batch_size, 199)
        # Layer 1: dropout.
        out_cat = self.embedding_dropout(out_cat)
        # Layer 1: batch normalization (of the continuous variables).
        out_cont = self.cont_batch_norm(x[:, inp_offset:])
        out = torch.cat((out_cat, out_cont), dim=1)
        # Layers 2 through 9: sequential feedforward model.
        out = self.seq_model(out)
        return out
    def fit(self, X, y):
        self.train()
        # TODO: set a random seed to invoke determinism.
        # Cf. GH#11278
        X = torch.tensor(X, dtype=torch.float32)
        y = torch.tensor(y, dtype=torch.float32)
        # OneCycleLR with Adam.
        #
        # Implementation notes. OneCycleLR by default cycles both the learning rate /and/ the
        # momentum value.
        # Cf. https://www.kaggle.com/residentmario/one-cycle-learning-rate-schedulers
        #
        # Optimizers that don't support momentum must use a scheduler with cycle_momentum=False,
        # which disables the momentum-tuning behavior. Adam does not support momentum; it has its
        # own similar-ish thing built in.
        # Cf. https://www.kaggle.com/residentmario/keras-optimizers
        #
        # This code requires PyTorch >= 1.2 due to a bug, see GH#19003.
        optimizer = torch.optim.Adam(self.parameters(), lr=self.max_lr)
        scheduler = torch.optim.lr_scheduler.OneCycleLR(
            optimizer, self.max_lr,
            cycle_momentum=False,
            epochs=self.n_epochs,
            steps_per_epoch=int(np.ceil(len(X) / self.batch_size)),
        )
        batches = torch.utils.data.DataLoader(
            torch.utils.data.TensorDataset(X, y),
            batch_size=self.batch_size, shuffle=True
        )
        for epoch in range(self.n_epochs):
            lvs = []
            for i, (X_batch, y_batch) in enumerate(batches):
                X_batch = X_batch.cuda()
                y_batch = y_batch.cuda()
                # Forward pass, then the canonical zero_grad -> backward -> step sequence.
                optimizer.zero_grad()
                y_pred = self(X_batch).squeeze()
                loss = self.loss_fn(y_pred, y_batch)
                loss.backward()
                optimizer.step()
                scheduler.step()
                lv = loss.detach().cpu().numpy()
                lvs.append(lv)
                if i % 100 == 0:
                    print(f"Epoch {epoch + 1}/{self.n_epochs}; Batch {i}; Loss {lv}")
            print(
                f"Epoch {epoch + 1}/{self.n_epochs}; Average Loss {np.mean(lvs)}"
            )
    def predict(self, X):
        self.eval()
        with torch.no_grad():
            y_pred = self(torch.tensor(X, dtype=torch.float32).cuda())
        return y_pred.squeeze()
    def score(self, X, y):
        y_pred = self.predict(X)
        y = torch.tensor(y, dtype=torch.float32).cuda()
        return self.score_fn(y, y_pred)
model = FeedforwardTabularModel()
model.cuda()
model.fit(X_train_sample.values, y_train_sample.values)
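# Usage sketch (not part of the original gist): score and predict with the
# fitted model. Slices are used here only to keep the evaluation batch small
# enough for GPU memory; score() returns the MSE on raw Sales values.
print("Training-sample MSE:", model.score(X_train_sample.values[:10000], y_train_sample.values[:10000]).item())
print("Sample predictions:", model.predict(X_train_sample.values[:10]))
# New data (e.g. test_df) must be run through the same fitted Preprocessor so
# that its label encodings match the ones the model was trained on.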