import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

train_df = pd.read_pickle('/mnt/rossman-fastai-sample/train_clean').drop(['index', 'Date'], axis='columns')
test_df = pd.read_pickle('/mnt/rossman-fastai-sample/test_clean')
cat_vars = [
    'Store', 'DayOfWeek', 'Year', 'Month', 'Day', 'StateHoliday', 'CompetitionMonthsOpen',
    'Promo2Weeks', 'StoreType', 'Assortment', 'PromoInterval', 'CompetitionOpenSinceYear', 'Promo2SinceYear',
    'State', 'Week', 'Events', 'Promo_fw', 'Promo_bw', 'StateHoliday_fw', 'StateHoliday_bw',
    'SchoolHoliday_fw', 'SchoolHoliday_bw'
]
cont_vars = [
    'CompetitionDistance', 'Max_TemperatureC', 'Mean_TemperatureC', 'Min_TemperatureC',
    'Max_Humidity', 'Mean_Humidity', 'Min_Humidity', 'Max_Wind_SpeedKm_h',
    'Mean_Wind_SpeedKm_h', 'CloudCover', 'trend', 'trend_DE',
    'AfterStateHoliday', 'BeforeStateHoliday', 'Promo', 'SchoolHoliday'
]
target_var = 'Sales'
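
# Optional sanity check: the embedding widths hard-coded in
# FeedforwardTabularModel below should track the cardinality of each
# categorical column, so it's worth eyeballing the counts (this assumes
# train_df contains every column named in cat_vars).
for col in cat_vars:
    print(col, train_df[col].nunique())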
class ColumnFilter:
    """Restricts the input frame to the modeled categorical and continuous columns."""
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X.loc[:, cat_vars + cont_vars]


class GroupLabelEncoder:
    """Integer-encodes every categorical column, treating nulls as an 'N/A' category."""
    def fit(self, X, y=None):
        self.encoders = {col: None for col in X.columns if col in cat_vars}
        for col in self.encoders:
            self.encoders[col] = LabelEncoder().fit(
                X[col].fillna(value='N/A').values
            )
        return self

    def transform(self, X):
        categorical_part = np.hstack([
            self.encoders[col].transform(X[col].fillna(value='N/A').values)[:, np.newaxis]
            for col in cat_vars
        ])
        return pd.DataFrame(categorical_part, columns=cat_vars)


class GroupNullImputer:
    """Zero-fills nulls in the continuous columns."""
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X.loc[:, cont_vars].fillna(0)


class Preprocessor:
    """Chains the transformers above: column filter, label encoder, null imputer."""
    def __init__(self):
        self.cf = ColumnFilter()
        self.gne = GroupNullImputer()

    def fit(self, X, y=None):
        self.gle = GroupLabelEncoder().fit(X)
        return self

    def transform(self, X):
        X_out = self.cf.transform(X)
        X_out = np.hstack((self.gle.transform(X_out).values, self.gne.transform(X_out).values))
        X_out = pd.DataFrame(X_out, columns=cat_vars + cont_vars)
        return X_out
X_train_sample = Preprocessor().fit(train_df).transform(train_df)
y_train_sample = train_df[target_var]
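
# Optional sanity check on the preprocessed frame (illustrative; names match
# the code above): the columns should be cat_vars followed by cont_vars, and
# the continuous block should contain no remaining nulls.
assert list(X_train_sample.columns) == cat_vars + cont_vars
assert X_train_sample[cont_vars].isnull().sum().sum() == 0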
import torch
from torch import nn
import torch.utils.data
# ^ https://discuss.pytorch.org/t/attributeerror-module-torch-utils-has-no-attribute-data/1666
class FeedforwardTabularModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.batch_size = 512
        self.base_lr, self.max_lr = 0.001, 0.003
        self.n_epochs = 20
        # One (cardinality, embedding width) pair per entry in cat_vars.
        self.cat_vars_embedding_vector_lengths = [
            (1115, 80), (7, 4), (3, 3), (12, 6), (31, 10), (2, 2), (25, 10), (26, 10), (4, 3),
            (3, 3), (4, 3), (23, 9), (8, 4), (12, 6), (52, 15), (22, 9), (6, 4), (6, 4), (3, 3),
            (3, 3), (8, 4), (8, 4)
        ]
        self.loss_fn = torch.nn.MSELoss()
        self.score_fn = torch.nn.MSELoss()

        # Layer 1: embeddings. An nn.ModuleList registers each embedding's
        # parameters with the module.
        self.embeddings = nn.ModuleList([
            nn.Embedding(in_size, out_size)
            for in_size, out_size in self.cat_vars_embedding_vector_lengths
        ])

        # Layer 1: dropout.
        self.embedding_dropout = nn.Dropout(0.04)

        # Layer 1: batch normalization (of the 16 continuous variables).
        self.cont_batch_norm = nn.BatchNorm1d(16, eps=1e-05, momentum=0.1)

        # Layers 2 through 9: sequential feedforward model. The input width is
        # 215 = 199 (total embedding width) + 16 (continuous variables).
        self.seq_model = nn.Sequential(
            nn.Linear(in_features=215, out_features=1000, bias=True),
            nn.ReLU(),
            nn.BatchNorm1d(1000, eps=1e-05, momentum=0.1),
            nn.Dropout(p=0.001),
            nn.Linear(in_features=1000, out_features=500, bias=True),
            nn.ReLU(),
            nn.BatchNorm1d(500, eps=1e-05, momentum=0.1),
            nn.Dropout(p=0.01),
            nn.Linear(in_features=500, out_features=1, bias=True)
        )

    def forward(self, x):
        # Layer 1: embeddings. The first len(self.embeddings) columns of x are
        # the integer-encoded categorical variables, one column per embedding.
        embedding_subvectors = []
        for i, emb in enumerate(self.embeddings):
            inp = x[:, i].long()
            embedding_subvectors.append(emb(inp))  # (batch_size, embedding_width)
        out_cat = torch.cat(embedding_subvectors, dim=1)

        # Layer 1: dropout.
        out_cat = self.embedding_dropout(out_cat)

        # Layer 1: batch normalization (of the continuous variables, which
        # occupy the remaining columns of x).
        out_cont = self.cont_batch_norm(x[:, len(self.embeddings):])

        out = torch.cat((out_cat, out_cont), dim=1)

        # Layers 2 through 9: sequential feedforward model.
        out = self.seq_model(out)
        return out

    def fit(self, X, y):
        self.train()
        # TODO: set a random seed to invoke determinism.
        # Cf. GH#11278
        X = torch.tensor(X, dtype=torch.float32)
        y = torch.tensor(y, dtype=torch.float32)

        # OneCycleLR with Adam.
        #
        # Implementation notes. OneCycleLR by default cycles both the learning
        # rate /and/ the momentum value.
        # Cf. https://www.kaggle.com/residentmario/one-cycle-learning-rate-schedulers
        #
        # Optimizers that don't support momentum must use a scheduler with
        # cycle_momentum=False, which disables the momentum-tuning behavior.
        # Adam does not support momentum; it has its own similar-ish thing
        # built in.
        # Cf. https://www.kaggle.com/residentmario/keras-optimizers
        #
        # This code requires PyTorch >= 1.2 due to a bug, see GH#19003.
        optimizer = torch.optim.Adam(self.parameters(), lr=self.max_lr)
        scheduler = torch.optim.lr_scheduler.OneCycleLR(
            optimizer, self.max_lr,
            cycle_momentum=False,
            epochs=self.n_epochs,
            steps_per_epoch=int(np.ceil(len(X) / self.batch_size)),
        )
        batches = torch.utils.data.DataLoader(
            torch.utils.data.TensorDataset(X, y),
            batch_size=self.batch_size, shuffle=True
        )

        for epoch in range(self.n_epochs):
            lvs = []
            for i, (X_batch, y_batch) in enumerate(batches):
                X_batch = X_batch.cuda()
                y_batch = y_batch.cuda()

                # Standard training step: zero the gradients, run the forward
                # pass, compute the loss, backpropagate, then step the
                # optimizer and the learning rate scheduler.
                optimizer.zero_grad()
                y_pred = self(X_batch).squeeze()
                loss = self.loss_fn(y_pred, y_batch)
                loss.backward()
                optimizer.step()
                scheduler.step()

                lv = loss.detach().cpu().numpy()
                lvs.append(lv)
                if i % 100 == 0:
                    print(f"Epoch {epoch + 1}/{self.n_epochs}; Batch {i}; Loss {lv}")
            print(
                f"Epoch {epoch + 1}/{self.n_epochs}; Average Loss {np.mean(lvs)}"
            )

    def predict(self, X):
        self.eval()
        with torch.no_grad():
            y_pred = self(torch.tensor(X, dtype=torch.float32).cuda())
        return y_pred.squeeze()

    def score(self, X, y):
        y_pred = self.predict(X)
        y = torch.tensor(y, dtype=torch.float32).cuda()
        return self.score_fn(y, y_pred)

model = FeedforwardTabularModel()
model.cuda()
model.fit(X_train_sample.values, y_train_sample.values)
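
# Illustrative follow-up (assumes the fit above has completed): score() returns
# the mean squared error as defined by score_fn. Scoring here reuses the same
# sample the model was trained on, so this is a training-set metric rather than
# a true holdout evaluation.
train_mse = model.score(X_train_sample.values, y_train_sample.values)
print(f"Training MSE: {float(train_mse)}")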