"""An artificial neural network for the Titanic challenge on Kaggle."""
import torch
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
np.random.seed(10000)
torch.manual_seed(10000)  # numpy's seed does not cover torch; seed it too so weight init is reproducible
# Load data
train = pd.read_csv("train.csv", index_col='PassengerId')
test = pd.read_csv("test.csv", index_col='PassengerId')
# Keep train and test in one list so the wrangling steps below apply to both
train_test_datasets = [train, test]
"""Data wrangling"""
# Split cabin into letter and number
median_age = train["Age"].median()
median_fare = train["Fare"].median()
for idx, dataset in enumerate(train_test_datasets):
dataset["Age"].fillna(median_age, inplace=True)
dataset["Fare"].fillna(median_fare, inplace=True)
dataset["Cabin Letter"] = dataset["Cabin"].str.slice(0, 1)
dataset.drop("Cabin", axis=1, inplace=True)
#dataset["Embarked"] = dataset["Embarked"].cat.codes
dataset.drop(["Name", "Ticket"], axis=1, inplace=True)
categorical_cols = ["Pclass", "Sex", "Embarked", "Cabin Letter", "SibSp"]
train_dummies = pd.get_dummies(train,
                               columns=categorical_cols,
                               prefix=categorical_cols,
                               dummy_na=True)
test_dummies = pd.get_dummies(test,
                              columns=categorical_cols,
                              prefix=categorical_cols,
                              dummy_na=True)
test_dummies["Cabin Letter_T"] = np.zeros(test_dummies.shape[0])
"""BALANCING DATA"""
number_surviving = (train_dummies['Survived'] == 1).sum() # Number of survivors
bool_survivors = train_dummies['Survived'] == 1
bool_nonsurvivors = train_dummies['Survived'] == 0
all_survivors = train_dummies[bool_survivors]
all_nonsurvivors = train_dummies[bool_nonsurvivors]
# Undersample non-survivors to match the survivor count, then shuffle the rows
random_nonsurvivors = all_nonsurvivors.sample(number_surviving)
train_balanced = pd.concat((all_survivors, random_nonsurvivors))
train_balanced = train_balanced.sample(frac=1)
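# Sanity check: after undersampling, both classes should appear equally often
print(train_balanced["Survived"].value_counts())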
"""STANDARDIZATION"""
# Fit on the full training frame (excluding the 'Survived' target in column 0)
# so the scaler statistics are not affected by the undersampling above
scaler = StandardScaler()
scaler.fit(train_dummies.iloc[:, 1:])
train_scaled = scaler.transform(train_balanced.iloc[:, 1:])
test_scaled = scaler.transform(test_dummies)
train_scaled = pd.DataFrame(train_scaled,
                            index=train_balanced.index,
                            columns=train_balanced.iloc[:, 1:].columns)
test_scaled = pd.DataFrame(test_scaled,
                           index=test_dummies.index,
                           columns=test_dummies.columns)
y = train_balanced["Survived"]
"""VALIDATION DATA SPLIT"""
# Split the balanced training data into training and validation sets
X = train_scaled
X_train, X_validate, y_train, y_validate = train_test_split(X, y,
                                                            test_size=0.1)
"""Setup the network"""
train_features = torch.tensor(X_train.to_numpy())
train_labels = torch.tensor(y_train.to_numpy())
validation_features = torch.tensor(X_validate.to_numpy())
validation_labels = torch.tensor(y_validate.to_numpy())
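# Note: to_numpy() yields float64 features and int64 labels, hence the
# .float() casts before every forward pass below; creating the tensors with
# dtype=torch.float32 here would make those casts unnecessary.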
"""Mini Batches"""
n_batches = 41
train_features_batched = train_features.reshape(41,
int(train_features.shape[0]/n_batches),
train_features.shape[1])
train_labels_batched = train_labels.reshape(n_batches,
int(train_labels.shape[0]/n_batches))
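# A more flexible, idiomatic alternative is torch.utils.data.DataLoader,
# which reshuffles every epoch and tolerates a ragged final batch. A minimal
# sketch (constructed here for illustration; the training loop below keeps
# the reshape-based batches):
from torch.utils.data import TensorDataset, DataLoader
train_loader = DataLoader(TensorDataset(train_features.float(), train_labels.float()),
                          batch_size=15, shuffle=True)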
n_features = train_features.shape[1]
model = torch.nn.Sequential(torch.nn.Linear(n_features, 50),
                            torch.nn.ReLU(),
                            torch.nn.Linear(50, 1),
                            torch.nn.Sigmoid())
model = model.float()
criterion = torch.nn.BCELoss()
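# Note: Sigmoid + BCELoss works, but the numerically safer PyTorch idiom is
# torch.nn.BCEWithLogitsLoss on raw logits (i.e. without the final Sigmoid).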
#optimizer = torch.optim.Adam(model.parameters(), lr=0.00001, weight_decay=0.001)
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001, weight_decay=0.001)
n_epochs = 2000
loss_list = []
validate_loss_list = []
for epoch in range(n_epochs):  # loop over the dataset multiple times
    for batch_idx in range(n_batches):
        optimizer.zero_grad()
        outputs = model(train_features_batched[batch_idx].float())
        loss = criterion(outputs.flatten().float(),
                         train_labels_batched[batch_idx].float())
        loss.backward()
        optimizer.step()
    # Track the full training and validation loss once per epoch; no_grad
    # avoids building a computation graph for these evaluation-only passes
    with torch.no_grad():
        outputs = model(train_features.float())
        validation_outputs = model(validation_features.float())
        loss = criterion(outputs.flatten().float(),
                         train_labels.float())
        validate_loss = criterion(validation_outputs.flatten().float(),
                                  validation_labels.float())
    loss_list.append(loss.item())
    validate_loss_list.append(validate_loss.item())  # .item() stores a float, not a live tensor
print('Finished Training')
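# Report the final validation accuracy, using the same 0.5 threshold that is
# applied to the test predictions for the submission below
with torch.no_grad():
    validation_predictions = (model(validation_features.float()).flatten() > 0.5).float()
validation_accuracy = (validation_predictions == validation_labels.float()).float().mean().item()
print(f"Validation accuracy: {validation_accuracy:.3f}")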
plt.rcParams['svg.fonttype'] = 'none'
sns.set(context='paper',
        style='whitegrid',
        palette='colorblind',
        font='Arial',
        font_scale=2,
        color_codes=True)
plt.plot(loss_list, linewidth=3)
plt.plot(validate_loss_list, linewidth=3)
plt.legend(("Training Loss", "Validation Loss"))
plt.xlabel("Epoch")
plt.ylabel("BCE Loss")
test_features = torch.tensor(test_scaled.to_numpy())
test_prediction = model(test_features.float()).detach().numpy().flatten()
test_prediction_binary = (test_prediction > 0.5).astype(int)  # np.int was removed in NumPy 1.24
test_prediction_df = pd.DataFrame(test_prediction_binary,
                                  index=test.index,
                                  columns=["Survived"])
test_prediction_df.to_csv("prediction_submission_trained.csv")