An artificial neural network for the Titanic challenge on Kaggle
import torch | |
import pandas as pd | |
from sklearn.preprocessing import StandardScaler | |
from sklearn.model_selection import train_test_split | |
import numpy as np | |
import matplotlib.pyplot as plt | |
import seaborn as sns | |
np.random.seed(10000) | |
# Load data | |
train = pd.read_csv("train.csv", index_col='PassengerId') | |
test = pd.read_csv("test.csv", index_col='PassengerId') | |
# Merge train and test for wrangling and preprocessing | |
train_test_datasets = [train, test] | |
"""Data wrangling""" | |
# Split cabin into letter and number | |
median_age = train["Age"].median() | |
median_fare = train["Fare"].median() | |
for idx, dataset in enumerate(train_test_datasets): | |
dataset["Age"].fillna(median_age, inplace=True) | |
dataset["Fare"].fillna(median_fare, inplace=True) | |
dataset["Cabin Letter"] = dataset["Cabin"].str.slice(0, 1) | |
dataset.drop("Cabin", axis=1, inplace=True) | |
#dataset["Embarked"] = dataset["Embarked"].cat.codes | |
dataset.drop(["Name", "Ticket"], axis=1, inplace=True) | |
categorical_cols = ["Pclass", "Sex", "Embarked", "Cabin Letter","SibSp"] | |
train_dummies = pd.get_dummies(train, | |
columns=categorical_cols, | |
prefix=categorical_cols, | |
dummy_na=True) | |
test_dummies = pd.get_dummies(test, | |
columns=categorical_cols, | |
prefix=categorical_cols, | |
dummy_na=True) | |
test_dummies["Cabin Letter_T"] = np.zeros(test_dummies.shape[0]) | |
"""BALANCING DATA""" | |
number_surviving = (train_dummies['Survived'] == 1).sum() # Number of survivors | |
bool_survivors = train_dummies['Survived'] == 1 | |
bool_nonsurvivors = train_dummies['Survived'] == 0 | |
all_survivors = train_dummies[bool_survivors] | |
all_nonsurvivors = train_dummies[bool_nonsurvivors] | |
random_nonsurvivors = all_nonsurvivors.sample(number_surviving) | |
train_balanced = pd.concat((all_survivors, random_nonsurvivors)) | |
train_balanced = train_balanced.sample(frac=1) | |
scaler = StandardScaler() | |
scaler.fit(train_dummies.iloc[:,1:]) | |
"""STANDARDIZATION""" | |
train_scaled = scaler.transform(train_balanced.iloc[:,1:]) | |
test_scaled = scaler.transform(test_dummies) | |
train_scaled = pd.DataFrame(train_scaled, | |
index=train_balanced.index, | |
columns=train_balanced.iloc[:,1:].columns) | |
test_scaled = pd.DataFrame(test_scaled, | |
index=test_dummies.index, | |
columns=test_dummies.columns) | |
y = train_balanced["Survived"] | |
"""VALIDATION DATA SPLIT""" | |
# Do a train, validate dataset split | |
X = train_scaled | |
X_train, X_validate, y_train, y_validate = train_test_split(X, y, | |
test_size=0.1) | |
"""Setup the network""" | |
train_features = torch.tensor(X_train.to_numpy()) | |
train_labels = torch.tensor(y_train.to_numpy()) | |
validation_features = torch.tensor(X_validate.to_numpy()) | |
validation_labels = torch.tensor(y_validate.to_numpy()) | |
"""Mini Batches""" | |
n_batches = 41 | |
train_features_batched = train_features.reshape(41, | |
int(train_features.shape[0]/n_batches), | |
train_features.shape[1]) | |
train_labels_batched = train_labels.reshape(n_batches, | |
int(train_labels.shape[0]/n_batches)) | |
n_features = train_features.shape[1] | |
model = torch.nn.Sequential(torch.nn.Linear(n_features, 50), | |
torch.nn.ReLU(), | |
torch.nn.Linear(50, 1), | |
torch.nn.Sigmoid()) | |
model = model.float() | |
criterion = torch.nn.BCELoss() | |
#optimizer = torch.optim.Adam(model.parameters(), lr=0.00001, weight_decay=0.001) | |
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001, weight_decay=0.001) | |
n_epochs = 2000 | |
loss_list = [] | |
validate_loss_list = [] | |
for epoch in range(n_epochs): # loop over the dataset multiple times | |
for batch_idx in range(n_batches): | |
optimizer.zero_grad() | |
outputs = model(train_features_batched[batch_idx].float()) | |
loss = criterion(outputs.flatten().float(), | |
train_labels_batched[batch_idx].float()) | |
loss.backward() | |
optimizer.step() | |
outputs = model(train_features.float()) | |
validation_outputs = model(validation_features.float()) | |
loss = criterion(outputs.flatten().float(), | |
train_labels.float()) | |
validate_loss = criterion(validation_outputs.flatten().float(), | |
validation_labels.float()) | |
loss_list.append(loss.item()) | |
validate_loss_list.append(validate_loss) | |
print('Finished Training') | |
plt.rcParams['svg.fonttype'] = 'none' | |
sns.set(context='paper', | |
style='whitegrid', | |
palette='colorblind', | |
font='Arial', | |
font_scale=2, | |
color_codes=True) | |
plt.plot(loss_list, linewidth=3) | |
plt.plot(validate_loss_list, linewidth=3) | |
plt.legend(("Training Loss", "Validation Loss")) | |
plt.xlabel("Epoch") | |
plt.ylabel("BCE Loss") | |
test_features = torch.tensor(test_scaled.to_numpy()) | |
test_prediction = model(test_features.float()).detach().numpy().flatten() | |
test_prediction_binary = (test_prediction > 0.5).astype(np.int) | |
test_prediction_df = pd.DataFrame(test_prediction_binary, | |
index=test.index, | |
columns=["Survived"]) | |
test_prediction_df.to_csv("prediction_submission_trained.csv") |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment