Skip to content

Instantly share code, notes, and snippets.

@auser
Last active April 16, 2020 18:09
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
Star You must be signed in to star a gist
Save auser/13fd8e41380836f2f6452f7bff42b54f to your computer and use it in GitHub Desktop.
import numpy as np
import pandas as pd
# Seed NumPy's global RNG up front; the shuffle inside preprocess()
# (`sample(frac=1)` without a random_state) draws from this generator.
np.random.seed(1)
# The training data comes from the Titanic dataset provided by
# Kaggle (https://www.kaggle.com/c/titanic/data)
df = pd.read_csv('./data/titanic-train.csv')
def preprocess(raw_data):
    """Turn a raw Titanic passenger frame into shuffled model arrays.

    The categorical columns are one-hot encoded, the columns that are not
    used as features are dropped, ``Age`` is standardised with the mean
    and standard deviation of *this* frame, remaining missing values are
    replaced by 0 and the rows are shuffled.

    Args:
        raw_data: DataFrame containing ``Survived``, ``Age``, ``Pclass``,
            ``Embarked``, ``Sex`` and every column listed in
            ``fields_to_drop``. The frame itself is not modified.

    Returns:
        A tuple ``(X, y)``. ``X`` is a float64 array with one row per
        passenger and one column per remaining feature; ``y`` is the
        ``Survived`` column as an array of shape ``(n_rows, 1)``. Both
        share the same shuffled row order.

    Raises:
        KeyError: if a required column is missing from ``raw_data``.
    """
    # Convert categorical columns to indicator columns.
    # NOTE(review): by default get_dummies only encodes object/string/
    # category columns. If Pclass is stored as integers (TODO confirm
    # against the CSV) it passes through unencoded and is then removed
    # by the drop below, so passenger class never reaches the model.
    # Passing ``columns=dummy_fields`` would encode it, but that changes
    # the number of feature columns callers receive, so it is left as-is.
    dummy_fields = ['Pclass', 'Embarked', 'Sex']
    dummies = pd.get_dummies(raw_data[dummy_fields])
    data = pd.concat([raw_data, dummies], axis=1)

    # Drop unused columns, the raw categorical columns, and Sex_male
    # (it carries the same information as Sex_female).
    fields_to_drop = ['PassengerId', 'Ticket', 'Parch',
                      'Name', 'Cabin', 'Fare', 'Pclass',
                      'Embarked', 'Sex', 'Sex_male']
    data = data.drop(fields_to_drop, axis=1)

    # Standardise Age; rows with a missing Age stay NaN here and become 0
    # (i.e. the mean) in the fillna below.
    mean, std = data['Age'].mean(), data['Age'].std()
    data.loc[:, 'Age'] = (data['Age'] - mean) / std
    data = data.fillna(0)

    # Shuffle the rows; without a random_state this uses NumPy's global RNG.
    data = data.sample(frac=1).reset_index(drop=True)

    # pandas >= 2.0 emits bool indicator columns, and `.values` on a frame
    # mixing bool with numeric columns yields an object array. Cast to
    # float64 explicitly so X is a numeric matrix on every pandas version.
    X = data.drop('Survived', axis=1).astype('float64').values
    y = data[['Survived']].values
    return X, y
# Hold out a fixed 80% random sample for training (seeded via random_state);
# every row not selected becomes the test set.
train = df.sample(frac=0.8, random_state=200)
test = df.drop(index=train.index)

# Preprocess the training split first, then the test split.
# NOTE(review): preprocess() derives the Age mean/std and the indicator
# columns from whichever frame it receives, so the two splits are scaled
# and encoded independently of each other.
(X_train, y_train), (X_test, y_test) = (
    preprocess(split) for split in (train, test)
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment