Created
October 17, 2018 17:39
-
-
Save h5li/fa38fcdab4b49bb95bfd9ae0fa89fc74 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# coding: utf-8
# # Titanic Predictor
#
# This code takes CSV data from the Kaggle website, cleans it, and runs it through a random forest classifier.
# In[23]:
import numpy as np
import pandas as pd
import sklearn as sk
import matplotlib.pyplot as plt
# Load scikit's random forest classifier library
from sklearn.ensemble import RandomForestClassifier
# In[24]:
# Columns that are dropped before training: free text or sparse values the
# model does not use.
UNUSED_COLUMNS = ['Name', 'Ticket', 'Cabin', 'Embarked']


def clean_passengers(df):
    """Return a cleaned copy of a Titanic passenger frame.

    The same cleaning is applied to the training and the test data:

    * drop the ``Name``, ``Ticket``, ``Cabin`` and ``Embarked`` columns;
    * one-hot encode ``Sex`` into the boolean columns ``m`` and ``f``;
    * replace missing ``Age`` values with the frame's mean age;
    * replace missing ``Fare`` values with the frame's mean fare.

    Args:
        df: Frame containing at least the columns listed above plus
            ``Sex``, ``Age`` and ``Fare``.

    Returns:
        A new DataFrame; ``df`` itself is left unmodified.
    """
    cleaned = df.drop(columns=UNUSED_COLUMNS)

    # One-hot encoding Gender
    cleaned['m'] = cleaned['Sex'] == 'male'
    cleaned['f'] = cleaned['Sex'] == 'female'
    cleaned = cleaned.drop(columns=['Sex'])

    # Fill each column with its OWN mean. A frame-wide fillna(mean_age)
    # would also overwrite a missing Fare with the mean age.
    cleaned['Age'] = cleaned['Age'].fillna(cleaned['Age'].mean())
    cleaned['Fare'] = cleaned['Fare'].fillna(cleaned['Fare'].mean())
    return cleaned


def main():
    """Train a random forest on train.csv and write predictions for test.csv.

    Reads ``train.csv`` and ``test.csv`` from the working directory and
    writes ``final_prediction8.csv`` with the columns ``PassengerId`` and
    ``Survived``.
    """
    # Read in training and testing csvs. Passing the path lets pandas open
    # and close the file itself.
    train_df = clean_passengers(pd.read_csv('train.csv'))
    test_df = clean_passengers(pd.read_csv('test.csv'))

    # Holds survival data; this is what we want to predict -- the label
    y = train_df['Survived']
    # PassengerId is not needed for training (it is kept in the test frame
    # so the predictions can be labelled).
    x = train_df.drop(columns=['PassengerId', 'Survived'])

    # Initialize a random forest Classifier. By convention, clf means 'Classifier'
    clf = RandomForestClassifier(n_jobs=2, random_state=0)

    # Train the Classifier to take the training features and learn how they
    # relate to the training y (survival)
    clf.fit(x, y)

    # Apply the Classifier we trained to the test data. Selecting by column
    # name guarantees the same features, in the same order, as in training.
    survive = clf.predict(test_df[x.columns])

    # Pair each PassengerId with its prediction
    final_prediction = pd.DataFrame({
        'PassengerId': test_df['PassengerId'],
        'Survived': survive,
    })

    # Output dataframe to CSV
    final_prediction.to_csv('final_prediction8.csv', index=False)


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment