Skip to content

Instantly share code, notes, and snippets.

@h5li
Created October 17, 2018 17:39
Show Gist options
  • Save h5li/fa38fcdab4b49bb95bfd9ae0fa89fc74 to your computer and use it in GitHub Desktop.
Save h5li/fa38fcdab4b49bb95bfd9ae0fa89fc74 to your computer and use it in GitHub Desktop.
# coding: utf-8
"""Titanic survival predictor.

Reads the Kaggle Titanic ``train.csv`` / ``test.csv`` files, cleans both
frames the same way, trains a random-forest classifier on the training
set, and writes the test-set predictions to ``final_prediction8.csv``.
"""
import numpy as np
import pandas as pd
import sklearn as sk              # NOTE(review): unused, kept for compatibility
import matplotlib.pyplot as plt   # NOTE(review): unused, kept for compatibility
# Load scikit's random forest classifier library
from sklearn.ensemble import RandomForestClassifier


def _clean(df):
    """Return a cleaned copy of a raw Titanic frame.

    Drops the free-text columns, one-hot encodes ``Sex`` into boolean
    ``m``/``f`` columns, and imputes missing ``Age``/``Fare`` values with
    the column mean.
    """
    df = df.drop(['Name', 'Ticket', 'Cabin', 'Embarked'], axis=1)
    # One-hot encode gender.
    df['m'] = (df['Sex'] == 'male')
    df['f'] = (df['Sex'] == 'female')
    df = df.drop(['Sex'], axis=1)
    # BUG FIX: the original called df.fillna(mean_age) on the whole frame,
    # which replaced NaNs in *every* column (e.g. Fare) with the mean age.
    # Impute each numeric column with its own mean instead.
    df['Age'] = df['Age'].fillna(df['Age'].mean())
    df['Fare'] = df['Fare'].fillna(df['Fare'].mean())
    return df


def main():
    """Train on train.csv, predict test.csv, write the submission CSV."""
    # BUG FIX: the original read 'train.csv' twice, so the model was
    # "evaluated" on its own training data. The held-out set is test.csv.
    # Also pass paths directly instead of leaking open file handles.
    train_df = pd.read_csv('train.csv')
    test_df = pd.read_csv('test.csv')

    train_df = _clean(train_df)
    test_df = _clean(test_df)

    # PassengerId carries no signal; drop it from the training features
    # (kept in test_df so it can be written into the submission file).
    train_df = train_df.drop(['PassengerId'], axis=1)

    # Labels are the Survived column; features are everything after it.
    y = train_df['Survived']
    x = train_df.iloc[:, 1:]

    clf = RandomForestClassifier(n_jobs=2, random_state=0)
    clf.fit(x, y)

    # BUG FIX: test.csv has no Survived column, so only PassengerId must
    # be skipped (the original used iloc[:, 2:], which only lined up
    # because the buggy test frame was a copy of the training data).
    survive = clf.predict(test_df.iloc[:, 1:])

    # Pair each PassengerId with its prediction and write the submission.
    final_prediction = pd.DataFrame(
        {'PassengerId': test_df['PassengerId'], 'Survived': survive})
    final_prediction.to_csv('final_prediction8.csv', index=False)


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment