Skip to content

Instantly share code, notes, and snippets.

@h5li
Created October 17, 2018 17:39
Show Gist options
  • Save h5li/fa38fcdab4b49bb95bfd9ae0fa89fc74 to your computer and use it in GitHub Desktop.
Save h5li/fa38fcdab4b49bb95bfd9ae0fa89fc74 to your computer and use it in GitHub Desktop.
# coding: utf-8
"""Titanic survival predictor.

Reads the Kaggle Titanic ``train.csv`` / ``test.csv`` files, cleans both
frames the same way, trains a random-forest classifier on the training
set, and writes the test-set predictions to ``final_prediction8.csv``.
"""
import numpy as np
import pandas as pd
import sklearn as sk              # NOTE(review): unused, kept for compatibility
import matplotlib.pyplot as plt   # NOTE(review): unused, kept for compatibility
# Load scikit's random forest classifier library
from sklearn.ensemble import RandomForestClassifier


def _clean(df):
    """Return a cleaned copy of a raw Titanic frame.

    Drops the free-text columns, one-hot encodes ``Sex`` into boolean
    ``m``/``f`` columns, and imputes missing ``Age``/``Fare`` values with
    the column mean.
    """
    df = df.drop(['Name', 'Ticket', 'Cabin', 'Embarked'], axis=1)
    # One-hot encode gender.
    df['m'] = (df['Sex'] == 'male')
    df['f'] = (df['Sex'] == 'female')
    df = df.drop(['Sex'], axis=1)
    # BUG FIX: the original called df.fillna(mean_age) on the whole frame,
    # which replaced NaNs in *every* column (e.g. Fare) with the mean age.
    # Impute each numeric column with its own mean instead.
    df['Age'] = df['Age'].fillna(df['Age'].mean())
    df['Fare'] = df['Fare'].fillna(df['Fare'].mean())
    return df


def main():
    """Train on train.csv, predict test.csv, write the submission CSV."""
    # BUG FIX: the original read 'train.csv' twice, so the model was
    # "evaluated" on its own training data. The held-out set is test.csv.
    # Also pass paths directly instead of leaking open file handles.
    train_df = pd.read_csv('train.csv')
    test_df = pd.read_csv('test.csv')

    train_df = _clean(train_df)
    test_df = _clean(test_df)

    # PassengerId carries no signal; drop it from the training features
    # (kept in test_df so it can be written into the submission file).
    train_df = train_df.drop(['PassengerId'], axis=1)

    # Labels are the Survived column; features are everything after it.
    y = train_df['Survived']
    x = train_df.iloc[:, 1:]

    clf = RandomForestClassifier(n_jobs=2, random_state=0)
    clf.fit(x, y)

    # BUG FIX: test.csv has no Survived column, so only PassengerId must
    # be skipped (the original used iloc[:, 2:], which only lined up
    # because the buggy test frame was a copy of the training data).
    survive = clf.predict(test_df.iloc[:, 1:])

    # Pair each PassengerId with its prediction and write the submission.
    final_prediction = pd.DataFrame(
        {'PassengerId': test_df['PassengerId'], 'Survived': survive})
    final_prediction.to_csv('final_prediction8.csv', index=False)


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment