Skip to content

Instantly share code, notes, and snippets.

@koljamaier
Last active October 28, 2016 06:56
Show Gist options
  • Save koljamaier/b2301cdcf436f27527676df5c6c64c6a to your computer and use it in GitHub Desktop.
Save koljamaier/b2301cdcf436f27527676df5c6c64c6a to your computer and use it in GitHub Desktop.
Naive Bayes in the Kaggle Titanic Competition
from __future__ import division
import numpy as np
from matplotlib import pyplot as plt
from scipy.stats import norm
import pandas as pd
import csv as csv
import seaborn as sns
from numpy import matrix, mat
import re
# One figure with three panels; the panels are filled in further down the script.
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(15, 5))

# Load the training set; the first CSV row holds the column names.
df = pd.read_csv('train.csv', header=0)

# Missing ports of embarkation are filled with "S".
df["Embarked"] = df["Embarked"].fillna("S")

# Integer codes for the categorical columns used by the model:
# NumEmbarked: S -> 0, C -> 1, Q -> 2; Gender: female -> 0, male -> 1.
df["NumEmbarked"] = df["Embarked"].map({"S": 0, "C": 1, "Q": 2}).astype(int)
df['Gender'] = df['Sex'].map({'female': 0, 'male': 1}).astype(int)

# Flag the rows whose Age is missing. Testing for null directly avoids the
# sentinel trick (fill with 1, compare with 1), which also flagged passengers
# whose recorded age really is 1.
df["AgeNaN"] = df["Age"].isnull()
# Age imputation table: median_agedds[gender][pclass - 1, port] holds the MEAN
# (despite the variable name) of the known ages in that cell of the training
# data. Cells without any known age end up as NaN.
median_agedds = [mat(np.zeros((3, 3))), mat(np.zeros((3, 3)))]

# AgeFill starts as a copy of Age; only the gaps are filled from the table.
df['AgeFill'] = df['Age']
age_missing = df["Age"].isnull()

for gender in (0, 1):
    for class_idx in (0, 1, 2):
        for port in (0, 1, 2):
            in_cell = (
                (df['Gender'] == gender)
                & (df['Pclass'] == class_idx + 1)
                & (df["NumEmbarked"] == port)
            )
            cell_mean = df.loc[in_cell, 'Age'].dropna().mean()
            median_agedds[gender][class_idx, port] = cell_mean
            # New values for AgeFill are being filled in
            df.loc[age_missing & in_cell, 'AgeFill'] = cell_mean

# Assume that passengers with Parch==2 are most likely children (decrease age).
# Note: this overwrites AgeFill for every such passenger, known age or not.
has_two_parch = df["Parch"] == 2
df.loc[has_two_parch, "AgeFill"] = df.loc[has_two_parch, "Age"].dropna().mean()

# Assume that passengers with Parch==1 & SibSp==1 are most likely married
# parents (increase age). The same overwrite-all behaviour applies here.
likely_parent = (df["Parch"] == 1) & (df["SibSp"] == 1)
df.loc[likely_parent, "AgeFill"] = df.loc[likely_parent, "Age"].dropna().mean()
# Extract the honorific from "Surname, Title. Given names": the regex removes
# everything up to ", " and everything from the first "." onwards.
df["Title"] = [re.sub('(.*, )|(\\..*.)', "", full_name) for full_name in df["Name"]]

# Fold spelling variants into their common form.
for variant, canonical in (("Mme", "Mrs"), ("Ms", "Miss"), ("Mlle", "Miss")):
    df.loc[df["Title"] == variant, "Title"] = canonical

# Infrequent honorifics share a single bucket.
rare_title = ['Dona', 'Lady', 'the Countess','Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer']
is_rare = df["Title"].isin(rare_title)
df.loc[is_rare, "Title"] = "Rare Title"

# Bar chart of the titles among survivors (drawn while titles are still text).
fig, axel = plt.subplots(1, 1, figsize=(15,5))
axel.set_title("Appearance of titles of survivors")
survivors_only = df[df["Survived"]==1]
sns.countplot(x="Title", data=survivors_only)

# Replace the text labels by the integer codes the classifier indexes with.
title_codes = {"Mrs": 0, "Miss": 1, "Mr": 2, "Master":3, "Rare Title":4}
df["Title"] = df["Title"].map(title_codes).astype(int)
# The following commented-out block can be used to investigate the age of
# passengers with Parch==2 (in most cases that means it is a child, because
# it has both mother and father on board).
#bins = [0, 25, 100]
#group_names = ['Young', "Old"]
#df["AgeCat"] = pd.cut(df.Age.dropna(), bins, labels=group_names)
#possibleChildrens = df[(df["Parch"]==2)][["Age","SibSp", "AgeCat"]]
#sns.countplot(x="SibSp", hue="AgeCat", data=bsps, ax=axis3)
# Count plot of the recorded ages of female passengers (Gender == 0) who
# embarked at "S".
female_from_s = (df["Gender"] == 0) & (df["Embarked"] == "S")
embSfemale = df[female_from_s]
fig1, axis1 = plt.subplots(1, 1, figsize=(15, 5))
sns.countplot(x="Age", data=embSfemale, ax=axis1)
# Prior P(Survived = 1): the share of survivors among the training passengers.
# The denominator is read from the data instead of being hard-coded, so the
# prior stays correct when the training frame is filtered or replaced.
p1 = float((df["Survived"] == 1).sum()) / len(df)
# Log-probabilities of the categorical features given the outcome.
def _log_conditional(column, levels, outcome):
    """Return log P(column == level | Survived == outcome) for each level.

    Counts the training rows per level within the given outcome class,
    normalises the counts to relative frequencies and takes the natural log.
    A level that never occurs in the class yields -inf (no smoothing).
    """
    in_class = df["Survived"] == outcome
    counts = np.array(
        [float(((df[column] == level) & in_class).sum()) for level in levels]
    )
    return np.log(counts / counts.sum())


# P("Pclass" | Survived = 1) and P("Pclass" | Survived = 0); entry i is class i+1
p1Pclass = _log_conditional("Pclass", range(1, 4), 1)
p0Pclass = _log_conditional("Pclass", range(1, 4), 0)

# P("Title" | Survived = 1) and P("Title" | Survived = 0); entry i is title code i
p1Title = _log_conditional("Title", range(0, 5), 1)
p0Title = _log_conditional("Title", range(0, 5), 0)
# Gaussian models of age given the outcome, parameterised by the sample mean
# and sample standard deviation of the respective group.
filled_age_survived = df.loc[df["Survived"] == 1, "AgeFill"]
filled_age_died = df.loc[df["Survived"] == 0, "AgeFill"]
raw_age_survived = df.loc[df["Survived"] == 1, "Age"]

# P("Age" | Survived = 1), based on the imputed ages
p1Age = norm(loc=filled_age_survived.mean(), scale=filled_age_survived.std())
# P("Age" | Survived = 0), based on the imputed ages
p0Age = norm(loc=filled_age_died.mean(), scale=filled_age_died.std())
# Same as p1Age but fitted on the original (non-imputed) ages, for comparison
pAge1 = norm(loc=raw_age_survived.mean(), scale=raw_age_survived.std())

# Draw each density between its 1% and 99% quantiles on the first panel.
ax1.set_title("Chance to survive conditioned on Age")
for density in (p1Age, p0Age, pAge1):
    x1 = np.linspace(density.ppf(0.01), density.ppf(0.99), 100)
    ax1.plot(x1, density.pdf(x1))

# Side-by-side histograms of the ages before and after imputation.
fig1, (axis1, axis2) = plt.subplots(1, 2, figsize=(15, 5))
for panel, column, caption in (
    (axis1, "Age", "Original Age"),
    (axis2, "AgeFill", "Filled in Age"),
):
    panel.set_title(caption)
    df[column].hist(bins=70, ax=panel)
# Gaussian models of the fare given the outcome (sample mean / sample std).
fare_survived = df.loc[df["Survived"] == 1, "Fare"]
fare_died = df.loc[df["Survived"] == 0, "Fare"]

# P("Fare" | Survived = 1)
p1Fare = norm(loc=fare_survived.mean(), scale=fare_survived.std())
# P("Fare" | Survived = 0)
p0Fare = norm(loc=fare_died.mean(), scale=fare_died.std())

# Both densities go on a new figure; note that this rebinds the name ax1.
fig2, ax1 = plt.subplots(1, 1, figsize=(15, 5))
ax1.set_title("Chance to survive conditioned on paid Fare")
for density, line_style in ((p1Fare, {}), (p0Fare, {"color": "r"})):
    x1 = np.linspace(density.ppf(0.01), density.ppf(0.99), 100)
    ax1.plot(x1, density.pdf(x1), **line_style)
# Log-probabilities of gender given the outcome; index 0 = female, 1 = male.
totalSurvived = df.loc[df["Survived"] == 1, "Survived"].count()
totalDead = df.loc[df["Survived"] == 0, "Survived"].count()

# P("Gender" | Survived = 1)
p1Gender = np.log(np.array([
    df.loc[(df.Gender == gender) & (df.Survived == 1), "Survived"].count() / totalSurvived
    for gender in (0, 1)
]))

# P("Gender" | Survived = 0)
p0Gender = np.log(np.array([
    df.loc[(df.Gender == gender) & (df.Survived == 0), "Survived"].count() / totalDead
    for gender in (0, 1)
]))
sns.set_style("whitegrid")

# Second panel: number of survivors per gender code (0 = female, 1 = male).
sns.countplot(x="Gender", data=df[df["Survived"] == 1], ax=ax2)

# Third panel: ages of the female survivors.
# Each comparison needs its own parentheses: `&` binds tighter than `==`, so
# the unparenthesised form evaluated ((Survived == 1) & Gender) == 0 and
# selected everyone except the male survivors.
ax3.set_title("Age of female survivors")
female_survivors = df[(df["Survived"] == 1) & (df["Gender"] == 0)]
sns.countplot(x="Age", data=female_survivors, ax=ax3)
# Classify Test Data
def classify(vec2Classify):
    """Classify a new passenger with the naive Bayes model.

    Parameters:
        vec2Classify - [Pclass, Age, Gender, Fare, Title], where Pclass is
            1..3, Gender is 0 (female) or 1 (male) and Title is the integer
            title code 0..4. Age and Fare are plain numbers.

    Returns:
        1 if the log-posterior of "survived" is strictly greater than that of
        "did not survive", otherwise 0.
    """
    # The categorical features are used as array indices. They may arrive as
    # floats (e.g. rows taken from DataFrame.values), which NumPy refuses to
    # index with, so convert them to int first.
    pclass_idx = int(vec2Classify[0]) - 1
    gender_idx = int(vec2Classify[2])
    title_idx = int(vec2Classify[4])
    age = vec2Classify[1]
    fare = vec2Classify[3]

    # Summing the log-likelihoods equals multiplying the per-feature
    # likelihoods; the log-prior of the class is added at the end.
    p1c = (
        p1Pclass[pclass_idx]
        + p1Age.logpdf(age)
        + p1Gender[gender_idx]
        + p1Fare.logpdf(fare)
        + p1Title[title_idx]
        + np.log(p1)
    )
    p0c = (
        p0Pclass[pclass_idx]
        + p0Age.logpdf(age)
        + p0Gender[gender_idx]
        + p0Fare.logpdf(fare)
        + p0Title[title_idx]
        + np.log(1.0 - p1)
    )
    return 1 if p1c > p0c else 0
# Load the test set; the first CSV row holds the column names.
df_test = pd.read_csv('test.csv', header=0)

# Fill missing Embarked values with "S".
df_test["Embarked"] = df_test["Embarked"].fillna("S")

# Fill missing fares with the median fare of the test set. Assigning the
# result back (instead of inplace=True on the selected column) guarantees the
# frame itself is updated; the chained in-place form may act on a temporary.
df_test["Fare"] = df_test["Fare"].fillna(df_test["Fare"].median())

# AgeFill starts as a copy of Age; missing ages are filled with estimates later.
df_test['AgeFill'] = df_test['Age']

# Integer codes: NumEmbarked S -> 0, C -> 1, Q -> 2; Gender female -> 0, male -> 1.
df_test["NumEmbarked"] = df_test["Embarked"].map({"S": 0, "C": 1, "Q": 2}).astype(int)
df_test['Gender'] = df_test['Sex'].map({'female': 0, 'male': 1}).astype(int)
# Fill missing test ages from the per-(gender, class, port) table that was
# computed on the training data.
test_age_missing = df_test["Age"].isnull()
for gender in (0, 1):
    for class_idx in (0, 1, 2):
        for port in (0, 1, 2):
            needs_fill = (
                test_age_missing
                & (df_test["Gender"] == gender)
                & (df_test["Pclass"] == class_idx + 1)
                & (df_test["NumEmbarked"] == port)
            )
            # New values for AgeFill are being filled in
            df_test.loc[needs_fill, 'AgeFill'] = median_agedds[gender][class_idx, port]

# Same Parch / SibSp overrides as for the training data; the replacement ages
# are means taken from the training frame and apply to every matching row.
train_two_parch = df["Parch"] == 2
df_test.loc[df_test["Parch"] == 2, "AgeFill"] = df.loc[train_two_parch, "Age"].dropna().mean()
train_likely_parent = (df["Parch"] == 1) & (df["SibSp"] == 1)
test_likely_parent = (df_test["Parch"] == 1) & (df_test["SibSp"] == 1)
df_test.loc[test_likely_parent, "AgeFill"] = df.loc[train_likely_parent, "Age"].dropna().mean()

# Extract the honorific from the name and normalise it like the training data.
df_test["Title"] = [re.sub('(.*, )|(\\..*.)', "", full_name) for full_name in df_test["Name"]]
for variant, canonical in (("Mme", "Mrs"), ("Ms", "Miss"), ("Mlle", "Miss")):
    df_test.loc[df_test["Title"] == variant, "Title"] = canonical
rare_title = ['Dona', 'Lady', 'the Countess','Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer']
df_test.loc[df_test["Title"].isin(rare_title), "Title"] = "Rare Title"

# Integer title codes, identical to the training encoding.
title_codes = {"Mrs": 0, "Miss": 1, "Mr": 2, "Master":3, "Rare Title":4}
df_test["Title"] = df_test["Title"].map(title_codes).astype(int)
# Keep only the id and the model features. The explicit copy makes df_test an
# independent frame, so adding a column below does not write into a slice.
df_test = df_test[["PassengerId", "Pclass", "AgeFill", "Gender", "Fare", "Title"]].copy()

# Classify every passenger; features are passed in the order classify()
# expects: [Pclass, Age, Gender, Fare, Title]. Assigning the whole column at
# once avoids the removed DataFrame.set_value and the assumption that
# PassengerId - 892 equals the row position.
feature_columns = ["Pclass", "AgeFill", "Gender", "Fare", "Title"]
df_test["Survived"] = [
    classify(list(features))
    for features in zip(*(df_test[column] for column in feature_columns))
]

# Write the submission file with the header "PassengerId,Survived" and one
# "<id>,<0|1>" row per passenger. to_csv opens and closes the file itself and
# works the same on Python 2 and 3 (csv.writer on a "wb" handle fails on 3).
df_test[["PassengerId", "Survived"]].to_csv("naivebayesmodel.csv", index=False)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment