# pdyraga/titanic.r

## titanic.r
# VARIABLE DESCRIPTIONS:
#  survival        Survival
#(0 = No; 1 = Yes)
#pclass          Passenger Class
#(1 = 1st; 2 = 2nd; 3 = 3rd)
#name            Name
#sex             Sex
#age             Age
#sibsp           Number of Siblings/Spouses Aboard
#parch           Number of Parents/Children Aboard
#ticket          Ticket Number
#fare            Passenger Fare
#cabin           Cabin
#embarked        Port of Embarkation
#(C = Cherbourg; Q = Queenstown; S = Southampton)

# Install only the packages that are not available yet, so that re-running the
# script does not reinstall all of them every time.
required_packages <- c("rattle", "RGtk2", "rpart.plot", "RColorBrewer",
                       "randomForest", "party")
# system.file() returns "" for a package that is not installed.
is_installed <- vapply(
  required_packages,
  function(pkg) nzchar(system.file(package = pkg)),
  logical(1)
)
if (!all(is_installed)) {
  install.packages(required_packages[!is_installed])
}

# Set working directory and import datafiles.
# NOTE(review): this path is machine-specific; adjust it when running elsewhere.
setwd("~/git/hackaton-data-science")

library(readr)
train <- read_csv("train.csv")
# View() opens a GUI data viewer, so only call it when someone is there to
# look at it; batch runs (e.g. Rscript) skip it.
if (interactive()) {
  View(train)
}

# Count the passengers per value of Survived (0 = No, 1 = Yes).
table(train$Survived)

# Survival rate per sex and age group. Child is 1 for passengers younger than
# 18 and 0 otherwise (passengers with an unknown age stay 0).
train$Child <- as.numeric(!is.na(train$Age) & train$Age < 18)
aggregate(
  Survived ~ Child + Sex,
  data = train,
  FUN = function(survived) {
    sum(survived) / length(survived)
  }
)
# ^ we can do it naively here - each adult male dies, each adult female survives


# Rather than hunting for patterns by hand (e.g. how gender and age relate to
# the death rate), fit a decision tree. The algorithm is greedy: it scans all
# variables and splits on the one that yields the purest child nodes.
library(rattle)
library(rpart.plot)
library(RColorBrewer)

fit <- rpart(
  Survived ~ Pclass + Sex + Age + SibSp + Parch + Fare + Embarked,
  data = train,
  method = "class"
)

fancyRpartPlot(fit)

# Share of training rows whose predicted class equals the recorded outcome.
train$Prediction <- predict(fit, newdata = train, type = "class")
with(train, mean(Prediction == Survived)) # [1] 0.8395062

# ----------

# Feature engineering: use data we did not look at before - title (Mr, Ms,
# Mrs etc.), family size and family ID (maybe one family was more likely to
# survive than another).

# Stack the test rows under the training rows so the engineered features are
# built on the larger, combined data set. The test rows get the extra columns
# that train already has (Survived, Prediction, Child) so rbind() can match
# them by name.
test <- read_csv("test.csv")
test$Survived <- NA
test$Prediction <- NA
test$Child <- 0
test$Child[test$Age < 18] <- 1

# View() opens a GUI data viewer, so skip it in batch runs (e.g. Rscript).
if (interactive()) {
  View(test)
}
combi <- rbind(train, test)

# Extract titles...
# Each name is split on ',' and '.'; the second piece is taken as the title.
# vapply() guarantees a plain character vector (one value per passenger).
combi$Name <- as.character(combi$Name)
name_parts <- strsplit(combi$Name, split = "[,.]")
combi$Title <- vapply(name_parts, function(parts) parts[2], character(1))
# Drop the first space (the one left in front of the title by the split).
combi$Title <- sub(" ", "", combi$Title)
# Merge rare titles into a few broader groups.
combi$Title[combi$Title %in% c("Mme", "Mlle")] <- "Mlle"
combi$Title[combi$Title %in% c("Capt", "Don", "Major", "Sir")] <- "Sir"
combi$Title[combi$Title %in% c("Dona", "Lady", "the Countess", "Jonkheer")] <- "Lady"
combi$Title <- factor(combi$Title)

table(combi$Title) # YES, YES, YES!

# Extract family size: siblings/spouses + parents/children + the passenger.
combi$FamilySize <- combi$SibSp + combi$Parch + 1

# Extract family ID (we'll use family size + surname composite key).
# The surname is the first piece of the name when split on ',' and '.'.
surname_parts <- strsplit(combi$Name, split = "[,.]")
combi$Surname <- vapply(surname_parts, function(parts) parts[1], character(1))
combi$FamilyID <- paste0(combi$FamilySize, combi$Surname)
combi$FamilyID[combi$FamilySize <= 2] <- "Small"
# IDs that occur at most twice are folded into 'Small' as well.
famIDs <- data.frame(table(combi$FamilyID))
famIDs <- famIDs[famIDs$Freq <= 2, ]
combi$FamilyID[combi$FamilyID %in% famIDs$Var1] <- "Small"
combi$FamilyID <- factor(combi$FamilyID)

# Split back into train and test. combi was built as rbind(train, test), so the
# first nrow(train) rows are the training rows; deriving the boundary from the
# data avoids hard-coded row numbers.
n_train <- nrow(train)
is_train_row <- seq_len(nrow(combi)) <= n_train
train <- combi[is_train_row, ]
test <- combi[!is_train_row, ]

# Same classification tree as before, now with the engineered features
# (Title, FamilySize, FamilyID) added to the predictors.
fit <- rpart(
  Survived ~ Pclass + Sex + Age + SibSp + Parch + Fare + Embarked +
    Title + FamilySize + FamilyID,
  data = train,
  method = "class"
)

# Accuracy measured on the training rows themselves.
train$Prediction <- predict(fit, newdata = train, type = "class")
with(train, mean(Prediction == Survived)) # [1] 0.8552189

# NICE, WE ARE GETTING BETTER AND BETTER!

# Predict the test rows and write them out as PassengerId/Survived pairs.
Prediction <- predict(fit, newdata = test, type = "class")
submit <- data.frame(PassengerId = test$PassengerId, Survived = Prediction)
write.csv(submit, file = "feature_engineering.csv", row.names = FALSE) # 0.79426

# Mean looks nice, but still, tree can be overfitted. Let's use random forest now.
# Take a large collection of individually imperfect models, and their one-off mistakes
# are probably not going to be made by the rest of them. If we average the results of
# all these models, we can sometimes find a superior model from their combination than
# any of the individual parts.

# rpart has a great advantage in that it can use surrogate variables when it
# encounters an NA value. Unfortunately, that doesn't work for random forest.
# In our dataset a lot of age values are missing and we need to fix that first.

# And let's use a decision tree to do it: fit a regression tree on the rows
# with a known age and predict the age for the rows where it is missing.
age_missing <- is.na(combi$Age)
Agefit <- rpart(Age ~ Pclass + Sex + SibSp + Parch + Fare + Embarked + Title + FamilySize,
                data = combi[!age_missing, ],
                method = "anova")
combi$Age[age_missing] <- predict(Agefit, combi[age_missing, ])

# Also, Embarked and Fare are missing

# Only a couple of Embarked values are missing, so just set them to "S".
# The rows are located by their missing value instead of by row number.
embarked_missing <- is.na(combi$Embarked) | combi$Embarked == ""
combi$Embarked[embarked_missing] <- "S"
combi$Embarked <- factor(combi$Embarked)
# Missing Fare is replaced with the median fare.
combi$Fare[is.na(combi$Fare)] <- median(combi$Fare, na.rm = TRUE)

# Another problem: Random Forests in R can only digest factors with up to 32 levels.
# Our FamilyID variable had almost double that. Let's manually reduce the number of
# levels to keep it under the threshold.
combi$FamilyID2 <- as.character(combi$FamilyID)
combi$FamilyID2[combi$FamilySize <= 3] <- "Small"
combi$FamilyID2 <- factor(combi$FamilyID2)

# train and test were cut out of combi before the fixes above, so they still
# hold the missing values and have no FamilyID2 column. Split again so the
# models below see the cleaned data. combi keeps the training rows first.
n_train <- nrow(train)
is_train_row <- seq_len(nrow(combi)) <= n_train
train <- combi[is_train_row, ]
test <- combi[!is_train_row, ]

# Random forest: 2000 trees, with variable importance recorded.
# The seed is fixed so repeated runs build the same forest.
# NOTE(review): `train` must already carry the imputed values and the FamilyID2
# column that were built on `combi` - confirm it was split after those steps.
library(randomForest)
set.seed(415)
fit <- randomForest(
  as.factor(Survived) ~ Pclass + Sex + Age + SibSp + Parch + Fare +
    Embarked + Title + FamilySize + FamilyID2,
  data = train,
  importance = TRUE,
  ntree = 2000
)

# Accuracy measured on the training rows themselves.
train$Prediction <- predict(fit, newdata = train, type = "class")
with(train, mean(Prediction == Survived)) # [1] 0.8552189

# :( NO IMPROVEMENT HERE...

# Predict the test rows and write them out as PassengerId/Survived pairs.
Prediction <- predict(fit, newdata = test, type = "class")
submit <- data.frame(PassengerId = test$PassengerId, Survived = Prediction)
write.csv(submit, file = "random_forest.csv", row.names = FALSE)

# Let's try a forest of conditional inference trees. They make their decisions in slightly
# different ways, using a statistical test rather than a purity measure, but the basic
# construction of each tree is fairly similar.
library(party)

# Inspect the column classes before fitting.
sapply(train, class)

set.seed(415)

# Sex and Embarked are converted to factors for cforest. The test set gets the
# same conversion with the training levels, so the data handed to predict()
# has the same classes and levels as the data the forest was fitted on.
train$Sex <- factor(train$Sex)
train$Embarked <- factor(train$Embarked)
test$Sex <- factor(test$Sex, levels = levels(train$Sex))
test$Embarked <- factor(test$Embarked, levels = levels(train$Embarked))

fit <- cforest(as.factor(Survived) ~ Pclass + Sex + Age + SibSp + Parch + Fare +
                 Embarked + Title + FamilySize + FamilyID,
               data = train,
               controls = cforest_unbiased(ntree = 2000, mtry = 3))

# Accuracy on the training rows. "response" is used here, the same type the
# test prediction below requests; the data is passed as `newdata` by name so
# it cannot be matched to another argument of predict().
# NOTE(review): the previous comment repeated the rpart figure (0.8552189)
# here; re-run to record the actual value for this model.
train$Prediction <- predict(fit, newdata = train, type = "response")
mean(train$Prediction == train$Survived)

# Predict the test rows and write them out as PassengerId/Survived pairs.
Prediction <- predict(fit, newdata = test, OOB = TRUE, type = "response")
submit <- data.frame(PassengerId = test$PassengerId, Survived = Prediction)
write.csv(submit, file = "conditional_inference_trees.csv", row.names = FALSE)

# RESULTS on test data (kaggle)
# feature_engineering 0.79426
# random_forest (forest of decision trees) 0.79426 // no improvement here :(
# conditional_inference_trees 0.80861
	# VARIABLE DESCRIPTIONS:
	# survival Survival
	#(0 = No; 1 = Yes)
	#pclass Passenger Class
	#(1 = 1st; 2 = 2nd; 3 = 3rd)
	#name Name
	#sex Sex
	#age Age
	#sibsp Number of Siblings/Spouses Aboard
	#parch Number of Parents/Children Aboard
	#ticket Ticket Number
	#fare Passenger Fare
	#cabin Cabin
	#embarked Port of Embarkation
	#(C = Cherbourg; Q = Queenstown; S = Southampton)

	# Install only the packages that are not available yet, so that re-running the
	# script does not reinstall all of them every time.
	required_packages <- c("rattle", "RGtk2", "rpart.plot", "RColorBrewer",
	                       "randomForest", "party")
	# system.file() returns "" for a package that is not installed.
	is_installed <- vapply(
	  required_packages,
	  function(pkg) nzchar(system.file(package = pkg)),
	  logical(1)
	)
	if (!all(is_installed)) {
	  install.packages(required_packages[!is_installed])
	}

	# Set working directory and import datafiles.
	# NOTE(review): this path is machine-specific; adjust it when running elsewhere.
	setwd("~/git/hackaton-data-science")

	library(readr)
	train <- read_csv("train.csv")
	# View() opens a GUI data viewer, so only call it when someone is there to
	# look at it; batch runs (e.g. Rscript) skip it.
	if (interactive()) {
	  View(train)
	}

	# Count the passengers per value of Survived (0 = No, 1 = Yes).
	table(train$Survived)

	# Survival rate per sex and age group. Child is 1 for passengers younger than
	# 18 and 0 otherwise (passengers with an unknown age stay 0).
	train$Child <- as.numeric(!is.na(train$Age) & train$Age < 18)
	aggregate(
	  Survived ~ Child + Sex,
	  data = train,
	  FUN = function(survived) {
	    sum(survived) / length(survived)
	  }
	)
	# ^ we can do it naively here - each adult male dies, each adult female survives


	# Rather than hunting for patterns by hand (e.g. how gender and age relate to
	# the death rate), fit a decision tree. The algorithm is greedy: it scans all
	# variables and splits on the one that yields the purest child nodes.
	library(rattle)
	library(rpart.plot)
	library(RColorBrewer)

	fit <- rpart(
	  Survived ~ Pclass + Sex + Age + SibSp + Parch + Fare + Embarked,
	  data = train,
	  method = "class"
	)

	fancyRpartPlot(fit)

	# Share of training rows whose predicted class equals the recorded outcome.
	train$Prediction <- predict(fit, newdata = train, type = "class")
	with(train, mean(Prediction == Survived)) # [1] 0.8395062

	# ----------

	# Feature engineering: use data we did not look at before - title (Mr, Ms,
	# Mrs etc.), family size and family ID (maybe one family was more likely to
	# survive than another).

	# Stack the test rows under the training rows so the engineered features are
	# built on the larger, combined data set. The test rows get the extra columns
	# that train already has (Survived, Prediction, Child) so rbind() can match
	# them by name.
	test <- read_csv("test.csv")
	test$Survived <- NA
	test$Prediction <- NA
	test$Child <- 0
	test$Child[test$Age < 18] <- 1

	# View() opens a GUI data viewer, so skip it in batch runs (e.g. Rscript).
	if (interactive()) {
	  View(test)
	}
	combi <- rbind(train, test)

	# Extract titles...
	# Each name is split on ',' and '.'; the second piece is taken as the title.
	# vapply() guarantees a plain character vector (one value per passenger).
	combi$Name <- as.character(combi$Name)
	name_parts <- strsplit(combi$Name, split = "[,.]")
	combi$Title <- vapply(name_parts, function(parts) parts[2], character(1))
	# Drop the first space (the one left in front of the title by the split).
	combi$Title <- sub(" ", "", combi$Title)
	# Merge rare titles into a few broader groups.
	combi$Title[combi$Title %in% c("Mme", "Mlle")] <- "Mlle"
	combi$Title[combi$Title %in% c("Capt", "Don", "Major", "Sir")] <- "Sir"
	combi$Title[combi$Title %in% c("Dona", "Lady", "the Countess", "Jonkheer")] <- "Lady"
	combi$Title <- factor(combi$Title)

	table(combi$Title) # YES, YES, YES!

	# Extract family size: siblings/spouses + parents/children + the passenger.
	combi$FamilySize <- combi$SibSp + combi$Parch + 1

	# Extract family ID (we'll use family size + surname composite key).
	# The surname is the first piece of the name when split on ',' and '.'.
	surname_parts <- strsplit(combi$Name, split = "[,.]")
	combi$Surname <- vapply(surname_parts, function(parts) parts[1], character(1))
	combi$FamilyID <- paste0(combi$FamilySize, combi$Surname)
	combi$FamilyID[combi$FamilySize <= 2] <- "Small"
	# IDs that occur at most twice are folded into 'Small' as well.
	famIDs <- data.frame(table(combi$FamilyID))
	famIDs <- famIDs[famIDs$Freq <= 2, ]
	combi$FamilyID[combi$FamilyID %in% famIDs$Var1] <- "Small"
	combi$FamilyID <- factor(combi$FamilyID)

	# Split back into train and test. combi was built as rbind(train, test), so the
	# first nrow(train) rows are the training rows; deriving the boundary from the
	# data avoids hard-coded row numbers.
	n_train <- nrow(train)
	is_train_row <- seq_len(nrow(combi)) <= n_train
	train <- combi[is_train_row, ]
	test <- combi[!is_train_row, ]

	# Same classification tree as before, now with the engineered features
	# (Title, FamilySize, FamilyID) added to the predictors.
	fit <- rpart(
	  Survived ~ Pclass + Sex + Age + SibSp + Parch + Fare + Embarked +
	    Title + FamilySize + FamilyID,
	  data = train,
	  method = "class"
	)

	# Accuracy measured on the training rows themselves.
	train$Prediction <- predict(fit, newdata = train, type = "class")
	with(train, mean(Prediction == Survived)) # [1] 0.8552189

	# NICE, WE ARE GETTING BETTER AND BETTER!

	# Predict the test rows and write them out as PassengerId/Survived pairs.
	Prediction <- predict(fit, newdata = test, type = "class")
	submit <- data.frame(PassengerId = test$PassengerId, Survived = Prediction)
	write.csv(submit, file = "feature_engineering.csv", row.names = FALSE) # 0.79426

	# Mean looks nice, but still, tree can be overfitted. Let's use random forest now.
	# Take a large collection of individually imperfect models, and their one-off mistakes
	# are probably not going to be made by the rest of them. If we average the results of
	# all these models, we can sometimes find a superior model from their combination than
	# any of the individual parts.

	# rpart has a great advantage in that it can use surrogate variables when it
	# encounters an NA value. Unfortunately, that doesn't work for random forest.
	# In our dataset a lot of age values are missing and we need to fix that first.

	# And let's use a decision tree to do it: fit a regression tree on the rows
	# with a known age and predict the age for the rows where it is missing.
	age_missing <- is.na(combi$Age)
	Agefit <- rpart(Age ~ Pclass + Sex + SibSp + Parch + Fare + Embarked + Title + FamilySize,
	                data = combi[!age_missing, ],
	                method = "anova")
	combi$Age[age_missing] <- predict(Agefit, combi[age_missing, ])

	# Also, Embarked and Fare are missing

	# Only a couple of Embarked values are missing, so just set them to "S".
	# The rows are located by their missing value instead of by row number.
	embarked_missing <- is.na(combi$Embarked) | combi$Embarked == ""
	combi$Embarked[embarked_missing] <- "S"
	combi$Embarked <- factor(combi$Embarked)
	# Missing Fare is replaced with the median fare.
	combi$Fare[is.na(combi$Fare)] <- median(combi$Fare, na.rm = TRUE)

	# Another problem: Random Forests in R can only digest factors with up to 32 levels.
	# Our FamilyID variable had almost double that. Let's manually reduce the number of
	# levels to keep it under the threshold.
	combi$FamilyID2 <- as.character(combi$FamilyID)
	combi$FamilyID2[combi$FamilySize <= 3] <- "Small"
	combi$FamilyID2 <- factor(combi$FamilyID2)

	# train and test were cut out of combi before the fixes above, so they still
	# hold the missing values and have no FamilyID2 column. Split again so the
	# models below see the cleaned data. combi keeps the training rows first.
	n_train <- nrow(train)
	is_train_row <- seq_len(nrow(combi)) <= n_train
	train <- combi[is_train_row, ]
	test <- combi[!is_train_row, ]

	# Random forest: 2000 trees, with variable importance recorded.
	# The seed is fixed so repeated runs build the same forest.
	# NOTE(review): `train` must already carry the imputed values and the FamilyID2
	# column that were built on `combi` - confirm it was split after those steps.
	library(randomForest)
	set.seed(415)
	fit <- randomForest(
	  as.factor(Survived) ~ Pclass + Sex + Age + SibSp + Parch + Fare +
	    Embarked + Title + FamilySize + FamilyID2,
	  data = train,
	  importance = TRUE,
	  ntree = 2000
	)

	# Accuracy measured on the training rows themselves.
	train$Prediction <- predict(fit, newdata = train, type = "class")
	with(train, mean(Prediction == Survived)) # [1] 0.8552189

	# :( NO IMPROVEMENT HERE...

	# Predict the test rows and write them out as PassengerId/Survived pairs.
	Prediction <- predict(fit, newdata = test, type = "class")
	submit <- data.frame(PassengerId = test$PassengerId, Survived = Prediction)
	write.csv(submit, file = "random_forest.csv", row.names = FALSE)

	# Let's try a forest of conditional inference trees. They make their decisions in slightly
	# different ways, using a statistical test rather than a purity measure, but the basic
	# construction of each tree is fairly similar.
	library(party)

	# Inspect the column classes before fitting.
	sapply(train, class)

	set.seed(415)

	# Sex and Embarked are converted to factors for cforest. The test set gets the
	# same conversion with the training levels, so the data handed to predict()
	# has the same classes and levels as the data the forest was fitted on.
	train$Sex <- factor(train$Sex)
	train$Embarked <- factor(train$Embarked)
	test$Sex <- factor(test$Sex, levels = levels(train$Sex))
	test$Embarked <- factor(test$Embarked, levels = levels(train$Embarked))

	fit <- cforest(as.factor(Survived) ~ Pclass + Sex + Age + SibSp + Parch + Fare +
	                 Embarked + Title + FamilySize + FamilyID,
	               data = train,
	               controls = cforest_unbiased(ntree = 2000, mtry = 3))

	# Accuracy on the training rows. "response" is used here, the same type the
	# test prediction below requests; the data is passed as `newdata` by name so
	# it cannot be matched to another argument of predict().
	# NOTE(review): the previous comment repeated the rpart figure (0.8552189)
	# here; re-run to record the actual value for this model.
	train$Prediction <- predict(fit, newdata = train, type = "response")
	mean(train$Prediction == train$Survived)

	# Predict the test rows and write them out as PassengerId/Survived pairs.
	Prediction <- predict(fit, newdata = test, OOB = TRUE, type = "response")
	submit <- data.frame(PassengerId = test$PassengerId, Survived = Prediction)
	write.csv(submit, file = "conditional_inference_trees.csv", row.names = FALSE)

	# RESULTS on test data (kaggle)
	# feature_engineering 0.79426
	# random_forest (forest of decision trees) 0.79426 // no improvement here :(
	# conditional_inference_trees 0.80861