Skip to content

Instantly share code, notes, and snippets.

@pdyraga
Created November 25, 2016 09:31
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save pdyraga/4bd72c43532b2ec2224a5e54ae85716b to your computer and use it in GitHub Desktop.
Save pdyraga/4bd72c43532b2ec2224a5e54ae85716b to your computer and use it in GitHub Desktop.
# VARIABLE DESCRIPTIONS:
# survival Survival
#(0 = No; 1 = Yes)
#pclass Passenger Class
#(1 = 1st; 2 = 2nd; 3 = 3rd)
#name Name
#sex Sex
#age Age
#sibsp Number of Siblings/Spouses Aboard
#parch Number of Parents/Children Aboard
#ticket Ticket Number
#fare Passenger Fare
#cabin Cabin
#embarked Port of Embarkation
#(C = Cherbourg; Q = Queenstown; S = Southampton)
# Install only the packages that are not already present, so that re-running
# the script does not re-download everything on each run.
# readr is included because the script loads it with library(readr) below.
required_pkgs <- c("rattle", "RGtk2", "rpart.plot", "RColorBrewer",
                   "randomForest", "party", "readr")
missing_pkgs <- setdiff(required_pkgs, rownames(installed.packages()))
if (length(missing_pkgs) > 0) {
  install.packages(missing_pkgs)
}
# Set working directory and import datafiles
# NOTE(review): this path is specific to the original author's machine; the
# relative file names used below ("train.csv", "test.csv", output CSVs) are
# resolved against it, so adjust it when running elsewhere.
setwd("~/git/hackaton-data-science")
library(readr)
train <- read_csv("train.csv")
# View() opens a GUI data viewer, so only call it when someone is there to
# look at it; this keeps the script runnable non-interactively (e.g. Rscript).
if (interactive()) {
  View(train)
}
# How many passengers survived (1) and how many did not (0)
table(train$Survived)
# Share of survivors for every combination of sex and child/adult status.
# Child is 1 when the age is known and below 18, otherwise 0 (passengers with
# a missing age therefore count as adults).
train$Child <- as.numeric(!is.na(train$Age) & train$Age < 18)
aggregate(
  Survived ~ Child + Sex,
  data = train,
  FUN = function(outcome) {
    sum(outcome) / length(outcome)
  }
)
# ^ A naive rule already falls out of the table above: predict that every adult
# male dies and every adult female survives.
# Rather than hunting for such patterns by hand (sex, age and how they relate
# to the death rate), fit a decision tree. It is greedy: at each step it scans
# all variables and splits on the one that yields the purest child nodes.
# NOTE(review): there is no explicit library(rpart); rpart() is presumably
# attached as a dependency of rpart.plot -- confirm.
library(rattle)
library(rpart.plot)
library(RColorBrewer)
fit <- rpart(
  formula = Survived ~ Pclass + Sex + Age + SibSp + Parch + Fare + Embarked,
  data = train,
  method = "class"
)
fancyRpartPlot(fit)
# Accuracy measured on the same rows the tree was fitted on
train$Prediction <- predict(fit, newdata = train, type = "class")
mean(train$Prediction == train$Survived) # [1] 0.8395062
# ----------
# Let's do some feature engineering and take into account data that we didn't look at before:
# title (Mr, Ms, Mrs etc.), family size and family ID (maybe one family was more likely to
# survive than other)
# Let's use test data to make our data set for building decisions bigger
test <- read_csv("test.csv")
# Add the columns that exist only in train so both frames have the same
# column set before they are stacked with rbind().
test$Survived <- NA
test$Prediction <- NA
test$Child <- 0
test$Child[test$Age < 18] <- 1
# View() opens a GUI data viewer, so only call it in an interactive session;
# this keeps the script runnable non-interactively (e.g. Rscript).
if (interactive()) {
  View(test)
}
# train rows come first, test rows after them
combi <- rbind(train, test)
# Derive a Title feature from the passenger name.
# The name is split on ',' and '.', and the second piece is taken as the title.
combi$Name <- as.character(combi$Name)
combi$Title <- vapply(
  combi$Name,
  FUN = function(full_name) {
    strsplit(full_name, split = "[,.]")[[1]][2]
  },
  FUN.VALUE = character(1)
)
# Drop the first space, i.e. the one that follows the comma
combi$Title <- sub(" ", "", combi$Title)
# Collapse rare titles into a few broader groups
combi$Title[combi$Title %in% c("Mme", "Mlle")] <- "Mlle"
combi$Title[combi$Title %in% c("Capt", "Don", "Major", "Sir")] <- "Sir"
combi$Title[combi$Title %in% c("Dona", "Lady", "the Countess", "Jonkheer")] <- "Lady"
combi$Title <- factor(combi$Title)
# Inspect the resulting title counts
table(combi$Title)
# Family size: the passenger plus siblings/spouses plus parents/children
combi$FamilySize <- 1 + combi$SibSp + combi$Parch
# Family ID: a composite key made of the family size and the surname, where
# the surname is the first piece of the name when split on ',' and '.'
combi$Surname <- vapply(
  combi$Name,
  FUN = function(full_name) {
    strsplit(full_name, split = "[,.]")[[1]][1]
  },
  FUN.VALUE = character(1)
)
combi$FamilyID <- paste0(as.character(combi$FamilySize), combi$Surname)
# Families of one or two people are lumped together...
combi$FamilyID[combi$FamilySize <= 2] <- "Small"
# ...and so is every ID that occurs at most twice in the combined data
famIDs <- data.frame(table(combi$FamilyID))
famIDs <- famIDs[famIDs$Freq <= 2, ]
combi$FamilyID[combi$FamilyID %in% famIDs$Var1] <- "Small"
combi$FamilyID <- factor(combi$FamilyID)
# Cut the combined frame back into its two parts: the first 891 rows are the
# training rows, rows 892 to 1309 are the test rows.
train <- combi[1:891, ]
test <- combi[892:1309, ]
# Same kind of classification tree as before, now with the engineered
# features added to the formula
fit <- rpart(
  formula = Survived ~ Pclass + Sex + Age + SibSp + Parch + Fare + Embarked +
    Title + FamilySize + FamilyID,
  data = train,
  method = "class"
)
# Accuracy on the rows the tree was fitted on
train$Prediction <- predict(fit, newdata = train, type = "class")
mean(train$Prediction == train$Survived) # [1] 0.8552189
# Better than the first tree.
# Predict the test rows and write the submission file
Prediction <- predict(fit, newdata = test, type = "class")
submit <- data.frame(PassengerId = test$PassengerId, Survived = Prediction)
write.csv(submit, file = "feature_engineering.csv", row.names = FALSE) # 0.79426
# Mean looks nice, but still, tree can be overfitted. Let's use random forest now.
# Take a large collection of individually imperfect models, and their one-off mistakes
# are probably not going to be made by the rest of them. If we average the results of
# all these models, we can sometimes find a superior model from their combination than
# any of the individual parts.
# rpart has a great advantage in that it can use surrogate variables when it encounters
# NA value. Unfortunately, it doesn't work for random forest. In our dataset there
# are a lot of age values missing and we need to fix it first.
# And let's use a decision tree to do it: fit a regression tree ("anova") on the
# rows with a known Age and predict Age for the rows where it is missing.
Agefit <- rpart(Age ~ Pclass + Sex + SibSp + Parch + Fare + Embarked + Title + FamilySize,
                data = combi[!is.na(combi$Age), ],
                method = "anova")
combi$Age[is.na(combi$Age)] <- predict(Agefit, combi[is.na(combi$Age), ])
# Also, Embarked and Fare are missing.
# Just two Embarked values are missing, so fill them with "S". The rows are
# selected by missingness (NA or empty string) rather than by hard-coded row
# numbers, so this keeps working if the row order changes.
combi$Embarked[combi$Embarked %in% c(NA, "")] <- "S"
combi$Embarked <- factor(combi$Embarked)
# Only one Fare is missing; replace it with the median of the known fares
combi$Fare[is.na(combi$Fare)] <- median(combi$Fare, na.rm = TRUE)
# Another problem: Random Forests in R can only digest factors with up to 32 levels.
# Our FamilyID variable had almost double that. Let's manually reduce the number of
# levels to keep it under the threshold by also treating families of three as 'Small'.
combi$FamilyID2 <- as.character(combi$FamilyID)
combi$FamilyID2[combi$FamilySize <= 3] <- "Small"
combi$FamilyID2 <- factor(combi$FamilyID2)
# Let's build random forest now!!!
library(randomForest)
# randomForest expects categorical predictors to be factors. Sex is converted
# on the combined frame so that train and test share the same levels.
# NOTE(review): Sex is presumably still a character column as read by
# read_csv at this point -- confirm.
combi$Sex <- factor(combi$Sex)
# Re-split train/test from combi. The previous split was taken before the
# Age/Embarked/Fare imputation and before FamilyID2 was created, so fitting on
# the old `train` would use rows with missing values and a formula variable
# (FamilyID2) that does not exist in it.
train <- combi[1:891, ]
test <- combi[892:1309, ]
set.seed(415)
fit <- randomForest(
  as.factor(Survived) ~ Pclass + Sex + Age + SibSp + Parch + Fare +
    Embarked + Title + FamilySize + FamilyID2,
  data = train,
  importance = TRUE,
  ntree = 2000
)
# Accuracy on the rows the forest was fitted on
train$Prediction <- predict(fit, newdata = train, type = "response")
mean(train$Prediction == train$Survived)
# Figure recorded in the original notes: 0.8552189 ("no improvement here");
# re-run to confirm the value now that the forest is fitted on the re-split data.
Prediction <- predict(fit, newdata = test, type = "response")
submit <- data.frame(PassengerId = test$PassengerId, Survived = Prediction)
write.csv(submit, file = "random_forest.csv", row.names = FALSE)
# Let's try a forest of conditional inference trees. They make their decisions in slightly
# different ways, using a statistical test rather than a purity measure, but the basic
# construction of each tree is fairly similar.
library(party)
# Quick look at the column classes before fitting
sapply(train, class)
set.seed(415)
# Categorical predictors must be factors, and train and test must agree on
# both the column type and the factor levels, otherwise predicting on `test`
# is done with mismatched inputs. The test columns therefore reuse the levels
# found in train.
train$Sex <- factor(train$Sex)
train$Embarked <- factor(train$Embarked)
test$Sex <- factor(test$Sex, levels = levels(train$Sex))
test$Embarked <- factor(test$Embarked, levels = levels(train$Embarked))
fit <- cforest(
  as.factor(Survived) ~ Pclass + Sex + Age + SibSp + Parch + Fare +
    Embarked + Title + FamilySize + FamilyID,
  data = train,
  controls = cforest_unbiased(ntree = 2000, mtry = 3)
)
# The data is passed by name as `newdata` and the predicted class is requested
# with type = "response".
# NOTE(review): party's predict method for forests appears to take OOB as its
# second argument and to offer "response"/"prob" rather than "class" -- confirm
# against the party documentation.
train$Prediction <- predict(fit, newdata = train, type = "response")
mean(train$Prediction == train$Survived) # recorded in the original notes: 0.8552189
Prediction <- predict(fit, newdata = test, OOB = TRUE, type = "response")
submit <- data.frame(PassengerId = test$PassengerId, Survived = Prediction)
write.csv(submit, file = "conditional_inference_trees.csv", row.names = FALSE)
# RESULTS on test data (kaggle)
# feature_engineering 0.79426
# random_forest 0.79426 // no improvement here :(
# conditional_inference_trees 0.80861
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment