Skip to content

Instantly share code, notes, and snippets.

@oskar-j
Last active August 29, 2015 14:22
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save oskar-j/a50e1bab232bf5a96bfa to your computer and use it in GitHub Desktop.
Save oskar-j/a50e1bab232bf5a96bfa to your computer and use it in GitHub Desktop.
Build a classifier for Titanic passengers and try to predict who survives the catastrophe
# Environment and data loading ----
# The script reads train.csv / test.csv relative to this working directory.
setwd("C:/big data/kaggle/titanic/")
library(ggplot2)
library(caret)
library(doParallel)
library(randomForest)
library(stringr)
# detectCores() can return NA when the core count cannot be determined;
# fall back to a single worker in that case.
registerDoParallel(max(1L, detectCores(), na.rm = TRUE))
# Empty and blank strings are read as NA. In train.csv the first column is
# used as row names; test.csv keeps all of its columns.
train <- read.csv("train.csv", header = TRUE, na.strings = c("", " ", "NA"),
                  sep = ",", dec = ".", stringsAsFactors = FALSE,
                  row.names = 1)
test <- read.csv("test.csv", header = TRUE, na.strings = c("", " ", "NA"),
                 sep = ",", dec = ".", stringsAsFactors = FALSE)
print("Data read")
# View(train)
# Encode the categorical predictors as factors, identically in both data sets.
# Survived is deliberately left as read at this point:
# train$Survived = as.factor(train$Survived)
categoricalCols <- c("Pclass", "Sex", "SibSp", "Parch", "Embarked")
train[categoricalCols] <- lapply(train[categoricalCols], as.factor)
test[categoricalCols] <- lapply(test[categoricalCols], as.factor)
# Nickname flag: 1 when Name contains a double-quoted alphabetic token,
# 0 otherwise; stored as a factor.
nicknamePattern <- "\"[A-Za-z]+\""
train$Nickname <- as.factor(ifelse(grepl(nicknamePattern, train$Name), 1, 0))
test$Nickname <- as.factor(ifelse(grepl(nicknamePattern, test$Name), 1, 0))
# Ticket clean-up ----
# Splits the raw Ticket string of a data frame into two features:
#   TicketSeries - factor holding the first whitespace-delimited prefix
#                  (NA when the ticket has no prefix);
#   Ticket       - the ticket number as numeric.
# Takes a data frame with a character Ticket column and returns it with
# Ticket replaced and TicketSeries appended.
splitTicket <- function(df) {
  ticket <- df$Ticket
  # Normalise the outliers so that every prefixed ticket reads
  # "<prefix> <number>". fixed() keeps "." and "/" literal instead of
  # treating the pattern as a regular expression.
  ticket <- str_replace(ticket, fixed("STON/O 2. "), "STON/O2. ")
  ticket <- str_replace(ticket, fixed("LINE"), "LINE 0")
  ticket <- str_replace(ticket, fixed("SC/AH Basle "), "SC/AH/Basle ")
  # stringr patterns are regular expressions with look-ahead support by
  # default, so the deprecated perl() wrapper is not needed.
  df$TicketSeries <- as.factor(str_extract(ticket, "\\S+(?=\\s+)"))
  # Keep only the last whitespace-separated token. Tickets without a prefix
  # are unchanged; a multi-part prefix such as "A. 2. 39186" yields 39186
  # instead of being coerced to NA.
  df$Ticket <- as.numeric(sub("^.*\\s", "", str_trim(ticket)))
  df
}
train <- splitTicket(train)
test <- splitTicket(test)
# which(is.na(train$Ticket))
sapply(train, class)
# Hold-out evaluation ----
# Columns used by the model: the response followed by the predictors.
colNamesRD <- c("Survived", "Pclass", "Sex", "Age", "SibSp",
                "Parch", "Ticket", "Fare",
                "Embarked", "Nickname", "TicketSeries")
# Convert the label to a factor before partitioning so that
# createDataPartition samples within each class; with a numeric 0/1 vector
# it bins by percentiles instead, which does not stratify by class.
train$Survived <- as.factor(train$Survived)
# The returned indices are the rows kept for training (p = 3/4).
trainIndex <- createDataPartition(train$Survived, p = 3/4)[[1]]
training <- train[trainIndex, colNamesRD]
testing <- train[-trainIndex, colNamesRD]
# training <- training[ order(row.names(training)), ]
# Impute the missing predictor values of the training part.
training <- rfImpute(Survived ~ ., training)
# NOTE(review): no preProcess method is requested, so preProcOptions
# presumably has no effect here - confirm before relying on thresh.
modelFit <- train(Survived ~ ., method = "rf",
                  trControl = trainControl(
                    method = "oob",
                    preProcOptions = list(thresh = 0.85)
                  ),
                  data = training)
# Impute the hold-out rows without looking at their labels (median / mode),
# the same way the unlabeled test set is treated before prediction;
# rfImpute would use the hold-out Survived values and bias the estimate.
testing <- na.roughfix(testing)
# confusionMatrix expects predictions as `data` and truth as `reference`.
confusionMatrix(data = predict(modelFit, testing),
                reference = testing$Survived)
# Final model ----
# Refit on the full labelled data: keep the modelling columns, impute the
# missing predictor values, then train a random forest with OOB resampling.
# `train` is both the data frame and caret's fitting function; R resolves the
# call below to the function.
train <- train[, colNamesRD]
train <- rfImpute(Survived ~ ., train)
oobControl <- trainControl(method = "oob",
                           preProcOptions = list(thresh = 0.85))
modelFit <- train(Survived ~ ., data = train, method = "rf",
                  trControl = oobControl)
# Score the test set ----
# Re-encode TicketSeries and Parch on the levels present in the training
# data; values the model has never seen become NA and are imputed below.
test$TicketSeries <- factor(test$TicketSeries,
                            levels = levels(train$TicketSeries))
test$Parch <- factor(test$Parch, levels = levels(train$Parch))
# Drop the free-text columns with a logical mask; `-which(...)` would select
# zero columns if neither name were present.
test <- test[, !(names(test) %in% c("Name", "Cabin"))]
# Fill remaining NAs with the column median (numeric) or mode (factor).
test <- na.roughfix(test)
test$Survived <- predict(modelFit, test)
write.csv(test[, c("PassengerId", "Survived")],
          file = "results.csv", quote = FALSE, row.names = FALSE)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment