Skip to content

Instantly share code, notes, and snippets.

@awcull
Created April 4, 2014 17:14
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save awcull/9978920 to your computer and use it in GitHub Desktop.
Save awcull/9978920 to your computer and use it in GitHub Desktop.
Kaggle - Titanic Attempt
# Hmisc is attached before impute.data() is used; impute.data() calls impute(),
# presumably the one Hmisc provides.
library(Hmisc)
# NOTE(review): hard-coded, machine-specific working directory. Every relative
# path in this script (the CSV reads and the submission writes) resolves
# against it, so the script only runs as-is where this directory exists.
setwd("C:/code/titanic")
## read.data
# Thin wrapper around read.csv() that fixes the class of every column up front.
#   fp          - path of the CSV file to read
#   col.classes - column classes, handed to read.csv() as colClasses
# Returns the data frame produced by read.csv().
read.data <- function(fp, col.classes) {
  parsed <- read.csv(file = fp, colClasses = col.classes)
  parsed
}
## impute.data
# Fill in the missing values of `data` separately within each group of `filter`.
#   data   - vector that may contain NA values
#   filter - grouping vector aligned with `data`; a factor, or anything
#            as.factor() can convert (character, numeric, ...)
# Returns `data` with each group's values replaced by impute() applied to that
# group alone (with Hmisc attached, impute() fills NAs with the group median
# by default).
impute.data <- function(data, filter) {
  # levels() of a non-factor is NULL, which previously made the loop a silent
  # no-op; coercing first lets plain character/numeric groupings work too.
  filter <- as.factor(filter)
  for (lvl in levels(filter)) {
    members <- which(filter == lvl)
    # A level with no observations has nothing to impute.
    if (length(members) > 0) {
      data[members] <- impute(data[members])
    }
  }
  data
}
# Column classes for the training CSV, listed in file order. They are fixed
# ahead of time because some columns are really categorical ('factors') but
# are stored as numbers and would otherwise be read in as numeric.
# The labels below only document which column each class belongs to; unname()
# strips them so the classes are still matched to columns by position.
col.class <- unname(c(
  PassengerId = "numeric",
  Survived    = "factor",
  Pclass      = "factor",
  Name        = "character",
  Sex         = "factor",
  Age         = "numeric",
  SibSp       = "numeric",
  Parch       = "numeric",
  Ticket      = "character",
  Fare        = "numeric",
  Cabin       = "character",
  Embarked    = "factor"
))
# Read in training data
train.df <- read.data("train.csv", col.class)
# Read in test data; it is read without the second (survival) column class
col.class.test <- col.class[-2]
test.df <- read.data("test.csv", col.class.test)
## Look at the data for missing values
summary(train.df)
# Can see that there are 177 Age values missing, a number of 0 fares and two missing embark locations.
# Look at age first by finding missing values (NA or recorded as 0)
age.missing <- is.na(train.df$Age) | train.df$Age == 0
train.df[age.missing, ]
table(train.df[age.missing, "Pclass"]) # Lots of class 3 missing compared to the rest
# Ages look somewhat indicative of class, so impute age within each class.
# The y axis shows Age (it was previously mislabelled "Fare").
boxplot(Age ~ Pclass, data = train.df, main = "Age vs Class", xlab = "Class", ylab = "Age")
# Impute ages
train.df$Age <- impute.data(train.df$Age, train.df$Pclass)
test.df$Age <- impute.data(test.df$Age, test.df$Pclass)
# Look at 0 fares, missing fares and large fares.
# A fare of 0 is treated as missing; which() keeps the index free of NAs when
# a fare is already NA.
train.df$Fare[which(train.df$Fare == 0)] <- NA
test.df$Fare[which(test.df$Fare == 0)] <- NA
boxplot(Fare ~ Pclass, data = train.df, main = "Fare vs Class", xlab = "Class", ylab = "Fare")
# The large values seem out of place (> 500)
train.df[which(train.df$Fare > 500), ] # Shows that prices seem legit
# Looks to be a relation between class and fare, so impute within class again
train.df$Fare <- impute.data(train.df$Fare, train.df$Pclass)
test.df$Fare <- impute.data(test.df$Fare, test.df$Pclass)
# Look at embark locations
table(train.df$Embarked)
# Blank embark locations are set to "S" (presumably the most common port in the
# table above - confirm); droplevels() then removes the now-unused "" level.
train.df$Embarked[train.df$Embarked == ""] <- "S"
train.df$Embarked <- droplevels(train.df$Embarked)
test.df$Embarked[test.df$Embarked == ""] <- "S"
test.df$Embarked <- droplevels(test.df$Embarked)
#################################################################################################################################
## Create training data
# Row indices of a random 80% of train.df; later code fits on these rows and
# evaluates on the rest (train.df[-train, ]).
# No seed is set here, so the split differs from run to run.
train <- sample.int(nrow(train.df), size = round(nrow(train.df) * 0.8))
#################################################################################################################################
## Random Forests
library(randomForest)
# Model 1: Age, Sex, Fare and Embarked, fitted on all training rows.
model.rf <- randomForest(Survived ~ Age + Sex + Fare + Embarked, data = train.df, importance = TRUE)
importance(model.rf)
pred.model.rf <- predict(model.rf, test.df, OOB = TRUE, type = "response")
submit <- data.frame(PassengerId = test.df$PassengerId, Survived = pred.model.rf)
write.csv(submit, "model_rf_1.csv", row.names = FALSE) # Attempt 1 - 0.77512
# Model 2: model 1 plus Parch and SibSp.
model.rf.2 <- randomForest(Survived ~ Age + Sex + Fare + Embarked + Parch + SibSp, data = train.df, importance = TRUE)
importance(model.rf.2)
pred.model.rf.2 <- predict(model.rf.2, test.df, OOB = TRUE, type = "response")
# Write model 2's own predictions. The file used to contain model 1's
# predictions (pred.model.rf), so the score noted below was not model 2's.
submit <- data.frame(PassengerId = test.df$PassengerId, Survived = pred.model.rf.2)
write.csv(submit, "model_rf_2.csv", row.names = FALSE) # Attempt 2 - because adding more 'data' will improve a fit but didnt improve test data - 0.77512 (recorded before the fix above)
# Model 3: same predictors as model 1, fitted only on the `train` subset of rows.
model.rf.3 <- randomForest(Survived ~ Age + Sex + Fare + Embarked, data = train.df, subset = train, importance = TRUE)
importance(model.rf.3)
# Predict for every test passenger. `train` holds row indices of train.df and
# must not be used to index test.df.
pred.model.rf.3 <- predict(model.rf.3, test.df, OOB = TRUE, type = "response")
# Write model 3's own predictions (previously model 1's were written here too).
submit <- data.frame(PassengerId = test.df$PassengerId, Survived = pred.model.rf.3)
write.csv(submit, "model_rf_3.csv", row.names = FALSE) # attempt 4 - 0.77033 (recorded before the fix above)
#################################################################################################################################
## Boosting - Not working with gbm, not sure why yet
# library(gbm)
#################################################################################################################################
## SVM
library(e1071)
# Linear-kernel SVM on the same four predictors as the first random forest,
# fitted on all training rows with scaling enabled and cost 10.
model.svm <- svm(
  Survived ~ Age + Sex + Fare + Embarked,
  data = train.df,
  kernel = "linear",
  scale = TRUE,
  cost = 10
)
# Predict the test set and write the submission file.
pred.model.svm <- predict(model.svm, newdata = test.df)
submit <- data.frame(
  PassengerId = test.df$PassengerId,
  Survived = pred.model.svm
)
write.csv(submit, "model_svm_1.csv", row.names = FALSE) # Attempt 3
#################################################################################################################################
## Alternative
library(ada)
library(caret)
library(pROC)
# Set up cv control: 10-fold cross-validation repeated 10 times, summarised
# with twoClassSummary so that "ROC" is available as the tuning metric.
# NOTE(review): classProbs = TRUE makes caret work with class probabilities;
# confirm the Survived factor levels read from the CSV are acceptable to caret
# as class names.
cv.ctrl <- trainControl(method = "repeatedcv", repeats = 10, number = 10, summaryFunction = twoClassSummary, classProbs = TRUE)
## Random forest
# Candidate mtry values to tune over.
rf.grid <- data.frame(.mtry = c(1, 2, 3))
# The argument is spelled tuneGrid; it was previously misspelled "tuneGride",
# so rf.grid was never used as the tuning grid.
rf.tune <- train(Survived ~ Age + Sex + Fare + Embarked, data = train.df[train, ], method = "rf", metric = "ROC", tuneGrid = rf.grid, trControl = cv.ctrl)
# Evaluate on the training rows held out of `train`.
rf.pred <- predict(rf.tune, train.df[-train, ])
confusionMatrix(rf.pred, train.df[-train, "Survived"])
# Predict the Kaggle test set and write the submission file.
rf.pred.sub <- predict(rf.tune, test.df)
submit <- data.frame(PassengerId = test.df$PassengerId, Survived = rf.pred.sub)
write.csv(submit, "model_rf_train_1.csv", row.names = FALSE) # Attempt 5 - 0.77033 (recorded before the tuneGrid fix)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment