# Kaggle - Titanic Attempt
# Source: GitHub gist awcull/9978920, created April 4, 2014 17:14.
library(Hmisc) | |
setwd("C:/code/titanic") | |
## read.data
# Read a CSV file with the column classes fixed up front, so that columns
# which are really categorical are not silently parsed as numbers.
#   fp          - path of the CSV file
#   col.classes - character vector handed to read.csv() as `colClasses`
# Returns the data frame produced by read.csv().
read.data <- function(fp, col.classes) {
  parsed <- read.csv(file = fp, colClasses = col.classes)
  parsed
}
## impute.data
# Fill in missing values of `data` separately within each group of `filter`.
# Every group is passed to Hmisc::impute() with its default settings (for a
# numeric vector that is the median of the group's non-missing values).
#   data   - vector with missing values to fill
#   filter - grouping vector of the same length as `data`; a factor, or
#            anything as.factor() can convert (character, numeric, ...)
# Returns `data` with the NAs of each group replaced. Elements whose `filter`
# value is itself NA belong to no group and are left untouched.
impute.data <- function(data, filter) {
  if (length(data) != length(filter)) {
    stop("`data` and `filter` must have the same length", call. = FALSE)
  }
  # levels() is NULL for a non-factor, which previously made the loop a silent
  # no-op; coercing first lets plain character/numeric groupings work too.
  groups <- as.factor(filter)
  for (lvl in levels(groups)) {
    # which() drops NA comparisons, so rows with a missing group are skipped.
    idx <- which(groups == lvl)
    data[idx] <- impute(data[idx])
  }
  data
}
# Column classes for train.csv, in file order. Several columns are stored as
# numbers but are really categorical, so they are declared as factors here
# rather than converted after loading.
col.class <- c(
  "numeric",    # PassengerId
  "factor",     # Survived
  "factor",     # Pclass
  "character",  # Name
  "factor",     # Sex
  "numeric",    # Age
  "numeric",    # SibSp
  "numeric",    # Parch
  "character",  # Ticket
  "numeric",    # Fare
  "character",  # Cabin
  "factor"      # Embarked
)
# Load the labelled training set.
train.df <- read.data("train.csv", col.class)

# Load the test set with the same classes minus the second entry (Survived),
# which is the column the test file does not carry.
col.class.test <- col.class[-2]
test.df <- read.data("test.csv", col.class.test)
## Look at the data for missing values
summary(train.df)
# Author's reading of the summary: 177 Age values are missing, a number of
# fares are 0, and two passengers have no embarkation port.

# ---- Age ----
# Rows whose age is missing or recorded as 0. %in% never yields NA, so the
# mask is a clean TRUE/FALSE vector.
age.missing <- is.na(train.df$Age) | train.df$Age %in% 0
train.df[age.missing, ]
table(train.df[age.missing, "Pclass"])  # Lots of class 3 missing compared to the rest
# Ages look somewhat indicative of class, so impute within each class.
boxplot(Age ~ Pclass, data = train.df, main = "Age vs Class", xlab = "Class", ylab = "Age")
train.df$Age <- impute.data(train.df$Age, train.df$Pclass)
test.df$Age <- impute.data(test.df$Age, test.df$Pclass)

# ---- Fare ----
# Treat a fare of 0 as missing. which() keeps NA comparisons (fares that are
# already missing) out of the subscript.
train.df$Fare[which(train.df$Fare == 0)] <- NA
test.df$Fare[which(test.df$Fare == 0)] <- NA
boxplot(Fare ~ Pclass, data = train.df, main = "Fare vs Class", xlab = "Class", ylab = "Fare")
# The very large fares (> 500) look out of place at first glance ...
train.df[which(train.df$Fare > 500), ]  # ... but the rows suggest the prices are legitimate
# Fare is related to class as well, so impute within each class again.
train.df$Fare <- impute.data(train.df$Fare, train.df$Pclass)
test.df$Fare <- impute.data(test.df$Fare, test.df$Pclass)

# ---- Embarked ----
table(train.df$Embarked)
# Blank embarkation ports are filled with "S" (the author's choice after
# looking at the table above), then the now-unused "" level is dropped.
train.df$Embarked[which(train.df$Embarked == "")] <- "S"
train.df$Embarked <- droplevels(train.df$Embarked)
test.df$Embarked[which(test.df$Embarked == "")] <- "S"
test.df$Embarked <- droplevels(test.df$Embarked)
################################################################################
## Create training data
# Row indices of a random 80% of train.df; the remaining rows (train.df[-train, ])
# serve as a hold-out set. The seed is fixed so the split, and every hold-out
# result derived from it, can be reproduced between runs.
set.seed(2014)
train <- sample(seq_len(nrow(train.df)), round(nrow(train.df) * 0.8))
################################################################################
## Random Forests
library(randomForest)

# Scores in the comments below are the leaderboard values noted by the
# original author. Submissions 2 and 3 used to write model 1's predictions
# (pred.model.rf), so those files did not reflect their own models; each
# submission now uses the predictions of the model it is named after.
# The former `OOB=T` argument to predict() is dropped because
# predict.randomForest() has no such parameter.

# Attempt 1: four predictors, all training rows.
model.rf <- randomForest(Survived ~ Age + Sex + Fare + Embarked,
                         data = train.df, importance = TRUE)
importance(model.rf)
pred.model.rf <- predict(model.rf, test.df, type = "response")
submit <- data.frame(PassengerId = test.df$PassengerId, Survived = pred.model.rf)
write.csv(submit, "model_rf_1.csv", row.names = FALSE)  # Attempt 1 - 0.77512

# Attempt 2: add the family-size columns. More predictors improve the fit on
# the training data, but the author saw no improvement on the test data.
model.rf.2 <- randomForest(Survived ~ Age + Sex + Fare + Embarked + Parch + SibSp,
                           data = train.df, importance = TRUE)
importance(model.rf.2)
pred.model.rf.2 <- predict(model.rf.2, test.df, type = "response")
submit <- data.frame(PassengerId = test.df$PassengerId, Survived = pred.model.rf.2)
write.csv(submit, "model_rf_2.csv", row.names = FALSE)  # Attempt 2 - 0.77512

# Attempt 4 (file 3): same predictors as attempt 1, fitted on the 80% split only.
model.rf.3 <- randomForest(Survived ~ Age + Sex + Fare + Embarked,
                           data = train.df, subset = train, importance = TRUE)
importance(model.rf.3)
# `train` indexes rows of train.df, so the hold-out rows are train.df[-train, ]
# (not test.df[-train, ]). Report accuracy on them.
holdout.rf.3 <- predict(model.rf.3, train.df[-train, ], type = "response")
mean(holdout.rf.3 == train.df$Survived[-train])
# The submission needs a prediction for every row of test.df.
pred.model.rf.3 <- predict(model.rf.3, test.df, type = "response")
submit <- data.frame(PassengerId = test.df$PassengerId, Survived = pred.model.rf.3)
write.csv(submit, "model_rf_3.csv", row.names = FALSE)  # attempt 4 - 0.77033
################################################################################################################################# | |
## Boosting - Not working with gbm, not sure why yet | |
# library(gbm) | |
################################################################################
## SVM
library(e1071)

# Linear-kernel SVM on the same four predictors as the first random forest,
# with scaled inputs and cost 10.
model.svm <- svm(
  Survived ~ Age + Sex + Fare + Embarked,
  data = train.df,
  kernel = "linear",
  scale = TRUE,
  cost = 10
)

# Predict the test passengers and write the submission file.
pred.model.svm <- predict(model.svm, test.df)
submit <- data.frame(PassengerId = test.df$PassengerId, Survived = pred.model.svm)
write.csv(submit, "model_svm_1.csv", row.names = FALSE)  # Attempt 3
################################################################################
## Alternative
library(ada)
library(caret)
library(pROC)

# Set up cv control: 10-fold cross-validation repeated 10 times, scored with
# twoClassSummary (which needs class probabilities).
cv.ctrl <- trainControl(method = "repeatedcv", repeats = 10, number = 10,
                        summaryFunction = twoClassSummary, classProbs = TRUE)

## Random forest
# Candidate values for the number of variables tried at each split.
rf.grid <- data.frame(.mtry = c(1, 2, 3))

# With classProbs = TRUE caret requires class levels that are valid R names;
# the raw Survived levels read from the CSV are digits. Work on a copy with
# make.names() applied to the levels and map predictions back for submission.
caret.df <- train.df
survived.levels <- levels(caret.df$Survived)
levels(caret.df$Survived) <- make.names(survived.levels, unique = TRUE)

# `tuneGrid` was previously misspelled `tuneGride`, so rf.grid was never used.
rf.tune <- train(Survived ~ Age + Sex + Fare + Embarked, data = caret.df[train, ],
                 method = "rf", metric = "ROC", tuneGrid = rf.grid, trControl = cv.ctrl)

# Evaluate on the hold-out rows of the training data.
rf.pred <- predict(rf.tune, caret.df[-train, ])
confusionMatrix(rf.pred, caret.df[-train, "Survived"])

# Predict the test set and restore the original class labels; the renamed
# levels are in the same order as survived.levels.
rf.pred.sub <- predict(rf.tune, test.df)
levels(rf.pred.sub) <- survived.levels
submit <- data.frame(PassengerId = test.df$PassengerId, Survived = rf.pred.sub)
write.csv(submit, "model_rf_train_1.csv", row.names = FALSE)  # Attempt 5 - 0.77033
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment