@davidjwiner
Created March 4, 2018 01:47
Prediction challenge code for team "Just Working On My Fitness"
rm(list=ls())
source("http://www.stanford.edu/~bayati/oit367/T367_utilities_13_alpha.R")
setwd("/Users/davidjwiner/Dropbox/GSB/OIT\ 367/Prediction_Challenge")
training = read.csv("training.csv")
# randomizing the training data
training = training[sample(nrow(training)),]
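# Optional step (an added suggestion, not part of the original script): fixing the RNG seed
# makes this shuffle and the runif() train/validation split below reproducible across runs
set.seed(367)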
test = read.csv("test.csv")
#-----------------------------------------------------------------------
#----------------- Some basic preprocessing
# Creating a dummy response for the test set. It will never be used, but it lets us stack
# the training and test sets on top of each other and preprocess them together.
test$RESPONSE = rep(0,nrow(test))
alldata = rbind(training,test)
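# Quick sanity check (an added step): the stacked frame should have one row per training
# and test observation
stopifnot(nrow(alldata) == nrow(training) + nrow(test))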
# Creating one dummy for Male and one for Female. Note that we are not dropping one of them,
# as we did in the past, since there exists a third possibility (NA) that serves as the
# dropped category.
alldata$GenderF = as.numeric((alldata$Gender=="F")&(!is.na(alldata$Gender)))
alldata$GenderM = as.numeric((alldata$Gender=="M")&(!is.na(alldata$Gender)))
alldata$Gender=NULL
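# Cross-tabulate the two dummies (an added check, not in the original): no row should have
# both GenderF and GenderM equal to 1, and rows with both equal to 0 are the NA genders
table(alldata$GenderF, alldata$GenderM)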
alldata$Profile_Status <- factor(alldata$Profile_Status)
alldata$City_Id <- factor(alldata$City_Id)
# We didn't end up using this approach
# threshold = 5000
# factor_features = c("Source","Profile_Status", "Language", "City_Id", "Country_Code", "Region", "Sub_Region")
#
# for (col in factor_features){
# levels(alldata[, col])[table(alldata[, col]) < threshold] = 'Other'
# }
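# If the rare-level collapsing above were ever revisited, a sketch of the same idea
# (assuming the listed columns are factors in alldata) could look like this; it is kept
# commented out since the approach was not used:
# collapseRare = function(f, threshold = 5000) {
#   counts = table(f)
#   levels(f)[levels(f) %in% names(counts)[counts < threshold]] = "Other"
#   f
# }
# alldata[factor_features] = lapply(alldata[factor_features], collapseRare)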
#---------------------------------------------------------------------
# Partition the labeled data into training and validation sets.
# Subset alldata down to its first nrow(training) rows first, so that the logical index
# below is not recycled over the appended test rows.
train.size = 0.8
labeledProcessed = alldata[1:nrow(training), ]
train.ind = runif(nrow(labeledProcessed)) < train.size
trainingProcessed = labeledProcessed[train.ind, ]
validationProcessed = labeledProcessed[!train.ind, ]
testProcessed = alldata[-(1:nrow(training)), ]
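# Report the split sizes (an added check): roughly 80% / 20% of the labeled rows, plus all test rows
cat("Training rows:", nrow(trainingProcessed),
    " Validation rows:", nrow(validationProcessed),
    " Test rows:", nrow(testProcessed), "\n")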
# Build decision tree model.
# Predictor columns: all preprocessed features except the response and the row Id
# (an assumed choice; `columns` is not defined elsewhere in this script)
columns = setdiff(names(trainingProcessed), c("RESPONSE", "Id"))
model.tree = buildModel("RESPONSE", columns, trainingProcessed, type = 'classify', method = "decisionTree")
prp(model.tree, extra=4, fallen.leaves=TRUE, type=1, box.col=rainbow(30), varlen=0,digits=6,faclen=0)
# Output validation predictions (same genPred call as for the test set below; genPred is
# assumed to return predicted probabilities, as required by ROCR)
validationPredictions = genPred(model.tree, newdata = validationProcessed, method = 'decisionTree')
pred_ROCR <- prediction(validationPredictions, validationProcessed$RESPONSE)
auc_ROCR <- performance(pred_ROCR, measure = "auc")
auc_ROCR <- auc_ROCR@y.values[[1]]
cat("The ROC value is ", auc_ROCR)
# Make test predictions and write output to file
predictions = genPred(model.tree, newdata=testProcessed, method='decisionTree')
submission = data.frame(Id=test$Id, Prediction = predictions)
write.csv(submission, file = "team_submission.csv", row.names = FALSE)
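# Quick look at the submission before uploading (an added check): one prediction per test row
head(submission)
stopifnot(nrow(submission) == nrow(test))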