# davidjwiner/just_working_on_my_fitness.R

## just_working_on_my_fitness.R
# Start from an empty workspace, then load the course helper script.
# Presumably buildModel() and genPred(), which are called further down but not
# defined in this file, come from it -- confirm.
rm(list=ls())
source("http://www.stanford.edu/~bayati/oit367/T367_utilities_13_alpha.R")

# The directory name contains a space. R string literals have no "\ " escape
# (writing one is a parse error), so the space is written literally.
setwd("/Users/davidjwiner/Dropbox/GSB/OIT 367/Prediction_Challenge")
training <- read.csv("training.csv")

# Put the training rows in random order
training <- training[sample(nrow(training)), ]
test <- read.csv("test.csv")

#-----------------------------------------------------------------------
#----------------- Some basic preprocessing
# Give the test set an all-zero placeholder RESPONSE. It is never used as a real
# label; it only lets the test rows be stacked under the training rows so both
# can be preprocessed in one pass.
test$RESPONSE <- numeric(nrow(test))

alldata <- rbind(training, test)

# Replace Gender with two 0/1 indicator columns. Neither one is dropped as a
# reference level, because a missing Gender acts as the third (dropped) category:
# those rows get 0 in both columns. %in% returns FALSE for NA, so no separate
# NA check is needed.
alldata$GenderF <- as.numeric(alldata$Gender %in% "F")
alldata$GenderM <- as.numeric(alldata$Gender %in% "M")
alldata$Gender <- NULL

# Convert these two columns to factors
alldata[c("Profile_Status", "City_Id")] <- lapply(
  alldata[c("Profile_Status", "City_Id")],
  factor
)

# We didn't end up using this approach

# threshold = 5000
# factor_features = c("Source","Profile_Status", "Language", "City_Id", "Country_Code", "Region", "Sub_Region")
#
# for (col in factor_features){
#   levels(alldata[, col])[table(alldata[, col]) < threshold] = 'Other'
# }

#---------------------------------------------------------------------

# Partition the labelled rows into training and validation sets
train.size <- 0.8
n.labelled <- nrow(training)

# One uniform draw per labelled row; a row goes to the training split when its
# draw falls below train.size
train.ind <- runif(n.labelled) < train.size

# alldata holds the labelled (training.csv) rows first, followed by the test
# rows. train.ind only covers the labelled rows, so it must not index alldata
# directly: a logical index shorter than the data is recycled, which would pull
# test rows (with their placeholder RESPONSE) into both splits.
is.labelled <- seq_len(nrow(alldata)) <= n.labelled
labelledProcessed <- alldata[is.labelled, ]

trainingProcessed <- labelledProcessed[train.ind, ]
validationProcessed <- labelledProcessed[!train.ind, ]

# Everything after the labelled rows is the test set
testProcessed <- alldata[!is.labelled, ]

# Fit a decision tree for RESPONSE on the training split.
# NOTE(review): `columns` is not assigned anywhere in this script; presumably it
# is expected to come from the sourced utilities -- confirm.
model.tree <- buildModel(
  "RESPONSE",
  columns,
  trainingProcessed,
  type = 'classify',
  method = "decisionTree"
)

# Draw the fitted tree
prp(
  model.tree,
  extra = 4,
  fallen.leaves = TRUE,
  type = 1,
  box.col = rainbow(30),
  varlen = 0,
  digits = 6,
  faclen = 0
)

# Score the validation split and report the area under the ROC curve.
# The predictions are taken from the fitted tree `model.tree` (this script never
# creates an object called `model`) through genPred(), the same helper call that
# produces the test-set predictions, so both sets are scored the same way.
validationPredictions <- genPred(model.tree, newdata = validationProcessed, method = 'decisionTree')
pred_ROCR <- prediction(validationPredictions, validationProcessed$RESPONSE)
auc_ROCR <- performance(pred_ROCR, measure = "auc")
# Pull the single AUC number out of the performance object's y.values slot
auc_ROCR <- auc_ROCR@y.values[[1]]
cat("The ROC value is ", auc_ROCR)

# Predict on the test rows and save the predictions, keyed by Id, as the
# submission file
predictions <- genPred(model.tree, newdata = testProcessed, method = 'decisionTree')
submission <- data.frame(
  Id = test$Id,
  Prediction = predictions
)
write.csv(submission, file = "team_submission.csv", row.names = FALSE)
	# NOTE(review): from here on the script repeats the pipeline above a second
	# time (tab-indented); it looks like an accidental duplicate -- confirm.
	# Start from an empty workspace, then load the course helper script.
	rm(list=ls())
	source("http://www.stanford.edu/~bayati/oit367/T367_utilities_13_alpha.R")

	# The directory name contains a space. R string literals have no "\ " escape
	# (writing one is a parse error), so the space is written literally.
	setwd("/Users/davidjwiner/Dropbox/GSB/OIT 367/Prediction_Challenge")
	training <- read.csv("training.csv")

	# Put the training rows in random order
	training <- training[sample(nrow(training)), ]
	test <- read.csv("test.csv")

	#-----------------------------------------------------------------------
	#----------------- Some basic preprocessing
	# Give the test set an all-zero placeholder RESPONSE. It is never used as a
	# real label; it only lets the test rows be stacked under the training rows
	# so both can be preprocessed in one pass.
	test$RESPONSE <- numeric(nrow(test))

	alldata <- rbind(training, test)

	# Replace Gender with two 0/1 indicator columns. Neither one is dropped as a
	# reference level, because a missing Gender acts as the third (dropped)
	# category: those rows get 0 in both columns. %in% returns FALSE for NA, so
	# no separate NA check is needed.
	alldata$GenderF <- as.numeric(alldata$Gender %in% "F")
	alldata$GenderM <- as.numeric(alldata$Gender %in% "M")
	alldata$Gender <- NULL

	# Convert these two columns to factors
	alldata[c("Profile_Status", "City_Id")] <- lapply(
	  alldata[c("Profile_Status", "City_Id")],
	  factor
	)

	# We didn't end up using this approach

	# threshold = 5000
	# factor_features = c("Source","Profile_Status", "Language", "City_Id", "Country_Code", "Region", "Sub_Region")
	#
	# for (col in factor_features){
	# levels(alldata[, col])[table(alldata[, col]) < threshold] = 'Other'
	# }

	#---------------------------------------------------------------------

	# Partition the labelled rows into training and validation sets
	train.size <- 0.8
	n.labelled <- nrow(training)

	# One uniform draw per labelled row; a row goes to the training split when
	# its draw falls below train.size
	train.ind <- runif(n.labelled) < train.size

	# alldata holds the labelled (training.csv) rows first, followed by the test
	# rows. train.ind only covers the labelled rows, so it must not index alldata
	# directly: a logical index shorter than the data is recycled, which would
	# pull test rows (with their placeholder RESPONSE) into both splits.
	is.labelled <- seq_len(nrow(alldata)) <= n.labelled
	labelledProcessed <- alldata[is.labelled, ]

	trainingProcessed <- labelledProcessed[train.ind, ]
	validationProcessed <- labelledProcessed[!train.ind, ]

	# Everything after the labelled rows is the test set
	testProcessed <- alldata[!is.labelled, ]

	# Fit a decision tree for RESPONSE on the training split.
	# NOTE(review): `columns` is not assigned anywhere in this script; presumably
	# it is expected to come from the sourced utilities -- confirm.
	model.tree <- buildModel(
	  "RESPONSE",
	  columns,
	  trainingProcessed,
	  type = 'classify',
	  method = "decisionTree"
	)

	# Draw the fitted tree
	prp(
	  model.tree,
	  extra = 4,
	  fallen.leaves = TRUE,
	  type = 1,
	  box.col = rainbow(30),
	  varlen = 0,
	  digits = 6,
	  faclen = 0
	)

	# Score the validation split and report the area under the ROC curve.
	# The predictions are taken from the fitted tree `model.tree` (this script
	# never creates an object called `model`) through genPred(), the same helper
	# call that produces the test-set predictions, so both are scored the same way.
	validationPredictions <- genPred(model.tree, newdata = validationProcessed, method = 'decisionTree')
	pred_ROCR <- prediction(validationPredictions, validationProcessed$RESPONSE)
	auc_ROCR <- performance(pred_ROCR, measure = "auc")
	# Pull the single AUC number out of the performance object's y.values slot
	auc_ROCR <- auc_ROCR@y.values[[1]]
	cat("The ROC value is ", auc_ROCR)

	# Predict on the test rows and save the predictions, keyed by Id, as the
	# submission file
	predictions <- genPred(model.tree, newdata = testProcessed, method = 'decisionTree')
	submission <- data.frame(
	  Id = test$Id,
	  Prediction = predictions
	)
	write.csv(submission, file = "team_submission.csv", row.names = FALSE)