pdparker/GBMhiggs

## GBMhiggs
#Custom functions at end of file
setwd("/Users/phparker/Dropbox/kaggle/")
set.seed(100)
library(caret);library(glmnet);library(e1071); library(ROCR)
train = read.csv("Higgs/training.csv")
train.y = train[,32:33]
test = read.csv("Higgs/test.csv")
#Will create hold out set when comparing models or now it is fine to use whole set
#inTrain <- createDataPartition(train$Label, p = 1/2, list = FALSE)
#Re-label missing
train[train==-999] <- NA
test[test==-999] <- NA
#Maybe use neural net or bagged tree to impute missing values.
#Should be quicker and less fiddly than bayes regression models
#Eventually will ensemble models
#Forum notes that given many of the features are coordinate several are redundent.
#However with tree based methods this should not mater
#I have choosen note to use weights. I am not a fan in general and I am not sure how these were developed.

#--------------------------------------------------
# Train Model - Gradient Boosting Machine
library(gbm)
gbmGrid <-  expand.grid(interaction.depth = 1:5,
                        n.trees = (20:30)*50,
                        shrinkage = c(0.1)
)

myControl = trainControl(method='cv',number=5,repeats=2,returnResamp='none', verboseIter=TRUE)
gbm = train(f1, data=train, method = 'gbm', trControl=myControl, verbose=TRUE,
            tuneGrid= gbmGrid)
gbm
plot(gbm)
#------------------------------------------
#Implement tuned model
train$Label <- as.numeric(train$Label)-1
gbmModel = gbm(Label~.-Weight, data=train,
               cv.folds = 5,                   # 5 fold cross validation
               #weights=train$Weight,           # Set observation weights
               interaction.depth=5,
               n.trees=1150,
               shrinkage = .1,
               # Set interaction depth
               verbose=TRUE)                   # use verbose response to see model training
plot(gbmModel)
# Get predictions and determine cutoff threshold using pROC
gbmPrediction = predict(gbmModel, newdata=train, n.trees=gbmModel$n.trees, type="response")
#Get AUC alues
auc = roc(train$Label, gbmPrediction);auc
plot(auc, print.thres=TRUE)

# Threshold to set results
#I think I need to run cross validation to get the right threshold given that my AUC is quite high.
#I can get my AMS to climb quite high using different threshold values
threshold = 0.5
#Confusion Matrix
table(train$Label,gbmPrediction>=threshold)
#Predict and get AMS score
predicted=rep("b",250000)
predicted[gbmPrediction>=threshold]="s"
AMS(pred=predicted,real=train.y$Label,weight=train$Weight)
# Make predictins on test set and create submission file
gbmTestPrediction = predict(gbmModel, newdata=test, n.trees=gbmModel$n.trees, type="response")
#Create submission sheet
predicted=rep("b",550000)
predicted[gbmTestPrediction>=threshold]="s"
weightRank = rank(gbmTestPrediction,ties.method= "random")
submission = data.frame(EventId = test$EventId, RankOrder = weightRank, Class = predicted)
write.csv(submission, "submission.csv", row.names=FALSE)


#################

## Function to calculate AMS from predictions
# Modified from TomHall's code to use s and b instead of 0 and 1
# https://www.kaggle.com/c/higgs-boson/forums/t/8216/r-code-for-ams-metric

AMS = function(pred,real,weight)
{
  #a = table(pred,real)
  pred_s_ind = which(pred=="s")                          # Index of s in prediction
  real_s_ind = which(real=="s")                          # Index of s in actual
  real_b_ind = which(real=="b")                          # Index of b in actual
  s = sum(weight[intersect(pred_s_ind,real_s_ind)])      # True positive rate
  b = sum(weight[intersect(pred_s_ind,real_b_ind)])      # False positive rate

  b_tau = 10                                             # Regulator weight
  ans = sqrt(2*((s+b+b_tau)*log(1+s/(b+b_tau))-s))
  return(ans)
}


#Custom Functions or Parallel computing
glmnetFuncs <- caretFuncs #Default caret functions
glmnetFuncs$summary <-  twoClassSummary
glmnetFuncs$rank <- function (object, x, y) {
  vimp <- sort(object$finalModel$beta[, 1])
  vimp <- as.data.frame(vimp)
  vimp$var <- row.names(vimp)
  vimp$'Overall' <- seq(nrow(vimp),1)
  vimp
}
MyRFEcontrol <- rfeControl(
  functions = treebagFuncs,
  method = "boot",
  number = 5,
  rerank = FALSE,
  returnResamp = "final",
  saveDetails = FALSE,
  verbose = TRUE)
MyTrainControl=trainControl(
  method = "boot",
  number=5,
  returnResamp = "all",
  classProbs = TRUE,
  #summaryFunction=twoClassSummary
)
if ( require("multicore", quietly = TRUE, warn.conflicts = FALSE) ) {
  MyRFEcontrol$workers <- multicore:::detectCores()
  MyRFEcontrol$computeFunction <- mclapply
  MyRFEcontrol$computeArgs <- list(mc.preschedule = FALSE, mc.set.seed = FALSE)

  MyTrainControl$workers <- multicore:::detectCores()
  MyTrainControl$computeFunction <- mclapply
  MyTrainControl$computeArgs <- list(mc.preschedule = FALSE, mc.set.seed = FALSE)
}
	#Custom functions at end of file
	setwd("/Users/phparker/Dropbox/kaggle/")
	set.seed(100)
	library(caret);library(glmnet);library(e1071); library(ROCR)
	train = read.csv("Higgs/training.csv")
	train.y = train[,32:33]
	test = read.csv("Higgs/test.csv")
	#Will create hold out set when comparing models or now it is fine to use whole set
	#inTrain <- createDataPartition(train$Label, p = 1/2, list = FALSE)
	#Re-label missing
	train[train==-999] <- NA
	test[test==-999] <- NA
	#Maybe use neural net or bagged tree to impute missing values.
	#Should be quicker and less fiddly than bayes regression models
	#Eventually will ensemble models
	#Forum notes that given many of the features are coordinate several are redundent.
	#However with tree based methods this should not mater
	#I have choosen note to use weights. I am not a fan in general and I am not sure how these were developed.

	#--------------------------------------------------
	# Train Model - Gradient Boosting Machine
	library(gbm)
	gbmGrid <- expand.grid(interaction.depth = 1:5,
	n.trees = (20:30)*50,
	shrinkage = c(0.1)
	)

	myControl = trainControl(method='cv',number=5,repeats=2,returnResamp='none', verboseIter=TRUE)
	gbm = train(f1, data=train, method = 'gbm', trControl=myControl, verbose=TRUE,
	tuneGrid= gbmGrid)
	gbm
	plot(gbm)
	#------------------------------------------
	#Implement tuned model
	train$Label <- as.numeric(train$Label)-1
	gbmModel = gbm(Label~.-Weight, data=train,
	cv.folds = 5, # 5 fold cross validation
	#weights=train$Weight, # Set observation weights
	interaction.depth=5,
	n.trees=1150,
	shrinkage = .1,
	# Set interaction depth
	verbose=TRUE) # use verbose response to see model training
	plot(gbmModel)
	# Get predictions and determine cutoff threshold using pROC
	gbmPrediction = predict(gbmModel, newdata=train, n.trees=gbmModel$n.trees, type="response")
	#Get AUC alues
	auc = roc(train$Label, gbmPrediction);auc
	plot(auc, print.thres=TRUE)

	# Threshold to set results
	#I think I need to run cross validation to get the right threshold given that my AUC is quite high.
	#I can get my AMS to climb quite high using different threshold values
	threshold = 0.5
	#Confusion Matrix
	table(train$Label,gbmPrediction>=threshold)
	#Predict and get AMS score
	predicted=rep("b",250000)
	predicted[gbmPrediction>=threshold]="s"
	AMS(pred=predicted,real=train.y$Label,weight=train$Weight)
	# Make predictins on test set and create submission file
	gbmTestPrediction = predict(gbmModel, newdata=test, n.trees=gbmModel$n.trees, type="response")
	#Create submission sheet
	predicted=rep("b",550000)
	predicted[gbmTestPrediction>=threshold]="s"
	weightRank = rank(gbmTestPrediction,ties.method= "random")
	submission = data.frame(EventId = test$EventId, RankOrder = weightRank, Class = predicted)
	write.csv(submission, "submission.csv", row.names=FALSE)




	#################

	## Function to calculate AMS from predictions
	# Modified from TomHall's code to use s and b instead of 0 and 1
	# https://www.kaggle.com/c/higgs-boson/forums/t/8216/r-code-for-ams-metric

	AMS = function(pred,real,weight)
	{
	#a = table(pred,real)
	pred_s_ind = which(pred=="s") # Index of s in prediction
	real_s_ind = which(real=="s") # Index of s in actual
	real_b_ind = which(real=="b") # Index of b in actual
	s = sum(weight[intersect(pred_s_ind,real_s_ind)]) # True positive rate
	b = sum(weight[intersect(pred_s_ind,real_b_ind)]) # False positive rate

	b_tau = 10 # Regulator weight
	ans = sqrt(2((s+b+b_tau)log(1+s/(b+b_tau))-s))
	return(ans)
	}



	#Custom Functions or Parallel computing
	glmnetFuncs <- caretFuncs #Default caret functions
	glmnetFuncs$summary <- twoClassSummary
	glmnetFuncs$rank <- function (object, x, y) {
	vimp <- sort(object$finalModel$beta[, 1])
	vimp <- as.data.frame(vimp)
	vimp$var <- row.names(vimp)
	vimp$'Overall' <- seq(nrow(vimp),1)
	vimp
	}
	MyRFEcontrol <- rfeControl(
	functions = treebagFuncs,
	method = "boot",
	number = 5,
	rerank = FALSE,
	returnResamp = "final",
	saveDetails = FALSE,
	verbose = TRUE)
	MyTrainControl=trainControl(
	method = "boot",
	number=5,
	returnResamp = "all",
	classProbs = TRUE,
	#summaryFunction=twoClassSummary
	)
	if ( require("multicore", quietly = TRUE, warn.conflicts = FALSE) ) {
	MyRFEcontrol$workers <- multicore:::detectCores()
	MyRFEcontrol$computeFunction <- mclapply
	MyRFEcontrol$computeArgs <- list(mc.preschedule = FALSE, mc.set.seed = FALSE)

	MyTrainControl$workers <- multicore:::detectCores()
	MyTrainControl$computeFunction <- mclapply
	MyTrainControl$computeArgs <- list(mc.preschedule = FALSE, mc.set.seed = FALSE)
	}