# GBM initial submission for the Kaggle Higgs ML challenge
#Custom functions at end of file
setwd("/Users/phparker/Dropbox/kaggle/")
set.seed(100)
library(caret); library(glmnet); library(e1071); library(pROC) # pROC provides roc() used below
train = read.csv("Higgs/training.csv")
train.y = train[,32:33] # keep the Weight and Label columns for scoring
test = read.csv("Higgs/test.csv")
#Will create a hold-out set when comparing models; for now it is fine to use the whole set
#inTrain <- createDataPartition(train$Label, p = 1/2, list = FALSE)
#Re-label missing
train[train==-999] <- NA
test[test==-999] <- NA
#Maybe use a neural net or bagged tree to impute missing values (sketch below).
#Should be quicker and less fiddly than Bayesian regression models
#Eventually will ensemble models
#Forum notes that since many of the features are coordinates, several are redundant.
#However, with tree-based methods this should not matter
#I have chosen not to use weights. I am not a fan in general and I am not sure how these were developed.
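#--- Sketch of the bagged-tree imputation idea above (an assumption, left
#--- commented out: caret's bagImpute is slow on 250k rows).
#featureCols <- setdiff(names(train), c("EventId", "Weight", "Label"))
#imputer <- preProcess(train[, featureCols], method = "bagImpute")
#train[, featureCols] <- predict(imputer, train[, featureCols])
#test[, featureCols] <- predict(imputer, test[, featureCols])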
#--------------------------------------------------
# Train Model - Gradient Boosting Machine
library(gbm)
gbmGrid <- expand.grid(interaction.depth = 1:5,
                       n.trees = (20:30)*50,
                       shrinkage = 0.1,
                       n.minobsinnode = 10) # required by newer caret; gbm's default
myControl = trainControl(method='repeatedcv', number=5, repeats=2,
                         returnResamp='none', verboseIter=TRUE)
# f1 was undefined in the original; assume the model formula excludes the
# bookkeeping columns. na.action=na.pass lets gbm handle the NAs natively.
f1 <- Label ~ . - EventId - Weight
gbmFit = train(f1, data=train, method='gbm', trControl=myControl, verbose=TRUE,
               tuneGrid=gbmGrid, na.action=na.pass)
gbmFit
plot(gbmFit)
#------------------------------------------
#Implement tuned model
train$Label <- as.numeric(train$Label)-1 # recode factor ("b","s") to 0/1 for gbm's bernoulli loss
gbmModel = gbm(Label ~ . - EventId - Weight, data=train,
               distribution = "bernoulli", # binary 0/1 outcome
               cv.folds = 5,               # 5-fold cross-validation
               #weights = train$Weight,    # optionally set observation weights
               interaction.depth = 5,      # tuned values from the caret run above
               n.trees = 1150,
               shrinkage = 0.1,
               verbose = TRUE)             # print training progress
plot(gbmModel) # marginal effect of the first predictor
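# gbm.perf() reports the CV-optimal number of trees; predicting with it rather
# than all 1150 is a standard gbm idiom (shown here as an optional sketch):
bestIter <- gbm.perf(gbmModel, method = "cv") # also plots train vs CV deviance
#gbmPrediction = predict(gbmModel, newdata=train, n.trees=bestIter, type="response")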
# Get predictions and determine cutoff threshold using pROC
gbmPrediction = predict(gbmModel, newdata=train, n.trees=gbmModel$n.trees, type="response")
#Get AUC values
auc = roc(train$Label, gbmPrediction);auc
plot(auc, print.thres=TRUE)
# Threshold for turning predicted probabilities into class labels
#I think I need to run cross-validation to get the right threshold given that my AUC is quite high.
#I can get my AMS to climb quite high using different threshold values; a scan sketch follows.
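# A minimal sketch of that scan (assumes AMS() from the end of this file has
# been sourced): evaluate the training AMS over a grid of cutoffs.
#thresholds <- seq(0.1, 0.9, by = 0.02)
#amsByThreshold <- sapply(thresholds, function(t) {
#  AMS(pred = ifelse(gbmPrediction >= t, "s", "b"),
#      real = train.y$Label, weight = train$Weight)
#})
#plot(thresholds, amsByThreshold, type = "l")
#threshold <- thresholds[which.max(amsByThreshold)]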
threshold = 0.5
#Confusion Matrix
table(train$Label,gbmPrediction>=threshold)
#Predict and get AMS score
predicted=rep("b",nrow(train)) # 250,000 training events
predicted[gbmPrediction>=threshold]="s"
AMS(pred=predicted,real=train.y$Label,weight=train$Weight)
# Make predictions on test set and create submission file
gbmTestPrediction = predict(gbmModel, newdata=test, n.trees=gbmModel$n.trees, type="response")
#Create submission sheet
predicted=rep("b",nrow(test)) # 550,000 test events
predicted[gbmTestPrediction>=threshold]="s"
weightRank = rank(gbmTestPrediction,ties.method= "random")
submission = data.frame(EventId = test$EventId, RankOrder = weightRank, Class = predicted)
write.csv(submission, "submission.csv", row.names=FALSE)
#################
## Function to calculate AMS from predictions
# Modified from TomHall's code to use s and b instead of 0 and 1
# https://www.kaggle.com/c/higgs-boson/forums/t/8216/r-code-for-ams-metric
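# AMS = sqrt( 2 * ( (s + b + b_reg) * log(1 + s/(b + b_reg)) - s ) ),
# where s and b are the weighted true/false positive sums and b_reg = 10.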
AMS = function(pred, real, weight)
{
  #a = table(pred,real)
  pred_s_ind = which(pred=="s") # Index of s in prediction
  real_s_ind = which(real=="s") # Index of s in actual
  real_b_ind = which(real=="b") # Index of b in actual
  s = sum(weight[intersect(pred_s_ind,real_s_ind)]) # weighted true-positive sum
  b = sum(weight[intersect(pred_s_ind,real_b_ind)]) # weighted false-positive sum
  b_tau = 10 # regularization term (b_reg in the challenge definition)
  ans = sqrt(2*((s+b+b_tau)*log(1+s/(b+b_tau))-s))
  return(ans)
}
#Custom Functions for Parallel Computing
glmnetFuncs <- caretFuncs #Default caret functions
glmnetFuncs$summary <- twoClassSummary
glmnetFuncs$rank <- function (object, x, y) {
  vimp <- sort(object$finalModel$beta[, 1])
  vimp <- as.data.frame(vimp)
  vimp$var <- row.names(vimp)
  vimp$'Overall' <- seq(nrow(vimp), 1)
  vimp
}
MyRFEcontrol <- rfeControl(
  functions = treebagFuncs,
  method = "boot",
  number = 5,
  rerank = FALSE,
  returnResamp = "final",
  saveDetails = FALSE,
  verbose = TRUE)
MyTrainControl = trainControl(
  method = "boot",
  number = 5,
  returnResamp = "all",
  classProbs = TRUE
  #summaryFunction = twoClassSummary
)
if ( require("multicore", quietly = TRUE, warn.conflicts = FALSE) ) {
  MyRFEcontrol$workers <- multicore:::detectCores()
  MyRFEcontrol$computeFunction <- mclapply
  MyRFEcontrol$computeArgs <- list(mc.preschedule = FALSE, mc.set.seed = FALSE)
  MyTrainControl$workers <- multicore:::detectCores()
  MyTrainControl$computeFunction <- mclapply
  MyTrainControl$computeArgs <- list(mc.preschedule = FALSE, mc.set.seed = FALSE)
}
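# The workers/computeFunction/computeArgs fields above belong to an older caret
# API. On current caret, the supported route is a registered foreach backend,
# e.g. (a sketch assuming the doParallel package is installed):
#library(doParallel)
#registerDoParallel(cores = parallel::detectCores())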