Created
July 20, 2014 23:51
-
-
Save pdparker/c34981fa614af146abbd to your computer and use it in GitHub Desktop.
GBM initial submission For HiggML challenge
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#Custom functions at end of file | |
setwd("/Users/phparker/Dropbox/kaggle/") | |
set.seed(100) | |
library(caret);library(glmnet);library(e1071); library(ROCR) | |
train = read.csv("Higgs/training.csv") | |
train.y = train[,32:33] | |
test = read.csv("Higgs/test.csv") | |
#Will create hold out set when comparing models or now it is fine to use whole set | |
#inTrain <- createDataPartition(train$Label, p = 1/2, list = FALSE) | |
#Re-label missing | |
train[train==-999] <- NA | |
test[test==-999] <- NA | |
#Maybe use neural net or bagged tree to impute missing values. | |
#Should be quicker and less fiddly than bayes regression models | |
#Eventually will ensemble models | |
#Forum notes that given many of the features are coordinate several are redundent. | |
#However with tree based methods this should not mater | |
#I have choosen note to use weights. I am not a fan in general and I am not sure how these were developed. | |
#-------------------------------------------------- | |
# Train Model - Gradient Boosting Machine | |
library(gbm) | |
gbmGrid <- expand.grid(interaction.depth = 1:5, | |
n.trees = (20:30)*50, | |
shrinkage = c(0.1) | |
) | |
myControl = trainControl(method='cv',number=5,repeats=2,returnResamp='none', verboseIter=TRUE) | |
gbm = train(f1, data=train, method = 'gbm', trControl=myControl, verbose=TRUE, | |
tuneGrid= gbmGrid) | |
gbm | |
plot(gbm) | |
#------------------------------------------ | |
#Implement tuned model | |
train$Label <- as.numeric(train$Label)-1 | |
gbmModel = gbm(Label~.-Weight, data=train, | |
cv.folds = 5, # 5 fold cross validation | |
#weights=train$Weight, # Set observation weights | |
interaction.depth=5, | |
n.trees=1150, | |
shrinkage = .1, | |
# Set interaction depth | |
verbose=TRUE) # use verbose response to see model training | |
plot(gbmModel) | |
# Get predictions and determine cutoff threshold using pROC | |
gbmPrediction = predict(gbmModel, newdata=train, n.trees=gbmModel$n.trees, type="response") | |
#Get AUC alues | |
auc = roc(train$Label, gbmPrediction);auc | |
plot(auc, print.thres=TRUE) | |
# Threshold to set results | |
#I think I need to run cross validation to get the right threshold given that my AUC is quite high. | |
#I can get my AMS to climb quite high using different threshold values | |
threshold = 0.5 | |
#Confusion Matrix | |
table(train$Label,gbmPrediction>=threshold) | |
#Predict and get AMS score | |
predicted=rep("b",250000) | |
predicted[gbmPrediction>=threshold]="s" | |
AMS(pred=predicted,real=train.y$Label,weight=train$Weight) | |
# Make predictins on test set and create submission file | |
gbmTestPrediction = predict(gbmModel, newdata=test, n.trees=gbmModel$n.trees, type="response") | |
#Create submission sheet | |
predicted=rep("b",550000) | |
predicted[gbmTestPrediction>=threshold]="s" | |
weightRank = rank(gbmTestPrediction,ties.method= "random") | |
submission = data.frame(EventId = test$EventId, RankOrder = weightRank, Class = predicted) | |
write.csv(submission, "submission.csv", row.names=FALSE) | |
################# | |
## Function to calculate AMS from predictions | |
# Modified from TomHall's code to use s and b instead of 0 and 1 | |
# https://www.kaggle.com/c/higgs-boson/forums/t/8216/r-code-for-ams-metric | |
AMS = function(pred,real,weight) | |
{ | |
#a = table(pred,real) | |
pred_s_ind = which(pred=="s") # Index of s in prediction | |
real_s_ind = which(real=="s") # Index of s in actual | |
real_b_ind = which(real=="b") # Index of b in actual | |
s = sum(weight[intersect(pred_s_ind,real_s_ind)]) # True positive rate | |
b = sum(weight[intersect(pred_s_ind,real_b_ind)]) # False positive rate | |
b_tau = 10 # Regulator weight | |
ans = sqrt(2*((s+b+b_tau)*log(1+s/(b+b_tau))-s)) | |
return(ans) | |
} | |
#Custom Functions or Parallel computing | |
glmnetFuncs <- caretFuncs #Default caret functions | |
glmnetFuncs$summary <- twoClassSummary | |
glmnetFuncs$rank <- function (object, x, y) { | |
vimp <- sort(object$finalModel$beta[, 1]) | |
vimp <- as.data.frame(vimp) | |
vimp$var <- row.names(vimp) | |
vimp$'Overall' <- seq(nrow(vimp),1) | |
vimp | |
} | |
MyRFEcontrol <- rfeControl( | |
functions = treebagFuncs, | |
method = "boot", | |
number = 5, | |
rerank = FALSE, | |
returnResamp = "final", | |
saveDetails = FALSE, | |
verbose = TRUE) | |
MyTrainControl=trainControl( | |
method = "boot", | |
number=5, | |
returnResamp = "all", | |
classProbs = TRUE, | |
#summaryFunction=twoClassSummary | |
) | |
if ( require("multicore", quietly = TRUE, warn.conflicts = FALSE) ) { | |
MyRFEcontrol$workers <- multicore:::detectCores() | |
MyRFEcontrol$computeFunction <- mclapply | |
MyRFEcontrol$computeArgs <- list(mc.preschedule = FALSE, mc.set.seed = FALSE) | |
MyTrainControl$workers <- multicore:::detectCores() | |
MyTrainControl$computeFunction <- mclapply | |
MyTrainControl$computeArgs <- list(mc.preschedule = FALSE, mc.set.seed = FALSE) | |
} | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment