Skip to content

Instantly share code, notes, and snippets.

@Yankim
Last active September 19, 2016 13:17
Show Gist options
  • Save Yankim/8fa63a043aff544073dba8a88e78e597 to your computer and use it in GitHub Desktop.
Save Yankim/8fa63a043aff544073dba8a88e78e597 to your computer and use it in GitHub Desktop.
library(xgboost); library(methods); library(pROC); library(caret); library(xgboost); library(readr); library(plyr); library(dplyr)
library(tidyr); library(dummies); library(doMC); registerDoMC(cores = 4)
# Read in the data.
# higgs.___.full holds the raw data exactly as loaded from disk; the working
# copies derived from it are created further down.
# `header = TRUE` is spelled out because `T` is an ordinary variable that can
# be reassigned, whereas `TRUE` is a reserved constant.
higgs.train.full <- read.csv("./data/training.csv", header = TRUE)
higgs.test.full <- read.csv("./data/test.csv", header = TRUE)
# Keep the test-set event IDs aside before the ID column is dropped from the
# working copy. `[[` is used instead of `$` so the column name must match
# exactly (no partial matching).
higgs.testId <- higgs.test.full[["EventId"]]
#############################################
########### DATA MUNGING ###################
##########################################
# higgs.__ (without the .full suffix) is the working copy that is analyzed;
# the raw higgs.__.full frames are left untouched.
higgs.train <- higgs.train.full
higgs.test <- higgs.test.full

# Columns that are not predictors. They are addressed by name rather than by
# position so that a change in the CSV column order cannot silently drop the
# wrong columns.
# NOTE(review): the training ID column is assumed to be called "EventId", the
# same as in the test set -- confirm against training.csv.
higgs.id.col <- "EventId"
higgs.train.drop <- c(higgs.id.col, "Weight", "Label")

# Fail early when an expected column is absent: `$` on a missing column
# returns NULL, which would otherwise propagate as empty weights/labels.
higgs.missing.cols <- setdiff(higgs.train.drop, names(higgs.train))
if (length(higgs.missing.cols) > 0) {
  stop("Training data is missing expected column(s): ",
       paste(higgs.missing.cols, collapse = ", "), call. = FALSE)
}
if (!(higgs.id.col %in% names(higgs.test))) {
  stop("Test data is missing expected column: ", higgs.id.col, call. = FALSE)
}

# Transform PRI_jet_num into a factor, as instructed
higgs.train$PRI_jet_num <- as.factor(higgs.train$PRI_jet_num)
higgs.test$PRI_jet_num <- as.factor(higgs.test$PRI_jet_num)

# higgs.weight is the weight of the training data
higgs.weight <- higgs.train$Weight

# We make labels of the outcomes: Label == 's' is coded as 1, anything else
# as 0. make.names() then turns the codes into syntactically valid names
# because the "train" function requires the outcome levels to be valid
# variable names (unlike 0,1 or True, False). The result is a character vector.
higgs.labels <- make.names(as.factor(as.numeric(higgs.train$Label == "s")))

# Scale the weight according to the length of the data
# (number of test rows divided by number of training labels).
scaled.weight <- higgs.weight * nrow(higgs.test) / length(higgs.labels)

# Remove the ID, Weight, and Outcome columns from the training data and the
# ID column from the test data. List-style indexing always returns a data
# frame, even if only one column were to remain.
higgs.train <- higgs.train[setdiff(names(higgs.train), higgs.train.drop)]
higgs.test <- higgs.test[setdiff(names(higgs.test), higgs.id.col)]

# Create dummy variables for the "PRI_jet_num" variable
higgs.train.dummy <- dummy.data.frame(higgs.train, names = "PRI_jet_num")
higgs.test.dummy <- dummy.data.frame(higgs.test, names = "PRI_jet_num")

# Both sets are encoded independently, so a PRI_jet_num level present in only
# one of them would yield different columns. Surface that instead of letting
# it show up later as a confusing modelling/prediction problem.
if (!identical(names(higgs.train.dummy), names(higgs.test.dummy))) {
  warning("Dummy-encoded training and test data do not have the same columns",
          call. = FALSE)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment