Last active
September 19, 2016 13:17
-
-
Save Yankim/8fa63a043aff544073dba8a88e78e597 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Load modelling and data-wrangling packages.
# NOTE: plyr is attached before dplyr on purpose, so that dplyr's verbs mask
# plyr's rather than the other way round.
library(xgboost)
library(methods)
library(pROC)
library(caret)
library(readr)
library(plyr)
library(dplyr)
library(tidyr)
library(dummies)
library(doMC)

# Register a 4-core parallel backend.
registerDoMC(cores = 4)

# Read in the data
# higgs.___.full is raw data
higgs.train.full <- read.csv("./data/training.csv", header = TRUE)
higgs.test.full <- read.csv("./data/test.csv", header = TRUE)

# Keep the test-set event ids before the id column is dropped below.
higgs.testId <- higgs.test.full$EventId

#############################################
########### DATA MUNGING ###################
##########################################

# higgs.__ will be what is analyzed
higgs.train <- higgs.train.full
higgs.test <- higgs.test.full

# Transform PRI_jet_num into a factor, as instructed
higgs.train$PRI_jet_num <- as.factor(higgs.train$PRI_jet_num)
higgs.test$PRI_jet_num <- as.factor(higgs.test$PRI_jet_num)

# higgs.weight is the weight of the training data
higgs.weight <- higgs.train$Weight

# We make labels of the outcomes.
# The make.names is because the "train" function requires the factors to have
# names that are valid variable names (unlike 0,1 or True, False).
# Result is a character vector: "X1" where Label == "s", "X0" otherwise.
higgs.labels <- make.names(as.factor(as.numeric(higgs.train$Label == "s")))

# Scale the weight according to the length of the data.
scaled.weight <- higgs.weight * nrow(higgs.test) / length(higgs.labels)

# Remove the ID, Weight, and Outcome columns.
# Columns are dropped by name rather than by position so that a change in the
# CSV column order cannot silently remove the wrong columns.
higgs.train <- higgs.train[
  , !(names(higgs.train) %in% c("EventId", "Weight", "Label")),
  drop = FALSE
]
higgs.test <- higgs.test[, names(higgs.test) != "EventId", drop = FALSE]

# Create a dummy variable for the "PRI_jet_num" variable
higgs.train.dummy <- dummy.data.frame(higgs.train, names = "PRI_jet_num")
higgs.test.dummy <- dummy.data.frame(higgs.test, names = "PRI_jet_num")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment