Skip to content

Instantly share code, notes, and snippets.

View dkhurana1306's full-sized avatar

Deepak Khurana dkhurana1306

  • NYC Data Science Academy
  • New York
View GitHub Profile
#############################################################
# Fit a boosted model (bst_0) on data_higgs_0_cleaned_scaled, which is defined
# outside this snippet.
model_data = data_higgs_0_cleaned_scaled
# Integer codes of Label shifted down by one.
# NOTE(review): presumably Label is a two-level factor, so the codes 1/2
# become 0/1 — confirm against the data preparation step.
label = as.integer(model_data$Label) -1
# Positions of the last three columns; they are excluded from the feature
# matrix below via negative indexing.
drop = c(ncol(model_data),ncol(model_data)-1,ncol(model_data)-2)
# NOTE(review): this xgboost() call is not closed in the visible source; any
# remaining arguments and the closing parenthesis are not shown here.
bst_0 <- xgboost(data = data.matrix(model_data[,-drop]), label = label,
max.depth =9,
eta = 0.01,
nround = 624,
nthread = 4,
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# 5-fold cross-validation of the boosted-tree settings, scored by AUC.
nround <- 1500
param <- list(
  max_depth = 9,
  eta = 0.01,
  silent = 1,
  nthread = 4,
  objective = 'binary:logistic'
)

# The training frame is produced outside this snippet.
model_data <- data_higgs_0_cleaned_scaled

# Integer codes of Label shifted down by one.
label <- as.integer(model_data$Label) - 1

# Positions of the last three columns, which are left out of the features.
drop <- ncol(model_data) - c(0, 1, 2)

dtrain <- xgb.DMatrix(data.matrix(model_data[, -drop]), label = label)
cv <- xgb.cv(param, dtrain, nround, nfold = 5, metrics = 'auc')
# Load the training and submission CSVs, turn the -999.0 sentinel into NA,
# and add a higgs_defined flag to both frames.
path_to_data_file <- '/Users/dk1306/downloads/'

# header = TRUE rather than T: T is an ordinary variable that can be reassigned.
data <- read.csv(paste0(path_to_data_file, 'training.csv'), header = TRUE)
data.submission <- read.csv(paste0(path_to_data_file, 'test.csv'), header = TRUE)

# Every cell equal to -999.0 is replaced by NA.
data[data == -999.0] <- NA
data.submission[data.submission == -999.0] <- NA

# higgs_defined is 0 where DER_mass_MMC is NA and 1 everywhere else.
data$higgs_defined <- as.numeric(!is.na(data$DER_mass_MMC))

# Both values are assigned for the submission frame as well, so rows with a
# defined mass are 1 rather than NA.
data.submission$higgs_defined <-
  as.numeric(!is.na(data.submission$DER_mass_MMC))
# Partition the training and submission frames into four subsets each:
# rows where higgs_defined is 0, and rows where higgs_defined is 1 split by
# num_jet equal to 0, 1 or 2. Both columns must already exist on the frames.

# Training data.
data_higgs_undefined <- data[data$higgs_defined == 0, ]
data_higgs_0 <- data[data$num_jet == 0 & data$higgs_defined == 1, ]
data_higgs_1 <- data[data$num_jet == 1 & data$higgs_defined == 1, ]
data_higgs_2_3 <- data[data$num_jet == 2 & data$higgs_defined == 1, ]

# Submission data, split with the same conditions.
data.submission_higgs_undefined <-
  data.submission[data.submission$higgs_defined == 0, ]
data.submission_higgs_0 <- data.submission[
  data.submission$num_jet == 0 & data.submission$higgs_defined == 1,
]
data.submission_higgs_1 <- data.submission[
  data.submission$num_jet == 1 & data.submission$higgs_defined == 1,
]
data.submission_higgs_2_3 <- data.submission[
  data.submission$num_jet == 2 & data.submission$higgs_defined == 1,
]
# Character vector of column names, beginning with the jet-pair and
# lepton-centrality features.
# NOTE(review): presumably the columns to drop for the 0-jet subset — confirm.
# The c(...) literal is cut off in the visible source; the remaining names and
# the closing parentheses are not shown here.
drop_columns_0 = as.character(c("DER_deltaeta_jet_jet","DER_mass_jet_jet","DER_prodeta_jet_jet","DER_lep_eta_centrality",
# Fraction of signal rows (Label == 's') for which the Higgs mass is undefined.
length(data$Label[data$Label == 's' & data$higgs_defined == 0]) /
  length(data$Label[data$Label == 's'])
# [1] 0.03309326

# Weight distribution of signal rows whose Higgs mass is undefined.
summary(data$Weight[data$Label == 's' & data$higgs_defined == 0])
#     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.
# 0.001502 0.001502 0.001503 0.003117 0.002653 0.018640

# Weight distribution of signal rows whose Higgs mass is defined.
summary(data$Weight[data$Label == 's' & data$higgs_defined == 1])
#     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.
# 0.001502 0.001503 0.001503 0.008247 0.018640 0.018640
# Derive two helper columns on both frames:
#   higgs_defined - 0 where DER_mass_MMC is NA, 1 otherwise
#   num_jet       - PRI_jet_num with the value 3 replaced by 2

# Training data.
data$higgs_defined <- as.numeric(!is.na(data$DER_mass_MMC))
data$num_jet <- replace(data$PRI_jet_num, data$PRI_jet_num == 3, 2)

# Submission data.
data.submission$higgs_defined <-
  as.numeric(!is.na(data.submission$DER_mass_MMC))
data.submission$num_jet <- replace(
  data.submission$PRI_jet_num,
  data.submission$PRI_jet_num == 3,
  2
)
@dkhurana1306
dkhurana1306 / Names
Last active September 4, 2016 22:08
# Names of the training columns that contain at least one NA.
names(which(colSums(is.na(data)) != 0))
# [1] "DER_mass_MMC" "DER_deltaeta_jet_jet" "DER_mass_jet_jet" "DER_prodeta_jet_jet"
# [5] "DER_lep_eta_centrality" "PRI_jet_leading_pt" "PRI_jet_leading_eta" "PRI_jet_leading_phi"
# [9] "PRI_jet_subleading_pt" "PRI_jet_subleading_eta" "PRI_jet_subleading_phi"

# Same check for the submission columns.
names(which(colSums(is.na(data.submission)) != 0))
# [1] "DER_mass_MMC" "DER_deltaeta_jet_jet" "DER_mass_jet_jet" "DER_prodeta_jet_jet"
# [5] "DER_lep_eta_centrality" "PRI_jet_leading_pt" "PRI_jet_leading_eta" "PRI_jet_leading_phi"
# [9] "PRI_jet_subleading_pt" "PRI_jet_subleading_eta" "PRI_jet_subleading_phi"
# Load the training and submission CSVs, turn the -999.0 sentinel into NA,
# and display the missingness pattern of each frame.
path_to_data_file <- '/Users/dk1306/downloads/'

# header = TRUE rather than T: T is an ordinary variable that can be reassigned.
data <- read.csv(paste0(path_to_data_file, 'training.csv'), header = TRUE)
data.submission <- read.csv(paste0(path_to_data_file, 'test.csv'), header = TRUE)

# Every cell equal to -999.0 is replaced by NA.
data[data == -999.0] <- NA
data.submission[data.submission == -999.0] <- NA

# aggr() is not defined in this snippet.
# NOTE(review): presumably VIM::aggr — confirm the package is loaded elsewhere.
aggr(data) # Display missingness in training data
aggr(data.submission) # Display missingness in submission data