Last active
June 5, 2018 23:47
-
-
Save xxyjoel/45e1622161a43b0d1a87ac2360f6cad3 to your computer and use it in GitHub Desktop.
Initial XGB model outline for Kaggle competition
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Load required packages.
# library() errors immediately if a package is missing, whereas
# require() only returns FALSE and lets the script limp on — so use
# library() for hard dependencies.
# FIX: removed `require(rlanf)`, a typo-duplicate of rlang (no such
# package exists; require() was silently swallowing the failure).
# NOTE: tidyverse already attaches tibble and tidyr; the explicit
# loads are kept for clarity.
library(rlang)
library(tibble)
library(tidyr)
library(tidyverse)
library(xgboost)
###############################
##       XGB OVERVIEW        ##
###############################
# 1. data format: all numeric or integer
# 2. data format: NO NAs

# Build the input path; file.path() inserts the separator portably,
# replacing the manual paste(..., sep = "") concatenation.
file_path <- "E:/personal/kaggle/home load default comp/data/application_train"
file_name <- "application_train.csv"
file <- file.path(file_path, file_name)

# Import main data file(s).
data <- read.csv(file)

# Convert every column to numeric (xgboost requires numeric input) and
# drop the exported row-index column, if present.
# NOTE(review): as.numeric() on character columns yields NA with a
# warning (and on factors yields level codes, not values) — confirm the
# raw CSV columns are all numeric-coded.
data_alt <- as.data.frame(sapply(data, as.numeric))
data_alt$X <- NULL

# Make ID / TARGET column names generic; change back to the original
# names if further data modelling is needed.
colnames(data_alt)[1] <- "ID"
colnames(data_alt)[2] <- "TARGET"
# Reproducible 75/25 train/test split by row index.
train_size_parameter <- 0.75
set.seed(123)
n_rows <- nrow(data_alt)
train_idx <- sample(seq_len(n_rows), size = floor(train_size_parameter * n_rows))
train_data <- data_alt[train_idx, ]
test_data <- data_alt[-train_idx, ]

# Impute every missing value as 0 before modelling.
# NOTE(review): xgboost can actually handle NAs natively; zero-imputation
# is a simplification and is kept here to preserve existing behaviour.
train_data[is.na(train_data)] <- 0
test_data[is.na(test_data)] <- 0

# Keep the test-set IDs for the final results table, then drop the ID
# column from both sets (it is not a model feature).
id <- test_data$ID
test_data$ID <- NULL
train_data$ID <- NULL
# Split the TARGET label out of both sets so only features remain.
target <- train_data$TARGET
train_data$TARGET <- NULL
target_test <- test_data$TARGET
test_data$TARGET <- NULL

# Class labels observed in the training target (used to name the
# prediction probability columns later).
class_names <- unique(target)

# Class count for binary classification.
# NOTE(review): max(target) + 1 assumes labels are coded 0..K-1.
unique_class_qty <- max(target) + 1

# xgboost expects matrix input, not data frames.
train_mat <- data.matrix(train_data)
test_mat <- data.matrix(test_data)
# Cross-validation and model parameters.
# Only one model parameter is tuned here: the number of trees (rounds).
param <- list("objective" = "binary:logistic",
              "eval_metric" = "auc")
cv.round <- 100  # max number of trees to build (the tuned parameter)
cv.nfold <- 3    # number of folds to divide the train data into for CV

# Run k-fold cross-validation on the training data.
# FIX: xgb.cv's argument is `params`; `param =` only worked through
# partial argument matching — spelled out explicitly. Also replaced
# top-level `=` assignment with `<-`.
bst.cv <- xgb.cv(params = param,
                 data = train_mat,
                 label = target,
                 nfold = cv.nfold,
                 nrounds = cv.round)
# Plot the per-round CV evaluation metric.
# (If the eval_metric parameter changes, update the evaluation_log
# column name below to match.)
eval_metric <- bst.cv$evaluation_log$test_auc_mean
plot(eval_metric, type = "l")

# Optimal round = the round with the maximum mean test AUC.
# FIX: which(max(x) == x) can return SEVERAL indices when the maximum
# is tied, producing a vector nrounds and breaking xgboost();
# which.max() always returns the single first maximising index.
# (For a minimised metric such as logloss, switch to which.min().)
opt_nround <- which.max(eval_metric)

# Train the final model on the full training set.
bst <- xgboost(data = train_mat,
               label = target,
               params = param,
               nrounds = opt_nround)
# Predict on the held-out set. With objective 'binary:logistic',
# predict() returns ONE probability per row: P(TARGET == 1).
ypred <- predict(bst, test_mat)

# FIX: the original reshaped ypred into length(class_names) columns,
# but the binary objective yields a single column — that halved the
# row count and misaligned predictions with `id`. Build the two-class
# probability table explicitly instead.
pred_mat <- data.frame(1 - ypred, ypred)
colnames(pred_mat) <- sort(class_names)  # columns: P(class 0), P(class 1)
res <- data.frame(id, pred_mat)

# Plot importance of the top 50 variables.
mat <- xgb.importance(feature_names = colnames(train_mat), model = bst)
xgb.plot.importance(importance_matrix = mat[1:50])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment