Initial XGB model outline for Kaggle competition
require(rlang)
require(tibble)
require(tidyr)
require(tidyverse)
require(xgboost)
###############################
## XGB OVERVIEW ##
###############################
# 1. data format: all numeric or integer
# 2. data format: NO NAs
file_path <- 'E:/personal/kaggle/home load default comp/data/application_train/'
file_name <- 'application_train.csv'
file <- paste(file_path, file_name, sep = "")
#import main data file(s)
data <- read.csv(file)
# convert to numeric (note: factor columns become their integer codes) & remove index col
data_alt <- as.data.frame(sapply(data, as.numeric))
data_alt$X <- NULL
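# quick sanity check against the two format requirements above (a sketch added here,
# not part of the original outline): every column should now be numeric, and the NA
# count shows how much zero-filling will happen later
stopifnot(all(sapply(data_alt, is.numeric)))
sum(is.na(data_alt))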
# make the ID / TARGET column names generic; change back to the originals if needed downstream
colnames(data_alt)[1] <- "ID"
colnames(data_alt)[2] <- "TARGET"
#set train / test sample size
train_size_parameter <- 0.75
smp_size <- floor(train_size_parameter * nrow(data_alt))
set.seed(123)
train_ind <- sample(seq_len(nrow(data_alt)), size = smp_size)
train_data <- data_alt[train_ind, ]
test_data <- data_alt[-train_ind, ]
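# split sanity check (a small addition, not in the original outline): the two
# partitions should cover every row of data_alt exactly once
stopifnot(nrow(train_data) + nrow(test_data) == nrow(data_alt))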
# set all missing values to 0 (the simplest imputation; see the aside below for xgboost's native NA handling)
train_data[is.na(train_data)] <- 0
test_data[is.na(test_data)] <- 0
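# aside (a hedged alternative, left commented so the zero-fill above is what runs):
# xgboost can treat NAs as missing natively via xgb.DMatrix's `missing` argument,
# which avoids conflating true zeros with absent values
# dtrain <- xgb.DMatrix(data.matrix(train_data), missing = NA)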
#segregate the ID column from test_data & remove ID col from train_data (not needed)
id <- test_data$ID
test_data$ID <- NULL
train_data$ID <- NULL
#segregate target from train & test
target <- train_data$TARGET
train_data$TARGET <- NULL
target_test <- test_data$TARGET
test_data$TARGET <- NULL
# store class names for the predicted probability distribution
class_names <- unique(target)
# number of classes (max label + 1; equals 2 for this binary task)
unique_class_qty <- max(target) + 1
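# worth a quick look at the class balance (a sketch, not in the original outline);
# default-style targets are typically skewed, which is part of why AUC is a sensible
# eval metric below
table(target)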
#convert data to matrix format before passing into xgb
train_mat <- data.matrix(train_data)
test_mat <- data.matrix(test_data)
# now set up cross-validation and model parameters...
# we plan on tuning only one model parameter: the number of boosting rounds (trees)
param <- list('objective'   = 'binary:logistic',
              'eval_metric' = 'auc')
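# optional tweak (commented sketch, an assumption rather than part of the outline):
# for a skewed binary target, scale_pos_weight is a common counterweight
# param$scale_pos_weight <- sum(target == 0) / sum(target == 1)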
cv.round <- 100 # number of boosting rounds (trees) to build; this is the parameter we tune
cv.nfold <- 3   # number of folds to split the train data into for cross-validation
# run cross-validation on the training set (note: the argument is `params`, not `param`)
bst.cv <- xgb.cv(params  = param,
                 data    = train_mat,
                 label   = target,
                 nfold   = cv.nfold,
                 nrounds = cv.round)
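# alternative (commented sketch): recent xgboost versions also support
# early_stopping_rounds in xgb.cv, which halts once the metric stops improving
# instead of scanning all cv.round rounds
# bst.cv <- xgb.cv(params = param, data = train_mat, label = target,
#                  nfold = cv.nfold, nrounds = cv.round, early_stopping_rounds = 10)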
# plot the eval metric (note: the evaluation_log column name must match the eval_metric param)
eval_metric <- bst.cv$evaluation_log$test_auc_mean
plot(eval_metric, type='l')
# find the round where the eval metric peaks (use which.min instead for loss-type metrics)
opt_nround <- which.max(eval_metric)
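# report the chosen round and its CV score (a small addition for readability)
cat("optimal nrounds:", opt_nround, "| mean cv auc:", max(eval_metric), "\n")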
# train the final model on the full training set with the tuned round count
bst <- xgboost(data    = train_mat,
               label   = target,
               params  = param,
               nrounds = opt_nround)
ypred <- predict(bst, test_mat)
# binary:logistic returns a single probability per row, P(TARGET = 1), so build the
# two-column probability distribution explicitly rather than reshaping ypred
pred_mat <- data.frame(1 - ypred, ypred)
colnames(pred_mat) <- sort(class_names)
res <- data.frame(id, pred_mat)
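# since the held-out TARGET was kept above (target_test), a quick out-of-sample AUC
# check is possible; sketched with pROC, assuming that package is installed
# require(pROC)
# auc(target_test, ypred)
# a Kaggle-style submission would look like this (file name is an assumption):
# write.csv(data.frame(ID = id, TARGET = ypred), "submission.csv", row.names = FALSE)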
# plot importance of variables (top 50 at most)
mat <- xgb.importance(feature_names = colnames(train_mat), model = bst)
xgb.plot.importance(importance_matrix = mat[1:min(50, nrow(mat))])