Initial XGB model outline for Kaggle competition
require(rlang)
require(tibble)
require(tidyr)
require(tidyverse)
require(xgboost)
###############################
## XGB OVERVIEW ##
###############################
# 1. data format: all numeric or integer
# 2. data format: NO NAs
file_path <- 'E:/personal/kaggle/home load default comp/data/application_train/'
file_name <- 'application_train.csv'
file <- paste(file_path, file_name, sep = "")
#import main data file(s)
data <- read.csv(file)
# convert to numeric (note: factor columns become their integer codes) & remove index col
data_alt <- as.data.frame(sapply(data, as.numeric))
data_alt$X <- NULL
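# quick sanity check against the two format requirements above (a sketch added here,
# not part of the original outline): every column should now be numeric, and the NA
# count shows how much zero-filling will happen later
stopifnot(all(sapply(data_alt, is.numeric)))
sum(is.na(data_alt))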
# make the ID / TARGET column names generic; change back to the originals if needed downstream
colnames(data_alt)[1] <- "ID"
colnames(data_alt)[2] <- "TARGET"
#set train / test sample size
train_size_parameter <- 0.75
smp_size <- floor(train_size_parameter * nrow(data_alt))
set.seed(123)
train_ind <- sample(seq_len(nrow(data_alt)), size = smp_size)
train_data <- data_alt[train_ind, ]
test_data <- data_alt[-train_ind, ]
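# split sanity check (a small addition, not in the original outline): the two
# partitions should cover every row of data_alt exactly once
stopifnot(nrow(train_data) + nrow(test_data) == nrow(data_alt))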
# set all missing values to 0 (the simplest imputation; see the aside below for xgboost's native NA handling)
train_data[is.na(train_data)] <- 0
test_data[is.na(test_data)] <- 0
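# aside (a hedged alternative, left commented so the zero-fill above is what runs):
# xgboost can treat NAs as missing natively via xgb.DMatrix's `missing` argument,
# which avoids conflating true zeros with absent values
# dtrain <- xgb.DMatrix(data.matrix(train_data), missing = NA)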
#segregate the ID column from test_data & remove ID col from train_data (not needed)
id <- test_data$ID
test_data$ID <- NULL
train_data$ID <- NULL
#segregate target from train & test
target <- train_data$TARGET
train_data$TARGET <- NULL
target_test <- test_data$TARGET
test_data$TARGET <- NULL
# store class names for the predicted probability distribution
class_names <- unique(target)
# number of classes (max label + 1; equals 2 for this binary task)
unique_class_qty <- max(target) + 1
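# worth a quick look at the class balance (a sketch, not in the original outline);
# default-style targets are typically skewed, which is part of why AUC is a sensible
# eval metric below
table(target)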
#convert data to matrix format before passing into xgb
train_mat <- data.matrix(train_data)
test_mat <- data.matrix(test_data)
# now set up cross-validation and model parameters...
# we plan on tuning only one model parameter: the number of boosting rounds (trees)
param <- list('objective'   = 'binary:logistic',
              'eval_metric' = 'auc')
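# optional tweak (commented sketch, an assumption rather than part of the outline):
# for a skewed binary target, scale_pos_weight is a common counterweight
# param$scale_pos_weight <- sum(target == 0) / sum(target == 1)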
cv.round <- 100 # number of boosting rounds (trees) to build; this is the parameter we tune
cv.nfold <- 3   # number of folds to split the train data into for cross-validation
# run cross-validation on the training set (note: the argument is `params`, not `param`)
bst.cv <- xgb.cv(params  = param,
                 data    = train_mat,
                 label   = target,
                 nfold   = cv.nfold,
                 nrounds = cv.round)
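# alternative (commented sketch): recent xgboost versions also support
# early_stopping_rounds in xgb.cv, which halts once the metric stops improving
# instead of scanning all cv.round rounds
# bst.cv <- xgb.cv(params = param, data = train_mat, label = target,
#                  nfold = cv.nfold, nrounds = cv.round, early_stopping_rounds = 10)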
# plot the eval metric (note: the evaluation_log column name must match the eval_metric param)
eval_metric <- bst.cv$evaluation_log$test_auc_mean
plot(eval_metric, type='l')
# find the round where the eval metric peaks (use which.min instead for loss-type metrics)
opt_nround <- which.max(eval_metric)
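# report the chosen round and its CV score (a small addition for readability)
cat("optimal nrounds:", opt_nround, "| mean cv auc:", max(eval_metric), "\n")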
# train the final model on the full training set with the tuned round count
bst <- xgboost(data    = train_mat,
               label   = target,
               params  = param,
               nrounds = opt_nround)
ypred <- predict(bst, test_mat)
# binary:logistic returns a single probability per row, P(TARGET = 1), so build the
# two-column probability distribution explicitly rather than reshaping ypred
pred_mat <- data.frame(1 - ypred, ypred)
colnames(pred_mat) <- sort(class_names)
res <- data.frame(id, pred_mat)
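# since the held-out TARGET was kept above (target_test), a quick out-of-sample AUC
# check is possible; sketched with pROC, assuming that package is installed
# require(pROC)
# auc(target_test, ypred)
# a Kaggle-style submission would look like this (file name is an assumption):
# write.csv(data.frame(ID = id, TARGET = ypred), "submission.csv", row.names = FALSE)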
# plot importance of variables (top 50 at most)
mat <- xgb.importance(feature_names = colnames(train_mat), model = bst)
xgb.plot.importance(importance_matrix = mat[1:min(50, nrow(mat))])