Created
August 5, 2017 19:53
-
-
Save gvyshnya/2e5799863f02fec652c194020da82dd3 to your computer and use it in GitHub Desktop.
Forecasting Wine Sales with XGBOOST algorithm
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Competition: https://inclass.kaggle.com/c/pred-411-2016-04-u3-wine/ | |
# This is a file to perform | |
# - xgboost model training (linear booster used) | |
# - prediction on the imputed testing set, using the fitted xgboost model
# - preparation of a Kaggle submission file | |
# It is intended to run from a command line in a batch mode, using the Rscript command below: | |
# Rscript --vanilla code/xgboost.R data/train_imputed.csv data/test_imputed.csv 10 2 0.0001 1 data/xgboost_submission.csv code/config.R | |
# | |
# 8 arguments are required | |
# - input file name for imputed training data csv, | |
# - input file name for imputed testing data csv | |
# - nrounds - number of rounds in the xgboost search (integer) | |
# - depth - depth of boosting search (integer) | |
# - alpha - one of the linear booster-specific parameters (float) | |
# - lambda - one of the linear booster-specific parameters (float)
# - output file name for the result submission csv file (in a ready-for-Kaggle-upload format) | |
# - the configuration file of the solution in a format of R script module (please use config.R provided) | |
# | |
# Note: please refer to http://xgboost.readthedocs.io/en/latest/R-package/xgboostPresentation.html or other | |
# links in the comments below for more details on xgboost parameters | |
library(caret) | |
library(plyr) | |
library(dplyr) | |
library(caTools) | |
library(xgboost) | |
# Start the wall-clock timer used for the "Elapsed Time" report at the end.
strt <- Sys.time()

# Positional command-line arguments (everything after the script name).
args <- commandArgs(trailingOnly = TRUE)
if (length(args) != 8) {
  stop(paste0(
    "Eight arguments must be supplied (",
    "input file name for imputed training data csv, ",
    "input file name for imputed testing data csv, ",
    "nrounds (integer), depth (integer), alpha (float), lambda (float), ",
    "output file name for Kaggle result submission csv, ",
    "solution configuration file 'code/config.R')"
  ), call. = FALSE)
}

fname_training_set <- args[1]
fname_testing_set <- args[2]

# commandArgs() always returns character strings, so the hyper-parameters
# are converted to numbers here before they are handed to the modeller.
n.rounds <- as.integer(args[3])
n.depth <- as.integer(args[4])
n.alpha <- as.numeric(args[5])
n.lambda <- as.numeric(args[6])
if (anyNA(c(n.rounds, n.depth, n.alpha, n.lambda))) {
  stop("nrounds, depth, alpha and lambda (arguments 3-6) must be numeric",
       call. = FALSE)
}

fname_kaggle_submission <- args[7]
fname_config <- args[8]

# The configuration file is itself an R script, so it is loaded by sourcing it.
source(fname_config)
# regression modeller - xgboost | |
# ref.: http://xgboost.readthedocs.io/en/latest/R-package/xgboostPresentation.html | |
# https://github.com/dmlc/xgboost/blob/master/doc/parameter.md | |
# https://cran.r-project.org/web/packages/xgboost/vignettes/xgboostPresentation.html | |
# https://www.kaggle.com/michaelpawlus/springleaf-marketing-response/xgboost-example-0-76178/code | |
# Fit an xgboost linear-booster regression on df.train and predict df.test.
#
# Arguments:
#   df.train       - data frame holding the label column TARGET plus features
#   df.test        - data frame holding (at least) the same feature columns
#   formula2verify - accepted for interface compatibility; not used here
#   nrounds        - maximum number of boosting rounds
#   depth          - passed to xgboost as max_depth
#   alpha, lambda  - regularisation terms passed to xgboost
#
# Returns the vector of predictions produced by predict() for df.test.
xgboostRegressionModeller <- function (df.train, df.test, formula2verify,
                                       nrounds=50, depth=14, alpha = 0.0001, lambda = 1) {
  print(paste0("Running xgboost linear modeller"))

  # Callers may forward raw command-line strings; coerce before use.
  nrounds <- as.integer(nrounds)
  depth <- as.integer(depth)
  alpha <- as.numeric(alpha)
  lambda <- as.numeric(lambda)
  stopifnot(!is.na(nrounds), nrounds > 0,
            !is.na(depth), !is.na(alpha), !is.na(lambda))

  # Every column except the label is a feature. The previous expression
  # `2:ncol(df.train)-1` parsed as `(2:ncol(df.train)) - 1`, i.e. the first
  # ncol-1 columns, which could include TARGET itself and drop the last one.
  feature.names <- setdiff(names(df.train), "TARGET")
  missing.in.test <- setdiff(feature.names, names(df.test))
  if (length(missing.in.test) > 0) {
    stop("Feature columns missing from the test set: ",
         paste(missing.in.test, collapse = ", "), call. = FALSE)
  }

  print(paste0("assuming text variables are categorical & replacing\nthem with numeric ids\n"))
  for (f in feature.names) {
    # Check the column of the data frame passed in, not a global `train`.
    if (is.character(df.train[[f]])) {
      # Shared level set so the same text maps to the same id in both frames.
      levels <- unique(c(df.train[[f]], df.test[[f]]))
      df.train[[f]] <- as.integer(factor(df.train[[f]], levels = levels))
      df.test[[f]] <- as.integer(factor(df.test[[f]], levels = levels))
    }
  }

  # Reproducible 80/20 split of the training data into fit / validation parts.
  set.seed(825)
  split <- sample.split(df.train$TARGET, SplitRatio = 0.8)
  qualityTrain <- subset(df.train, split == TRUE)
  qualityVal <- subset(df.train, split == FALSE)

  # drop = FALSE keeps a data frame even when there is a single feature.
  dtrain <- xgb.DMatrix(
    data.matrix(qualityTrain[, feature.names, drop = FALSE]),
    label = qualityTrain$TARGET
  )
  dval <- xgb.DMatrix(
    data.matrix(qualityVal[, feature.names, drop = FALSE]),
    label = qualityVal$TARGET
  )

  # Early stopping monitors the LAST watchlist entry, so the held-out
  # validation set goes last (it used to be the training set).
  watchlist <- list(train = dtrain, eval = dval)

  # NOTE(review): max_depth, subsample and colsample_bytree look like
  # tree-booster settings; confirm whether gblinear uses them at all.
  # NOTE(review): newer xgboost releases name this objective
  # "reg:squarederror" - confirm against the installed version.
  param <- list(
    objective = "reg:linear",
    booster = "gblinear",
    eta = 0.001,
    max_depth = depth,
    subsample = 0.6,
    colsample_bytree = 0.6,
    eval_metric = "rmse",
    alpha = alpha,
    lambda = lambda
  )

  clf <- xgb.train(
    params = param,
    data = dtrain,
    nrounds = nrounds,
    verbose = 2,
    early.stop.round = 10,
    watchlist = watchlist,
    # RMSE is an error measure: lower is better, so it must be minimised.
    maximize = FALSE
  )

  # Score the test set with the fitted booster.
  f.predict <- predict(clf, data.matrix(df.test[, feature.names, drop = FALSE]))
  f.predict
}
# ---- Main script body ------------------------------------------------------
# Loads the imputed train/test csv files, fits one model for wines without a
# STARS rating and one for wines with a rating, and writes the combined
# predictions as a Kaggle submission file. The timer `strt` is started at the
# top of the script, so the elapsed time reported below covers the whole run.

# read data
print(paste("Load data", Sys.time()))
train <- read.csv(fname_training_set)
test <- read.csv(fname_testing_set)
str(train)
str(test)

# basic split of test and train set by STARS provided or not
train1 <- subset(train, STARS == 0)
train2 <- subset(train, STARS > 0)
test1 <- subset(test, STARS == 0)
test2 <- subset(test, STARS > 0)

# Keep the row identifiers so predictions can be matched back afterwards.
testIndex1 <- test1$INDEX
testIndex2 <- test2$INDEX

# prepare data for prediction
# select() is namespace-qualified because plyr and dplyr are both attached.
# STARS is constant (0) in the first group, so it is dropped there.
train1 <- dplyr::select(train1, -INDEX, -STARS)
train2 <- dplyr::select(train2, -INDEX)
test1 <- dplyr::select(test1, -INDEX, -STARS)
test2 <- dplyr::select(test2, -INDEX)

# train the models
print(paste("Train the models and make predictions", Sys.time()))
frm <- as.formula(TARGET ~ .)
predict1 <- xgboostRegressionModeller(
  train1, test1, frm,
  nrounds = n.rounds, depth = n.depth, alpha = n.alpha, lambda = n.lambda
)
predict2 <- xgboostRegressionModeller(
  train2, test2, frm,
  nrounds = n.rounds, depth = n.depth, alpha = n.alpha, lambda = n.lambda
)

# prepare submission (columns: INDEX,P_TARGET)
print(paste("Prepare submission file", Sys.time()))
df1 <- data.frame(INDEX = testIndex1, P_TARGET = predict1)
df2 <- data.frame(INDEX = testIndex2, P_TARGET = predict2)
MySubmission <- rbind(df1, df2)
write.csv(MySubmission, fname_kaggle_submission, row.names = FALSE)
print(paste("Finished data submission", Sys.time()))

# format() keeps the difftime units (secs / mins / ...); plain paste() drops
# them and leaves a bare number.
print(paste("Elapsed Time:", format(Sys.time() - strt)))
################################################## | |
# That's all, folks! | |
################################################## |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment