-
-
Save gvyshnya/7ec76316c24bc1b4f595ef1256f52d3a to your computer and use it in GitHub Desktop.
Predicting Wine Sales: Forecasting with a linear regression model
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Competition: https://inclass.kaggle.com/c/pred-411-2016-04-u3-wine/
# This script performs
# - Linear Regression (LR) model training
# - prediction on the imputed testing set, using the fitted LR model
# - preparation of a Kaggle submission file
# It is intended to run from a command line in batch mode, using the Rscript command below:
# Rscript --vanilla code/LF.R data/train_imputed.csv data/test_imputed.csv 0.7 826 data/submission.csv config.R
# 6 arguments are required:
# - input file name for the imputed training data csv
# - input file name for the imputed testing data csv
# - split ratio of the training set into true training vs. validation parts
# - seed value
# - output file name for the resulting submission csv file (in a ready-for-Kaggle-upload format)
# - the configuration file of the solution, in the format of an R script module (please use the config.R provided)
#
# Note: in the solution setup, this script will be located under the relative path of 'code/LF.R'
library(caret) | |
library(plyr) | |
library(dplyr) | |
library(caTools) | |
# Remember when the run started so the total elapsed time can be reported
# at the end of the script.
strt <- Sys.time()

# Positional command-line arguments (everything after the script name):
#   1. imputed training data csv
#   2. imputed testing data csv
#   3. split ratio of the training set (true training vs. validation)
#   4. seed value
#   5. output csv for the Kaggle submission
#   6. configuration file (an R script that gets sourced)
args <- commandArgs(trailingOnly = TRUE)
if (length(args) != 6) {
  stop(
    paste(
      "Six arguments must be supplied (input file name for imputed training data csv,",
      "input file name for imputed testing data csv,",
      "split ratio value (0..1), seed value,",
      "output file name for Kaggle result submission csv,",
      "configuration R file for the project (use config.R supplied with this app))",
      sep = "\n"
    ),
    call. = FALSE
  )
}

fname_training_set <- args[1]
fname_testing_set <- args[2]

# commandArgs() returns character strings, so the numeric parameters must be
# converted explicitly before they are used in arithmetic or as an RNG seed.
split_ratio <- suppressWarnings(as.numeric(args[3]))
if (is.na(split_ratio) || split_ratio <= 0 || split_ratio >= 1) {
  stop("The split ratio (3rd argument) must be a number strictly between 0 and 1, got: ",
       args[3], call. = FALSE)
}

seed_value <- suppressWarnings(as.integer(args[4]))
if (is.na(seed_value)) {
  stop("The seed value (4th argument) must be an integer, got: ",
       args[4], call. = FALSE)
}

fname_kaggle_submission <- args[5]
fname_config <- args[6]

# The configuration file is itself R source code, so it is imported by
# evaluating it.
source(fname_config)
# ---- Load the imputed training and testing sets ----
print(paste("Load data",Sys.time()))
train <- read.csv(fname_training_set)
test <- read.csv(fname_testing_set)

# Dump the structure of both data frames to the log
str(train)
str(test)

# ---- Partition both sets on STARS ----
# Rows with STARS == 0 go to the "1" partitions and rows with STARS > 0 go
# to the "2" partitions (per the original note: STARS provided or not).
test1 <- subset(test, STARS == 0)
test2 <- subset(test, STARS > 0)
train1 <- subset(train, STARS == 0)
train2 <- subset(train, STARS > 0)

# Keep the test identifiers aside: they are dropped from the predictor
# frames below but are needed again for the submission file.
testIndex1 <- test1[["INDEX"]]
testIndex2 <- test2[["INDEX"]]

# ---- Strip non-predictor columns ----
# STARS is constant (0) in the "1" partitions, so it is removed there
# together with INDEX; the "2" partitions only lose INDEX.
train1 <- select(train1, -c(INDEX, STARS))
test1 <- select(test1, -c(INDEX, STARS))
train2 <- select(train2, -c(INDEX))
test2 <- select(test2, -c(INDEX))
# ---- Carve a validation part out of each training partition ----
# The seed is reset before each split so that each one is reproducible
# on its own.

# Partition 1
set.seed(seed_value)
spl1 <- sample.split(train1$TARGET, SplitRatio = split_ratio)
train1.tr <- train1[spl1, , drop = FALSE]
train1.val <- train1[!spl1, , drop = FALSE]

# Partition 2
set.seed(seed_value)
spl2 <- sample.split(train2$TARGET, SplitRatio = split_ratio)
train2.tr <- train2[spl2, , drop = FALSE]
train2.val <- train2[!spl2, , drop = FALSE]
# ---- Fit one linear model per partition ----
print(paste("Train the models",Sys.time()))

# Full formula (all predictors); kept for reference
frm <- TARGET ~ .

# Reduced formulas restricted to the variables judged important,
# based on how the full formula performed initially.
frm.starred <- reformulate(
  c("VolatileAcidity", "Chlorides", "FreeSulfurDioxide", "Alcohol",
    "LabelAppeal", "AcidIndex", "STARS"),
  response = "TARGET"
)
frm.not_starred <- reformulate(
  c("VolatileAcidity", "FreeSulfurDioxide", "TotalSulfurDioxide",
    "Sulphates", "AcidIndex"),
  response = "TARGET"
)

# Partition 1 model (rows with STARS == 0): fit, report, and show the
# in-sample sum of squared errors.
model1 <- lm(frm.not_starred, data = train1.tr)
summary(model1)
SSE1 <- sum(model1[["residuals"]]^2)
SSE1

# Partition 2 model (rows with STARS > 0): same steps.
model2 <- lm(frm.starred, data = train2.tr)
summary(model2)
SSE2 <- sum(model2[["residuals"]]^2)
SSE2
# ---- Score the held-out validation parts ----
print(paste("Make predictions on validation sets",Sys.time()))
set.seed(seed_value)

predict1.val <- predict(model1, newdata = train1.val)
str(predict1.val)
summary(predict1.val)

predict2.val <- predict(model2, newdata = train2.val)
str(predict2.val)
summary(predict2.val)

# ---- Out-of-sample R-squared ----
# R2 = 1 - (SSE / SST), where
#   SSE = squared error of the model's predictions on the validation part
#   SST = squared error of a baseline that always predicts the mean TARGET
#         of the corresponding training part
print(paste("Quantify accuracy on validation sets",Sys.time()))

# Partition 1
SSETest1 <- sum((predict1.val - train1.val$TARGET)^2)
SSTTest1 <- sum((mean(train1.tr$TARGET) - train1.val$TARGET)^2)
R2_1 <- 1 - SSETest1 / SSTTest1
R2_1

# Partition 2
SSETest2 <- sum((predict2.val - train2.val$TARGET)^2)
SSTTest2 <- sum((mean(train2.tr$TARGET) - train2.val$TARGET)^2)
R2_2 <- 1 - SSETest2 / SSTTest2
R2_2
# ---- Final predictions on the testing set ----
set.seed(seed_value)
print(paste("Make final predictions", Sys.time()))

# Each test partition is scored by the model fitted on the matching
# training partition.
predict1 <- predict(model1, newdata = test1)
str(predict1)
summary(predict1)

predict2 <- predict(model2, newdata = test2)
str(predict2)
summary(predict2)

# ---- Build and write the submission file ----
print(paste("Prepare submission file", Sys.time()))
# Expected columns: INDEX,P_TARGET
df1 <- data.frame(INDEX = testIndex1, P_TARGET = predict1)
df2 <- data.frame(INDEX = testIndex2, P_TARGET = predict2)
MySubmission <- rbind(df1, df2)

# Test rows that match neither STARS == 0 nor STARS > 0 (e.g. a missing
# STARS value) never reach either partition; make that visible instead of
# silently writing an incomplete submission.
if (nrow(MySubmission) != nrow(test)) {
  warning("Submission has ", nrow(MySubmission), " rows but the testing set has ",
          nrow(test), " rows; some test rows were not assigned to a model.",
          call. = FALSE)
}

write.csv(MySubmission, fname_kaggle_submission, row.names = FALSE)
print(paste("Finished data submission", Sys.time()))

# format() keeps the unit of the time difference (secs/mins/hours); pasting
# the raw difftime would print only the bare number.
print(paste("Elapsed Time:", format(Sys.time() - strt)))
################################################## | |
# That's all, folks! | |
################################################## |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment