library(RemixAutoML)
library(data.table)
###########################################
# Prepare data for AutoTS()----
###########################################
# Load Walmart Data from Dropbox----
data <- data.table::fread("https://www.dropbox.com/s/2str3ek4f4cheqi/walmart_train.csv?dl=1")
# Subset for Stores / Departments with Full Series Available: (143 time points each)----
data <- data[, Counts := .N, by = c("Store","Dept")][Counts == 143][, Counts := NULL]
# Subset Columns (remove IsHoliday column)----
keep <- c("Store","Dept","Date","Weekly_Sales")
data <- data[, ..keep]
# Group Concatenation----
data[, GroupVar := do.call(paste, c(.SD, sep = " ")), .SDcols = c("Store","Dept")]
data[, c("Store","Dept") := NULL]
# Grab Unique List of GroupVar----
StoreDept <- unique(data[["GroupVar"]])
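# Optional sanity check (illustrative addition, not in the original gist): confirm how many
# full 143-week series survived the filter; this Walmart extract should yield 2,660
# Store / Dept combinations
print(length(StoreDept))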
###########################################
# AutoTS() Builds----
###########################################
for(z in c(1,5,10,20,30)) {
  TimerList <- list()
  OutputList <- list()
  l <- 0
  for(i in StoreDept) {
    l <- l + 1
    temp <- data[GroupVar == eval(i)]
    temp[, GroupVar := NULL]
    TimerList[[i]] <- system.time(
      OutputList[[i]] <- tryCatch({
        RemixAutoML::AutoTS(
          temp,
          TargetName = "Weekly_Sales",
          DateName = "Date",
          FCPeriods = 1,
          HoldOutPeriods = z,
          EvaluationMetric = "MAPE",
          TimeUnit = "week",
          Lags = 25,
          SLags = 1,
          NumCores = 4,
          SkipModels = NULL,
          StepWise = TRUE,
          TSClean = TRUE,
          ModelFreq = TRUE,
          PrintUpdates = FALSE)},
        error = function(x) "Error in AutoTS run"))
    print(l)
  }
  # Save Results When Done and Pull Them Back in for the Comparison Section----
  save(TimerList, file = paste0(getwd(),"/TimerList_FC_",z,".R"))
  save(OutputList, file = paste0(getwd(),"/OutputList_FC_",z,".R"))
  rm(OutputList, TimerList)
}
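# Optional check (illustrative addition): reload the last saved timer file and total the
# elapsed AutoTS run time across all Store / Dept series; each element of TimerList is a
# system.time() result, so its "elapsed" component can be summed
load(paste0(getwd(), "/TimerList_FC_30.R"))
print(sum(sapply(TimerList, function(x) x[["elapsed"]])))
rm(TimerList)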
###########################################
# Prepare data for AutoCatBoostCARMA()----
###########################################
# Load Walmart Data from Dropbox----
data <- data.table::fread("https://www.dropbox.com/s/2str3ek4f4cheqi/walmart_train.csv?dl=1")
# Subset for Stores / Departments With Full Series (143 time points each)----
data <- data[, Counts := .N, by = c("Store","Dept")][Counts == 143][, Counts := NULL]
# Subset Columns (remove IsHoliday column)----
keep <- c("Store","Dept","Date","Weekly_Sales")
data <- data[, ..keep]
# Build AutoCatBoostCARMA Models----
for(z in c(1,5,10,20,30)) {
  CatBoostResults <- RemixAutoML::AutoCatBoostCARMA(
    data,
    TargetColumnName = "Weekly_Sales",
    DateColumnName = "Date",
    GroupVariables = c("Store","Dept"),
    FC_Periods = 10,
    TimeUnit = "week",
    TargetTransformation = TRUE,
    Lags = c(1:25,51,52,53),
    MA_Periods = c(1:25,51,52,53),
    CalendarVariables = TRUE,
    TimeTrendVariable = TRUE,
    HolidayVariable = TRUE,
    DataTruncate = FALSE,
    SplitRatios = c(1 - 60/143, 30/143, 30/143),
    TaskType = "GPU",
    EvalMetric = "RMSE",
    GridTune = FALSE,
    GridEvalMetric = "r2",
    ModelCount = 2,
    NTrees = 1500,
    PartitionType = "timeseries",
    Timer = TRUE)
  # Output----
  print(CatBoostResults$TimeSeriesPlot)
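  # Optionally persist the aggregate forecast plot (illustrative addition; assumes
  # TimeSeriesPlot is a ggplot object and ggplot2 is installed; file name is hypothetical)
  ggplot2::ggsave(
    filename = paste0(getwd(), "/CatBoost_Forecast_Plot_", z, ".png"),
    plot = CatBoostResults$TimeSeriesPlot,
    width = 12, height = 6)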
  CatBoost_Results <- CatBoostResults$ModelInformation$EvaluationMetricsByGroup
  data.table::fwrite(CatBoost_Results, paste0(getwd(),"/CatBoost_Results_",z,".csv"))
  rm(CatBoost_Results,CatBoostResults)
}
###########################################
# Prepare data for AutoXGBoostCARMA()----
###########################################
# Load Walmart Data from Dropbox----
data <- data.table::fread("https://www.dropbox.com/s/2str3ek4f4cheqi/walmart_train.csv?dl=1")
# Subset for Stores / Departments With Full Series (143 time points each)----
data <- data[, Counts := .N, by = c("Store","Dept")][Counts == 143][, Counts := NULL]
# Subset Columns (remove IsHoliday column)----
keep <- c("Store","Dept","Date","Weekly_Sales")
data <- data[, ..keep]
for(z in c(1,5,10,20,30)) {
  XGBoostResults <- RemixAutoML::AutoXGBoostCARMA(
    data,
    TargetColumnName = "Weekly_Sales",
    DateColumnName = "Date",
    GroupVariables = c("Store","Dept"),
    FC_Periods = 2,
    TimeUnit = "week",
    TargetTransformation = TRUE,
    Lags = c(1:25, 51, 52, 53),
    MA_Periods = c(1:25, 51, 52, 53),
    CalendarVariables = TRUE,
    HolidayVariable = TRUE,
    TimeTrendVariable = TRUE,
    DataTruncate = FALSE,
    SplitRatios = c(1 - (30+z)/143, 30/143, z/143),
    TreeMethod = "hist",
    EvalMetric = "MAE",
    GridTune = FALSE,
    GridEvalMetric = "mae",
    ModelCount = 1,
    NTrees = 5000,
    PartitionType = "timeseries",
    Timer = TRUE)
  print(XGBoostResults$TimeSeriesPlot)
  XGBoost_Results <- XGBoostResults$ModelInformation$EvaluationMetricsByGroup
  data.table::fwrite(XGBoost_Results, paste0(getwd(),"/XGBoost_Results",z,".csv"))
  rm(XGBoost_Results)
}
###########################################
# Prepare data for AutoH2oDRFCARMA()----
###########################################
# Load Walmart Data from Dropbox----
data <- data.table::fread("https://www.dropbox.com/s/2str3ek4f4cheqi/walmart_train.csv?dl=1")
# Subset for Stores / Departments With Full Series (143 time points each)----
data <- data[, Counts := .N, by = c("Store","Dept")][Counts == 143][, Counts := NULL]
# Subset Columns (remove IsHoliday column)----
keep <- c("Store","Dept","Date","Weekly_Sales")
data <- data[, ..keep]
for(z in c(1,5,10,20,30)) {
  H2oDRFResults <- RemixAutoML::AutoH2oDRFCARMA(
    data,
    TargetColumnName = "Weekly_Sales",
    DateColumnName = "Date",
    GroupVariables = c("Store","Dept"),
    FC_Periods = 2,
    TimeUnit = "week",
    TargetTransformation = TRUE,
    Lags = c(1:5, 51,52,53),
    MA_Periods = c(1:5, 51,52,53),
    CalendarVariables = TRUE,
    HolidayVariable = TRUE,
    TimeTrendVariable = TRUE,
    DataTruncate = FALSE,
    SplitRatios = c(1 - (30+z)/143, 30/143, z/143),
    EvalMetric = "MAE",
    GridTune = FALSE,
    ModelCount = 1,
    NTrees = 2000,
    PartitionType = "timeseries",
    MaxMem = "28G",
    NThreads = 8,
    Timer = TRUE)
  # Plot aggregate sales forecast (Stores and Departments rolled up into Total)----
  print(H2oDRFResults$TimeSeriesPlot)
  H2oDRF_Results <- H2oDRFResults$ModelInformation$EvaluationMetricsByGroup
  data.table::fwrite(H2oDRF_Results, paste0(getwd(),"/H2oDRF_Results",z,".csv"))
  rm(H2oDRF_Results)
}
###########################################
# Prepare data for AutoH2oGBMCARMA()----
###########################################
# Load Walmart Data from Dropbox----
data <- data.table::fread("https://www.dropbox.com/s/2str3ek4f4cheqi/walmart_train.csv?dl=1")
# Subset for Stores / Departments With Full Series (143 time points each)----
data <- data[, Counts := .N, by = c("Store","Dept")][Counts == 143][, Counts := NULL]
# Subset Columns (remove IsHoliday column)----
keep <- c("Store","Dept","Date","Weekly_Sales")
data <- data[, ..keep]
for(z in c(1,5,10,20,30)) {
  H2oGBMResults <- RemixAutoML::AutoH2oGBMCARMA(
    data,
    TargetColumnName = "Weekly_Sales",
    DateColumnName = "Date",
    GroupVariables = c("Store","Dept"),
    FC_Periods = 2,
    TimeUnit = "week",
    TargetTransformation = TRUE,
    Lags = c(1:5, 51,52,53),
    MA_Periods = c(1:5, 51,52,53),
    CalendarVariables = TRUE,
    HolidayVariable = TRUE,
    TimeTrendVariable = TRUE,
    DataTruncate = FALSE,
    SplitRatios = c(1 - (30+z)/143, 30/143, z/143),
    EvalMetric = "MAE",
    GridTune = FALSE,
    ModelCount = 1,
    NTrees = 2000,
    PartitionType = "timeseries",
    MaxMem = "28G",
    NThreads = 8,
    Timer = TRUE)
  # Plot aggregate sales forecast (Stores and Departments rolled up into Total)----
  print(H2oGBMResults$TimeSeriesPlot)
  H2oGBM_Results <- H2oGBMResults$ModelInformation$EvaluationMetricsByGroup
  data.table::fwrite(H2oGBM_Results, paste0(getwd(),"/H2oGBM_Results",z,".csv"))
  rm(H2oGBM_Results)
}
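# Optional cleanup (illustrative addition): the H2o CARMA runs spin up a local H2O cluster;
# if one is still alive at this point it can be shut down to free memory. Wrapped in
# tryCatch in case no cluster is running.
tryCatch(h2o::h2o.shutdown(prompt = FALSE), error = function(e) invisible(NULL))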
##################################################
# AutoTS() and CARMA Machine Learning Model Comparison----
##################################################
# Gather results----
for(i in c(1,5,10,20,30)) {
  load(paste0(getwd(),"/TimerList_FC_",i,".R"))
  load(paste0(getwd(),"/OutputList_FC_",i,".R"))
  # Assemble TS Data----
  TimeList <- names(TimerList)
  results <- list()
  for(j in seq_along(TimeList)) {
    results[[j]] <- cbind(
      StoreDept = TimeList[j],
      tryCatch({OutputList[[j]]$EvaluationMetrics[, .(ModelName,MAE)][
        , ModelName := gsub("_.*","",ModelName)
        ][
          , ID := 1:.N, by = "ModelName"
        ][
          ID == 1
        ][
          , ID := NULL
        ]},
        error = function(x) return(
          data.table::data.table(
            ModelName = "NONE",
            MAE = NA))))
  }
  # AutoTS() Results----
  Results <- data.table::rbindlist(results)
  # Remove rows where the AutoTS run errored (ModelName == "NONE")----
  Results <- Results[ModelName != "NONE"]
  # Average MAE by Store/Dept and model (one value per series, so a straight average works)----
  Results <- Results[, .(MAE = mean(MAE, na.rm = TRUE)), by = c("StoreDept","ModelName")]
  # Split the concatenated GroupVar back into Store and Dept----
  Results[, c("Store","Dept") := data.table::tstrsplit(StoreDept, " ")][, StoreDept := NULL]
  data.table::setcolorder(Results, c(3,4,1,2))
  ##################################
  # Machine Learning Results----
  ##################################
  # Load up CatBoost Results----
  CatBoost_Results <- data.table::fread(paste0(getwd(),"/CatBoost_Results_",i,".csv"))
  CatBoost_Results[, ':=' (MAPE_Metric = NULL, MSE_Metric = NULL, R2_Metric = NULL)]
  data.table::setnames(CatBoost_Results, "MAE_Metric", "MAE")
  CatBoost_Results[, ModelName := "CatBoost"]
  data.table::setcolorder(CatBoost_Results, c(1,2,4,3))
  # Load up XGBoost Results----
  XGBoost_Results <- data.table::fread(paste0(getwd(),"/XGBoost_Results",i,".csv"))
  XGBoost_Results[, ':=' (MAPE_Metric = NULL, MSE_Metric = NULL, R2_Metric = NULL)]
  data.table::setnames(XGBoost_Results, "MAE_Metric", "MAE")
  XGBoost_Results[, ModelName := "XGBoost"]
  data.table::setcolorder(XGBoost_Results, c(1,2,4,3))
  # Load up H2oDRF Results----
  H2oDRF_Results <- data.table::fread(paste0(getwd(),"/H2oDRF_Results",i,".csv"))
  H2oDRF_Results[, ':=' (MAPE_Metric = NULL, MSE_Metric = NULL, R2_Metric = NULL)]
  data.table::setnames(H2oDRF_Results, "MAE_Metric", "MAE")
  H2oDRF_Results[, ModelName := "H2oDRF"]
  data.table::setcolorder(H2oDRF_Results, c(1,2,4,3))
  # Load up H2oGBM Results----
  H2oGBM_Results <- data.table::fread(paste0(getwd(),"/H2oGBM_Results",i,".csv"))
  H2oGBM_Results[, ':=' (MAPE_Metric = NULL, MSE_Metric = NULL, R2_Metric = NULL)]
  data.table::setnames(H2oGBM_Results, "MAE_Metric", "MAE")
  H2oGBM_Results[, ModelName := "H2oGBM"]
  data.table::setcolorder(H2oGBM_Results, c(1,2,4,3))
  ##################################
  # Combine Data----
  ##################################
  # Stack Files----
  ModelDataEval <- data.table::rbindlist(
    list(Results, CatBoost_Results, XGBoost_Results, H2oGBM_Results, H2oDRF_Results))
  data.table::setorderv(ModelDataEval, cols = c("Store","Dept","MAE"))
  # Add rank----
  ModelDataEval[, Rank := 1:.N, by = c("Store","Dept")]
  # Get Frequencies----
  RankResults <- ModelDataEval[, .(Counts = .N), by = c("ModelName","Rank")]
  data.table::setorderv(RankResults, c("Rank", "Counts"), order = c(1,-1))
  # Final table----
  FinalResultsTable <- data.table::dcast(RankResults, formula = ModelName ~ Rank, value.var = "Counts")
  data.table::setorderv(FinalResultsTable, "1", -1, na.last = TRUE)
  # Rename Columns----
  for(k in 2:ncol(FinalResultsTable)) {
    data.table::setnames(FinalResultsTable,
                         old = names(FinalResultsTable)[k],
                         new = paste0("Rank_",names(FinalResultsTable)[k]))
  }
  # Print----
  print(i)
  print(knitr::kable(FinalResultsTable))
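  # Optionally persist the rank table for this holdout window (illustrative addition;
  # file name is hypothetical)
  data.table::fwrite(FinalResultsTable, paste0(getwd(), "/Rank_Table_HoldOut_", i, ".csv"))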
}