# Tuning AutoCatBoostCARMA() using the Walmart panel sales data
# @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
# Out-of-Sample Feature + Grid Tuning of RemixAutoML::AutoCatBoostCARMA()
# @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
# Set up your output file path for saving results as a .csv
Path <- "C:/YourPathHere"

# Run on GPU or CPU (some options in the grid tuning force usage of CPU for some runs)
TaskType <- "GPU"

# Define number of CPU threads to allow data.table to utilize.
# Two cores are left free. `threads` takes a thread count; `percent` would
# interpret the same number as a 0-100 percentage of the logical CPUs.
AvailableCores <- parallel::detectCores()
# detectCores() may return NA when the core count cannot be determined
if (is.na(AvailableCores)) AvailableCores <- 1L
data.table::setDTthreads(threads = max(1L, AvailableCores - 2L))

# Load data
# TODO: set DataSource to the file path or URL of the Walmart weekly sales
# training data (needs Date, Store, Dept, Weekly_Sales and IsHoliday columns,
# which are the columns referenced below).
DataSource <- ""
if (!nzchar(DataSource)) {
  stop("Set 'DataSource' to the path or URL of the Walmart sales data before running.",
       call. = FALSE)
}
data <- data.table::fread(DataSource)

# Ensure series have no missing dates (also remove series with more than 25% missing values)
data <- RemixAutoML::TimeSeriesFill(
  data,
  DateColumnName = "Date",
  GroupVariables = c("Store","Dept"),
  TimeUnit = "weeks",
  FillType = "maxmax",
  MaxMissingPercent = 0.25,
  SimpleImpute = TRUE)

# Set negative numbers to 0 (`:=` updates data by reference, no reassignment needed)
data[, Weekly_Sales := data.table::fifelse(Weekly_Sales < 0, 0, Weekly_Sales)]

# Remove IsHoliday column
data[, IsHoliday := NULL]

# Change data types: the grouping variables are treated as categorical (character)
data[, ":=" (Store = as.character(Store), Dept = as.character(Dept))]

# Create xregs (this is to include the categorical variables instead of utilizing only the interaction of them)
# Taken after the type change so xregs carries the same character columns as data
xregs <- data[, .SD, .SDcols = c("Date", "Store", "Dept")]

# Subset data so we have an out of time sample:
# data1 keeps the first 125 rows of every Store / Dept series, data keeps all rows
data1 <- data.table::copy(data[, ID := seq_len(.N), by = c("Store","Dept")][ID <= 125L][, ID := NULL])
# The helper row counter was added to data by reference, so drop it there too
data[, ID := NULL]

# Define values for SplitRatios and FCWindow Args
# N1 = rows per series in the training subset, N2 = rows per series in the full data
# (both read from the first Store / Dept group)
N1 <- data1[, .N, by = c("Store","Dept")][1L, N]
N2 <- xregs[, .N, by = c("Store","Dept")][1L, N]
# Build the grid tuning & feature tuning table in one chain:
#   1. cross join every candidate value (mixed vectors are written as character,
#      which is what c("None", 0.999) and c(0.75, "NULL") coerce to anyway),
#   2. remove options that are not compatible with GPU (skip this step otherwise),
#   3. randomize the row order.
Tuning <- data.table::CJ(
  TimeWeights          = c("None", "0.999"),
  MaxTimeGroups        = c("weeks", "months"),
  TargetTransformation = c("TRUE", "FALSE"),
  Difference           = c("TRUE", "FALSE"),
  HoldoutTrain         = c(6, 18),
  Langevin             = c("TRUE", "FALSE"),
  NTrees               = c(2500, 5000),
  Depth                = c(6, 9),
  RandomStrength       = c(0.75, 1),
  L2_Leaf_Reg          = c(3.0, 4.0),
  RSM                  = c("0.75", "NULL"),
  GrowPolicy           = c("SymmetricTree", "Lossguide", "Depthwise"),
  BootStrapType        = c("Bayesian", "MVS", "No")
)[
  # Langevin only takes the values "TRUE" / "FALSE", so every row that is not
  # "TRUE" is "FALSE": those rows are kept only without RSM and with a
  # Bayesian / No bootstrap
  Langevin == "TRUE" | (RSM == "NULL" & BootStrapType %in% c("Bayesian", "No"))
][
  # Sorting on one uniform draw per row yields a random permutation
  order(runif(.N))
]
# Load grid results and remove rows that have already been tested
MetricsFile <- file.path(Path, "Walmart_CARMA_Metrics.csv")
if (file.exists(MetricsFile)) {
  Metrics <- data.table::fread(MetricsFile)

  # Stack logged runs on top of the pending grid; columns missing on either
  # side are filled with NA
  temp <- data.table::rbindlist(list(Metrics, Tuning), fill = TRUE)

  # Deduplicate on the tuning parameters themselves rather than on column
  # positions. Logged rows come first, so for a combination that already ran
  # the logged row (with a RunTime) is the one that is kept.
  temp <- unique(temp, by = names(Tuning))

  # Rows that originate from the pending grid have no RunTime: those are the
  # combinations still to be tested
  Tuning <- temp[is.na(RunTime)][, .SD, .SDcols = names(Tuning)]

  rm(Metrics, temp)
}
# Define the total number of runs
TotalRuns <- Tuning[, .N]

# Lag / moving average windows on the weekly level (identical for every run)
WeekLags <- c(1,2,3,4,5,8,9,12,13,51,52,53)
WeekMAs <- c(2,3,4,5,8,9,12,13,51,52,53)

# Kick off feature + grid tuning
# Each iteration trains one model with the settings in row `Run` of Tuning,
# scores the forecast against the actuals in `data`, and appends one row of
# metrics to Walmart_CARMA_Metrics.csv under Path.
for (Run in seq_len(TotalRuns)) {

  # Print run number (repeated 100 times, presumably so it stands out in the console output)
  for (zz in seq_len(100)) print(Run)

  # Use fresh data for each run
  xregs_new <- data.table::copy(xregs)
  data_new <- data.table::copy(data1)

  # Settings for this run (a single row of the tuning table)
  Params <- Tuning[Run]

  # "None" / "NULL" are the character placeholders used in the grid for "not set"
  RunTimeWeights <- if (Params$TimeWeights == "None") NULL else as.numeric(Params$TimeWeights)
  RunRSM <- if (Params$RSM == "NULL") NULL else as.numeric(Params$RSM)

  # Time aggregation levels with their matching lag / moving average windows
  if (Params$MaxTimeGroups == "weeks") {
    RunTimeGroups <- "weeks"
    RunLags <- WeekLags
    RunMAs <- WeekMAs
  } else if (Params$MaxTimeGroups == "months") {
    RunTimeGroups <- c("weeks","months")
    RunLags <- list("weeks" = WeekLags, "months" = c(1,2,6,12))
    RunMAs <- list("weeks" = WeekMAs, "months" = c(2,6,12))
  } else {
    RunTimeGroups <- c("weeks","months","quarters")
    RunLags <- list("weeks" = WeekLags, "months" = c(1,2,6,12), "quarters" = c(1,2,3,4))
    RunMAs <- list("weeks" = WeekMAs, "months" = c(2,6,12), "quarters" = c(2,3,4))
  }

  # Share of each series used as holdout
  HoldoutShare <- Params$HoldoutTrain / N2

  # Timer start
  StartTime <- Sys.time()

  # Run carma system
  CatBoostResults <- RemixAutoML::AutoCatBoostCARMA(

    # data args
    data = data_new,
    TimeWeights = RunTimeWeights,
    TargetColumnName = "Weekly_Sales",
    DateColumnName = "Date",
    HierarchGroups = NULL,
    GroupVariables = c("Store","Dept"),
    TimeUnit = "weeks",
    TimeGroups = RunTimeGroups,

    # Production args
    TrainOnFull = TRUE,
    SplitRatios = c(1 - HoldoutShare, HoldoutShare),
    PartitionType = "random",
    # Forecast exactly the periods that were held back from data1
    FC_Periods = N2-N1,
    TaskType = TaskType,
    NumGPU = 1,
    Timer = TRUE,
    DebugMode = TRUE,

    # Target variable transformations
    TargetTransformation = as.logical(Params$TargetTransformation),
    Methods = c("BoxCox","Asinh","Log","LogPlus1","YeoJohnson"),
    Difference = as.logical(Params$Difference),
    NonNegativePred = TRUE,
    RoundPreds = FALSE,

    # Calendar-related features
    CalendarVariables = c("week","wom","month","quarter"),
    HolidayVariable = c("USPublicHolidays"),
    HolidayLags = c(1,2,3),
    HolidayMovingAverages = c(2,3),

    # Lags, moving averages, and other rolling stats
    Lags = RunLags,
    MA_Periods = RunMAs,
    SD_Periods = NULL,
    Skew_Periods = NULL,
    Kurt_Periods = NULL,
    Quantile_Periods = NULL,
    Quantiles_Selected = NULL,

    # Bonus features
    AnomalyDetection = NULL,
    XREGS = xregs_new,
    FourierTerms = 0,
    TimeTrendVariable = TRUE,
    ZeroPadSeries = NULL,
    DataTruncate = FALSE,

    # ML insights
    PDFOutputPath = NULL,
    SaveDataPath = NULL,
    NumOfParDepPlots = 0L,

    # ML grid tuning args
    GridTune = FALSE,
    PassInGrid = NULL,
    ModelCount = 5,
    MaxRunsWithoutNewWinner = 50,
    MaxRunMinutes = 60*60,

    # ML loss functions
    EvalMetric = "RMSE",
    EvalMetricValue = 1,
    LossFunction = "RMSE",
    LossFunctionValue = 1,

    # ML tuning args
    NTrees = Params$NTrees,
    Depth = Params$Depth,
    Langevin = as.logical(Params$Langevin),
    DiffusionTemperature = 10000,

    # ML overfitting args
    LearningRate = 0.03,
    L2_Leaf_Reg = Params$L2_Leaf_Reg,
    ModelSizeReg = 0.0,
    RSM = RunRSM,
    MinDataInLeaf = 5,
    BorderCount = 254,
    RandomStrength = Params$RandomStrength,
    SamplingUnit = "Group",

    # ML styles
    GrowPolicy = Params$GrowPolicy,
    BootStrapType = Params$BootStrapType,
    FeatureBorderType = "GreedyLogSum",
    SubSample = NULL,
    ScoreFunction = if (TaskType == "GPU") "NewtonL2" else "Cosine")

  # Timer End
  EndTime <- Sys.time()

  # Prepare data for evaluation
  # The target column returned with the forecast is renamed so that the merge
  # brings in the actual Weekly_Sales from the full data set
  Results <- CatBoostResults$Forecast
  data.table::setnames(Results, "Weekly_Sales", "bla")
  Results <- merge(Results, data, by = c("Store","Dept","Date"), all = FALSE)
  # Keep only the out of time rows.
  # NOTE(review): assumes the returned target is NA for the forecasted periods
  # and populated for the training periods - confirm against the CARMA output
  Results <- Results[is.na(bla)][, bla := NULL]

  # Create totals and subtotals
  Results <- data.table::groupingsets(
    x = Results,
    j = list(Predictions = sum(Predictions), Weekly_Sales = sum(Weekly_Sales)),
    by = c("Date", "Store", "Dept"),
    sets = list(c("Date", "Store", "Dept"), c("Store", "Dept"), "Store", "Dept", "Date"))

  # Fill NAs with "Total" for totals and subtotals
  # (a grouping column that is not part of a set comes back as NA)
  for (cols in c("Store","Dept")) {
    Results[is.na(get(cols)), (cols) := "Total"]
  }

  # Add error measures
  Results[, Weekly_MAE := abs(Weekly_Sales - Predictions)]
  Results[, Weekly_MAPE := Weekly_MAE / Weekly_Sales]

  # Weekly results
  Weekly_MAPE <- Results[, list(Weekly_MAPE = mean(Weekly_MAPE)), by = list(Store,Dept)]

  # Monthly results: roll the weekly rows up to calendar months before scoring
  temp <- data.table::copy(Results)
  temp[, Date := lubridate::floor_date(Date, unit = "months")]
  temp <- temp[, lapply(.SD, sum), by = c("Date","Store","Dept"), .SDcols = c("Predictions", "Weekly_Sales")]
  temp[, Monthly_MAE := abs(Weekly_Sales - Predictions)]
  temp[, Monthly_MAPE := Monthly_MAE / Weekly_Sales]
  Monthly_MAPE <- temp[, list(Monthly_MAPE = mean(Monthly_MAPE)), by = list(Store,Dept)]

  # Collect metrics for Total (feel free to switch to something else or no filter at all)
  # The tuning parameters are stored next to the scores so that each logged row
  # identifies the combination it belongs to. RunTime is logged in minutes;
  # a raw difftime picks its own units and loses them when written to csv.
  Metrics <- data.table::data.table(
    RunNumber = Run,
    Total_Weekly_MAPE = Weekly_MAPE[Store == "Total" & Dept == "Total", Weekly_MAPE],
    Total_Monthly_MAPE = Monthly_MAPE[Store == "Total" & Dept == "Total", Monthly_MAPE],
    Params,
    RunTime = as.numeric(difftime(EndTime, StartTime, units = "mins")))

  # Append to file (not overwrite)
  data.table::fwrite(Metrics, file = file.path(Path, "Walmart_CARMA_Metrics.csv"), append = TRUE)

  # Remove objects (clear space before new runs)
  # Weekly_MAE / Monthly_MAE are columns, not objects, so they are not listed
  rm(CatBoostResults, Results, temp, Weekly_MAPE, Monthly_MAPE, Metrics, Params, xregs_new, data_new)

  # Garbage collection because of GPU
  gc()
}
