AdrianAntico/TuningAutoCatBoostCARMA.R

## TuningAutoCatBoostCARMA.R
<?R

# @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
# Out-of-Sample Feature + Grid Tuning of RemixAutoML::AutoCatBoostCARMA()
# @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@

# Output directory; the tuning results are appended to a .csv file in this folder
Path <- "C:/YourPathHere"

# Run on GPU or CPU (some options in the grid tuning force usage of CPU for some runs)
TaskType <- "GPU"

# Let data.table use all logical cores except two, but never fewer than one.
# The core count has to be passed through `threads`: `percent` expects a
# percentage (2-100) of the available CPUs, so handing it a core count either
# errors or restricts data.table to a small fraction of the machine.
# na.rm = TRUE covers platforms where detectCores() returns NA.
data.table::setDTthreads(threads = max(1L, parallel::detectCores() - 2L, na.rm = TRUE))

# Download the Walmart weekly sales training data
data <- data.table::fread("https://www.dropbox.com/s/2str3ek4f4cheqi/walmart_train.csv?dl=1")

# Fill gaps so no series is missing dates (series with more than 25% missing values are removed)
data <- RemixAutoML::TimeSeriesFill(
  data,
  DateColumnName = "Date",
  GroupVariables = c("Store", "Dept"),
  TimeUnit = "weeks",
  FillType = "maxmax",
  MaxMissingPercent = 0.25,
  SimpleImpute = TRUE)

# Floor negative sales at zero (updates data in place)
data[Weekly_Sales < 0, Weekly_Sales := 0]

# Drop the IsHoliday column
data[, IsHoliday := NULL]

# Store the group identifiers as character
data[, Store := as.character(Store)]
data[, Dept := as.character(Dept)]

# Create xregs (this is to include the categorical variables themselves
# instead of utilizing only the interaction of them)
xregs <- data[, c("Date", "Store", "Dept"), with = FALSE]

# Subset data so we have an out of time sample: number the rows of each
# Store / Dept series, keep the first 125 of them in data1, then remove the
# helper column from both tables
data[, ID := seq_len(.N), by = c("Store", "Dept")]
data1 <- data[ID <= 125L]
data1[, ID := NULL]
data[, ID := NULL]

# Row counts of the first series in the training subset (N1) and in the full
# date range (N2); used below for the SplitRatios and FC_Periods arguments
N1 <- data1[, .N, by = c("Store", "Dept")]$N[1L]
N2 <- xregs[, .N, by = c("Store", "Dept")]$N[1L]

# Setup Grid Tuning & Feature Tuning data.table using a cross join of vectors:
# one row per combination of the candidate values below. Vectors that mix a
# number with a text sentinel ("None", "NULL") are character columns and are
# converted back to their real type when a run reads them.
Tuning <- data.table::CJ(
  TimeWeights = c("None",0.999),
  MaxTimeGroups = c("weeks","months"),
  TargetTransformation = c("TRUE","FALSE"),
  Difference = c("TRUE","FALSE"),
  HoldoutTrain = c(6,18),
  Langevin = c("TRUE","FALSE"),
  NTrees = c(2500,5000),
  Depth = c(6,9),
  RandomStrength = c(0.75,1),
  L2_Leaf_Reg = c(3.0,4.0),
  RSM = c(0.75,"NULL"),
  GrowPolicy = c("SymmetricTree","Lossguide","Depthwise"),
  BootStrapType = c("Bayesian","MVS","No"))

# Remove options that are not compatible with GPU. The filter is only applied
# when TaskType is "GPU", so a CPU session keeps the full grid without having
# to edit the script.
if(identical(TaskType, "GPU")) {
  Tuning <- Tuning[Langevin == "TRUE" | (Langevin == "FALSE" & RSM == "NULL" & BootStrapType %in% c("Bayesian","No"))]
}

# Randomize order of Tuning data.table
Tuning <- Tuning[order(runif(.N))]

# Resume support: when a metrics file from an earlier session exists, keep
# only the tuning combinations that have no recorded result yet
if(file.exists(file.path(Path, "Walmart_CARMA_Metrics.csv"))) {
  Metrics <- data.table::fread(file.path(Path, "Walmart_CARMA_Metrics.csv"))

  # Stack the finished runs on top of the candidate grid; the grid rows have
  # no RunTime column, so fill = TRUE gives them NA there
  temp <- data.table::rbindlist(list(Metrics, Tuning), fill = TRUE)

  # De-duplicate on columns 4 through the second-to-last one; the first
  # occurrence is kept, so a finished run wins over its twin from the grid
  temp <- temp[!duplicated(temp, by = 4:(ncol(temp) - 1))]

  # Rows that still lack a RunTime are the combinations left to test
  Tuning <- temp[is.na(RunTime), names(Tuning), with = FALSE]
  rm(Metrics, temp)
}

# Define the total number of runs
TotalRuns <- nrow(Tuning)

# Kick off feature + grid tuning: every row of Tuning gets one model build,
# one forecast, one evaluation against the held-back actuals, and one line
# appended to the metrics file
for(Run in seq_len(TotalRuns)) {

  # Print the run number 100 times (presumably so it stands out in the console output)
  for(zz in seq_len(100)) print(Run)

  # Use fresh data for each run
  xregs_new <- data.table::copy(xregs)
  data_new <- data.table::copy(data1)

  # Settings of the current run (single-row data.table)
  RunArgs <- Tuning[Run]

  # The largest time group of the run decides which time groups, lags, and
  # moving averages are generated. With "weeks" only, plain vectors are passed;
  # otherwise named lists with one entry per time group.
  WeeklyLags <- c(1,2,3,4,5,8,9,12,13,51,52,53)
  WeeklyMAs <- c(2,3,4,5,8,9,12,13,51,52,53)
  if(RunArgs[["MaxTimeGroups"]] == "weeks") {
    RunTimeGroups <- "weeks"
    RunLags <- WeeklyLags
    RunMAs <- WeeklyMAs
  } else if(RunArgs[["MaxTimeGroups"]] == "months") {
    RunTimeGroups <- c("weeks","months")
    RunLags <- list("weeks" = WeeklyLags, "months" = c(1,2,6,12))
    RunMAs <- list("weeks" = WeeklyMAs, "months" = c(2,6,12))
  } else {
    RunTimeGroups <- c("weeks","months","quarters")
    RunLags <- list("weeks" = WeeklyLags, "months" = c(1,2,6,12), "quarters" = c(1,2,3,4))
    RunMAs <- list("weeks" = WeeklyMAs, "months" = c(2,6,12), "quarters" = c(2,3,4))
  }

  # Timer start
  StartTime <- Sys.time()

  # Run carma system
  CatBoostResults <- RemixAutoML::AutoCatBoostCARMA(

    # data args
    data = data_new,
    TimeWeights = if(RunArgs[["TimeWeights"]] == "None") NULL else as.numeric(RunArgs[["TimeWeights"]]),
    TargetColumnName = "Weekly_Sales",
    DateColumnName = "Date",
    HierarchGroups = NULL,
    GroupVariables = c("Store","Dept"),
    TimeUnit = "weeks",
    TimeGroups = RunTimeGroups,

    # Production args
    TrainOnFull = TRUE,
    SplitRatios = c(1 - RunArgs[["HoldoutTrain"]] / N2, RunArgs[["HoldoutTrain"]] / N2),
    PartitionType = "random",
    FC_Periods = N2-N1,
    TaskType = TaskType,
    NumGPU = 1,
    Timer = TRUE,
    DebugMode = TRUE,

    # Target variable transformations
    TargetTransformation = as.logical(RunArgs[["TargetTransformation"]]),
    Methods = c("BoxCox","Asinh","Log","LogPlus1","YeoJohnson"),
    Difference = as.logical(RunArgs[["Difference"]]),
    NonNegativePred = TRUE,
    RoundPreds = FALSE,

    # Calendar-related features
    CalendarVariables = c("week","wom","month","quarter"),
    HolidayVariable = c("USPublicHolidays"),
    HolidayLags = c(1,2,3),
    HolidayMovingAverages = c(2,3),

    # Lags, moving averages, and other rolling stats
    Lags = RunLags,
    MA_Periods = RunMAs,
    SD_Periods = NULL,
    Skew_Periods = NULL,
    Kurt_Periods = NULL,
    Quantile_Periods = NULL,
    Quantiles_Selected = NULL,

    # Bonus features
    AnomalyDetection = NULL,
    XREGS = xregs_new,
    FourierTerms = 0,
    TimeTrendVariable = TRUE,
    ZeroPadSeries = NULL,
    DataTruncate = FALSE,

    # ML insights
    PDFOutputPath = NULL,
    SaveDataPath = NULL,
    NumOfParDepPlots = 0L,

    # ML grid tuning args
    GridTune = FALSE,
    PassInGrid = NULL,
    ModelCount = 5,
    MaxRunsWithoutNewWinner = 50,
    MaxRunMinutes = 60*60,

    # ML loss functions
    EvalMetric = "RMSE",
    EvalMetricValue = 1,
    LossFunction = "RMSE",
    LossFunctionValue = 1,

    # ML tuning args
    NTrees = RunArgs[["NTrees"]],
    Depth = RunArgs[["Depth"]],
    Langevin = as.logical(RunArgs[["Langevin"]]),
    DiffusionTemperature = 10000,

    # ML overfitting args
    LearningRate = 0.03,
    L2_Leaf_Reg = RunArgs[["L2_Leaf_Reg"]],
    ModelSizeReg = 0.0,
    RSM = if(RunArgs[["RSM"]] == "NULL") NULL else as.numeric(RunArgs[["RSM"]]),
    MinDataInLeaf = 5,
    BorderCount = 254,
    RandomStrength = RunArgs[["RandomStrength"]],
    SamplingUnit = "Group",

    # ML styles
    GrowPolicy = RunArgs[["GrowPolicy"]],
    BootStrapType = RunArgs[["BootStrapType"]],
    FeatureBorderType = "GreedyLogSum",
    SubSample = NULL,
    ScoreFunction = if(TaskType == "GPU") "NewtonL2" else "Cosine")

  # Timer End
  EndTime <- Sys.time()

  # Prepare data for evaluation: the forecast's own Weekly_Sales column is
  # renamed so that the merge brings in the actuals from the full data set
  # under the original name. Only rows where the renamed column is NA are kept
  # (presumably the forecasted periods - TODO confirm against the CARMA output).
  Results <- CatBoostResults$Forecast
  data.table::setnames(Results, "Weekly_Sales", "bla")
  Results <- merge(Results, data, by = c("Store","Dept","Date"), all = FALSE)
  Results <- Results[is.na(bla)][, bla := NULL]

  # Create totals and subtotals: predictions and actuals are summed at five
  # levels (Date x Store x Dept, Store x Dept, Store, Dept, Date); grouping
  # columns that are not part of a level come back as NA
  Results <- data.table::groupingsets(
    x = Results,
    j = list(Predictions = sum(Predictions), Weekly_Sales = sum(Weekly_Sales)),
    by = c("Date", "Store", "Dept"),
    sets = list(c("Date", "Store", "Dept"), c("Store", "Dept"), "Store", "Dept", "Date"))

  # Fill NAs with "Total" for totals and subtotals
  for(cols in c("Store","Dept")) Results[, (cols) := data.table::fifelse(is.na(get(cols)), "Total", get(cols))]

  # Add error measures
  Results[, Weekly_MAE := abs(Weekly_Sales - Predictions)]
  Results[, Weekly_MAPE := Weekly_MAE / Weekly_Sales]

  # Weekly results
  Weekly_MAPE <- Results[, list(Weekly_MAPE = mean(Weekly_MAPE)), by = list(Store,Dept)]

  # Monthly results: floor every date to the start of its month, re-sum
  # predictions and actuals per month, then compute the error on the monthly sums
  temp <- data.table::copy(Results)
  temp <- temp[, Date := lubridate::floor_date(Date, unit = "months")]
  temp <- temp[, lapply(.SD, sum), by = c("Date","Store","Dept"), .SDcols = c("Predictions", "Weekly_Sales")]
  temp[, Monthly_MAE := abs(Weekly_Sales - Predictions)]
  temp[, Monthly_MAPE := Monthly_MAE / Weekly_Sales]
  Monthly_MAPE <- temp[, list(Monthly_MAPE = mean(Monthly_MAPE)), by = list(Store,Dept)]

  # Collect metrics for Total (feel free to switch to something else or no filter at all).
  # RunTime is always stored in minutes: a bare difftime picks its unit
  # (secs / mins / hours) per run, which makes the values in the .csv incomparable.
  Metrics <- data.table::data.table(
    RunNumber = Run,
    Total_Weekly_MAPE = Weekly_MAPE[Store == "Total" & Dept == "Total", Weekly_MAPE],
    Total_Monthly_MAPE = Monthly_MAPE[Store == "Total" & Dept == "Total", Monthly_MAPE],
    Tuning[Run],
    RunTime = as.numeric(difftime(EndTime, StartTime, units = "mins")))

  # Append to file (not overwrite)
  data.table::fwrite(Metrics, file = file.path(Path, "Walmart_CARMA_Metrics.csv"), append = TRUE)

  # Remove objects (clear space before new runs). Weekly_MAE and Monthly_MAE
  # are columns, not objects, so they are not listed here (rm() would warn).
  rm(CatBoostResults, Results, temp, Weekly_MAPE, Monthly_MAPE)

  # Garbage collection because of GPU
  gc()
}
	<?R

	# @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	# Out-of-Sample Feature + Grid Tuning of RemixAutoML::AutoCatBoostCARMA()
	# @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@

	# Output directory; the tuning results are appended to a .csv file in this folder
	Path <- "C:/YourPathHere"

	# Run on GPU or CPU (some options in the grid tuning force usage of CPU for some runs)
	TaskType <- "GPU"

	# Let data.table use all logical cores except two, but never fewer than one.
	# The core count has to be passed through `threads`: `percent` expects a
	# percentage (2-100) of the available CPUs, so handing it a core count either
	# errors or restricts data.table to a small fraction of the machine.
	# na.rm = TRUE covers platforms where detectCores() returns NA.
	data.table::setDTthreads(threads = max(1L, parallel::detectCores() - 2L, na.rm = TRUE))

	# Download the Walmart weekly sales training data
	data <- data.table::fread("https://www.dropbox.com/s/2str3ek4f4cheqi/walmart_train.csv?dl=1")

	# Fill gaps so no series is missing dates (series with more than 25% missing values are removed)
	data <- RemixAutoML::TimeSeriesFill(
	  data,
	  DateColumnName = "Date",
	  GroupVariables = c("Store", "Dept"),
	  TimeUnit = "weeks",
	  FillType = "maxmax",
	  MaxMissingPercent = 0.25,
	  SimpleImpute = TRUE)

	# Floor negative sales at zero (updates data in place)
	data[Weekly_Sales < 0, Weekly_Sales := 0]

	# Drop the IsHoliday column
	data[, IsHoliday := NULL]

	# Store the group identifiers as character
	data[, Store := as.character(Store)]
	data[, Dept := as.character(Dept)]

	# Create xregs (this is to include the categorical variables themselves
	# instead of utilizing only the interaction of them)
	xregs <- data[, c("Date", "Store", "Dept"), with = FALSE]

	# Subset data so we have an out of time sample: number the rows of each
	# Store / Dept series, keep the first 125 of them in data1, then remove the
	# helper column from both tables
	data[, ID := seq_len(.N), by = c("Store", "Dept")]
	data1 <- data[ID <= 125L]
	data1[, ID := NULL]
	data[, ID := NULL]

	# Row counts of the first series in the training subset (N1) and in the full
	# date range (N2); used below for the SplitRatios and FC_Periods arguments
	N1 <- data1[, .N, by = c("Store", "Dept")]$N[1L]
	N2 <- xregs[, .N, by = c("Store", "Dept")]$N[1L]

	# Setup Grid Tuning & Feature Tuning data.table using a cross join of vectors:
	# one row per combination of the candidate values below. Vectors that mix a
	# number with a text sentinel ("None", "NULL") are character columns and are
	# converted back to their real type when a run reads them.
	Tuning <- data.table::CJ(
	  TimeWeights = c("None",0.999),
	  MaxTimeGroups = c("weeks","months"),
	  TargetTransformation = c("TRUE","FALSE"),
	  Difference = c("TRUE","FALSE"),
	  HoldoutTrain = c(6,18),
	  Langevin = c("TRUE","FALSE"),
	  NTrees = c(2500,5000),
	  Depth = c(6,9),
	  RandomStrength = c(0.75,1),
	  L2_Leaf_Reg = c(3.0,4.0),
	  RSM = c(0.75,"NULL"),
	  GrowPolicy = c("SymmetricTree","Lossguide","Depthwise"),
	  BootStrapType = c("Bayesian","MVS","No"))

	# Remove options that are not compatible with GPU. The filter is only applied
	# when TaskType is "GPU", so a CPU session keeps the full grid without having
	# to edit the script. (The logical OR must be a plain `|`; a backslash-escaped
	# pipe is not valid R.)
	if(identical(TaskType, "GPU")) {
	  Tuning <- Tuning[Langevin == "TRUE" | (Langevin == "FALSE" & RSM == "NULL" & BootStrapType %in% c("Bayesian","No"))]
	}

	# Randomize order of Tuning data.table
	Tuning <- Tuning[order(runif(.N))]

	# Resume support: when a metrics file from an earlier session exists, keep
	# only the tuning combinations that have no recorded result yet
	if(file.exists(file.path(Path, "Walmart_CARMA_Metrics.csv"))) {
	  Metrics <- data.table::fread(file.path(Path, "Walmart_CARMA_Metrics.csv"))

	  # Stack the finished runs on top of the candidate grid; the grid rows have
	  # no RunTime column, so fill = TRUE gives them NA there
	  temp <- data.table::rbindlist(list(Metrics, Tuning), fill = TRUE)

	  # De-duplicate on columns 4 through the second-to-last one; the first
	  # occurrence is kept, so a finished run wins over its twin from the grid
	  temp <- temp[!duplicated(temp, by = 4:(ncol(temp) - 1))]

	  # Rows that still lack a RunTime are the combinations left to test
	  Tuning <- temp[is.na(RunTime), names(Tuning), with = FALSE]
	  rm(Metrics, temp)
	}

	# Define the total number of runs
	TotalRuns <- nrow(Tuning)

	# Kick off feature + grid tuning: every row of Tuning gets one model build,
	# one forecast, one evaluation against the held-back actuals, and one line
	# appended to the metrics file
	for(Run in seq_len(TotalRuns)) {

	  # Print the run number 100 times (presumably so it stands out in the console output)
	  for(zz in seq_len(100)) print(Run)

	  # Use fresh data for each run
	  xregs_new <- data.table::copy(xregs)
	  data_new <- data.table::copy(data1)

	  # Settings of the current run (single-row data.table)
	  RunArgs <- Tuning[Run]

	  # The largest time group of the run decides which time groups, lags, and
	  # moving averages are generated. With "weeks" only, plain vectors are passed;
	  # otherwise named lists with one entry per time group.
	  WeeklyLags <- c(1,2,3,4,5,8,9,12,13,51,52,53)
	  WeeklyMAs <- c(2,3,4,5,8,9,12,13,51,52,53)
	  if(RunArgs[["MaxTimeGroups"]] == "weeks") {
	    RunTimeGroups <- "weeks"
	    RunLags <- WeeklyLags
	    RunMAs <- WeeklyMAs
	  } else if(RunArgs[["MaxTimeGroups"]] == "months") {
	    RunTimeGroups <- c("weeks","months")
	    RunLags <- list("weeks" = WeeklyLags, "months" = c(1,2,6,12))
	    RunMAs <- list("weeks" = WeeklyMAs, "months" = c(2,6,12))
	  } else {
	    RunTimeGroups <- c("weeks","months","quarters")
	    RunLags <- list("weeks" = WeeklyLags, "months" = c(1,2,6,12), "quarters" = c(1,2,3,4))
	    RunMAs <- list("weeks" = WeeklyMAs, "months" = c(2,6,12), "quarters" = c(2,3,4))
	  }

	  # Timer start
	  StartTime <- Sys.time()

	  # Run carma system
	  CatBoostResults <- RemixAutoML::AutoCatBoostCARMA(

	    # data args
	    data = data_new,
	    TimeWeights = if(RunArgs[["TimeWeights"]] == "None") NULL else as.numeric(RunArgs[["TimeWeights"]]),
	    TargetColumnName = "Weekly_Sales",
	    DateColumnName = "Date",
	    HierarchGroups = NULL,
	    GroupVariables = c("Store","Dept"),
	    TimeUnit = "weeks",
	    TimeGroups = RunTimeGroups,

	    # Production args
	    TrainOnFull = TRUE,
	    SplitRatios = c(1 - RunArgs[["HoldoutTrain"]] / N2, RunArgs[["HoldoutTrain"]] / N2),
	    PartitionType = "random",
	    FC_Periods = N2-N1,
	    TaskType = TaskType,
	    NumGPU = 1,
	    Timer = TRUE,
	    DebugMode = TRUE,

	    # Target variable transformations
	    TargetTransformation = as.logical(RunArgs[["TargetTransformation"]]),
	    Methods = c("BoxCox","Asinh","Log","LogPlus1","YeoJohnson"),
	    Difference = as.logical(RunArgs[["Difference"]]),
	    NonNegativePred = TRUE,
	    RoundPreds = FALSE,

	    # Calendar-related features
	    CalendarVariables = c("week","wom","month","quarter"),
	    HolidayVariable = c("USPublicHolidays"),
	    HolidayLags = c(1,2,3),
	    HolidayMovingAverages = c(2,3),

	    # Lags, moving averages, and other rolling stats
	    Lags = RunLags,
	    MA_Periods = RunMAs,
	    SD_Periods = NULL,
	    Skew_Periods = NULL,
	    Kurt_Periods = NULL,
	    Quantile_Periods = NULL,
	    Quantiles_Selected = NULL,

	    # Bonus features
	    AnomalyDetection = NULL,
	    XREGS = xregs_new,
	    FourierTerms = 0,
	    TimeTrendVariable = TRUE,
	    ZeroPadSeries = NULL,
	    DataTruncate = FALSE,

	    # ML insights
	    PDFOutputPath = NULL,
	    SaveDataPath = NULL,
	    NumOfParDepPlots = 0L,

	    # ML grid tuning args
	    GridTune = FALSE,
	    PassInGrid = NULL,
	    ModelCount = 5,
	    MaxRunsWithoutNewWinner = 50,
	    MaxRunMinutes = 60*60,

	    # ML loss functions
	    EvalMetric = "RMSE",
	    EvalMetricValue = 1,
	    LossFunction = "RMSE",
	    LossFunctionValue = 1,

	    # ML tuning args
	    NTrees = RunArgs[["NTrees"]],
	    Depth = RunArgs[["Depth"]],
	    Langevin = as.logical(RunArgs[["Langevin"]]),
	    DiffusionTemperature = 10000,

	    # ML overfitting args
	    LearningRate = 0.03,
	    L2_Leaf_Reg = RunArgs[["L2_Leaf_Reg"]],
	    ModelSizeReg = 0.0,
	    RSM = if(RunArgs[["RSM"]] == "NULL") NULL else as.numeric(RunArgs[["RSM"]]),
	    MinDataInLeaf = 5,
	    BorderCount = 254,
	    RandomStrength = RunArgs[["RandomStrength"]],
	    SamplingUnit = "Group",

	    # ML styles
	    GrowPolicy = RunArgs[["GrowPolicy"]],
	    BootStrapType = RunArgs[["BootStrapType"]],
	    FeatureBorderType = "GreedyLogSum",
	    SubSample = NULL,
	    ScoreFunction = if(TaskType == "GPU") "NewtonL2" else "Cosine")

	  # Timer End
	  EndTime <- Sys.time()

	  # Prepare data for evaluation: the forecast's own Weekly_Sales column is
	  # renamed so that the merge brings in the actuals from the full data set
	  # under the original name. Only rows where the renamed column is NA are kept
	  # (presumably the forecasted periods - TODO confirm against the CARMA output).
	  Results <- CatBoostResults$Forecast
	  data.table::setnames(Results, "Weekly_Sales", "bla")
	  Results <- merge(Results, data, by = c("Store","Dept","Date"), all = FALSE)
	  Results <- Results[is.na(bla)][, bla := NULL]

	  # Create totals and subtotals: predictions and actuals are summed at five
	  # levels (Date x Store x Dept, Store x Dept, Store, Dept, Date); grouping
	  # columns that are not part of a level come back as NA
	  Results <- data.table::groupingsets(
	    x = Results,
	    j = list(Predictions = sum(Predictions), Weekly_Sales = sum(Weekly_Sales)),
	    by = c("Date", "Store", "Dept"),
	    sets = list(c("Date", "Store", "Dept"), c("Store", "Dept"), "Store", "Dept", "Date"))

	  # Fill NAs with "Total" for totals and subtotals
	  for(cols in c("Store","Dept")) Results[, (cols) := data.table::fifelse(is.na(get(cols)), "Total", get(cols))]

	  # Add error measures
	  Results[, Weekly_MAE := abs(Weekly_Sales - Predictions)]
	  Results[, Weekly_MAPE := Weekly_MAE / Weekly_Sales]

	  # Weekly results
	  Weekly_MAPE <- Results[, list(Weekly_MAPE = mean(Weekly_MAPE)), by = list(Store,Dept)]

	  # Monthly results: floor every date to the start of its month, re-sum
	  # predictions and actuals per month, then compute the error on the monthly sums
	  temp <- data.table::copy(Results)
	  temp <- temp[, Date := lubridate::floor_date(Date, unit = "months")]
	  temp <- temp[, lapply(.SD, sum), by = c("Date","Store","Dept"), .SDcols = c("Predictions", "Weekly_Sales")]
	  temp[, Monthly_MAE := abs(Weekly_Sales - Predictions)]
	  temp[, Monthly_MAPE := Monthly_MAE / Weekly_Sales]
	  Monthly_MAPE <- temp[, list(Monthly_MAPE = mean(Monthly_MAPE)), by = list(Store,Dept)]

	  # Collect metrics for Total (feel free to switch to something else or no filter at all).
	  # RunTime is always stored in minutes: a bare difftime picks its unit
	  # (secs / mins / hours) per run, which makes the values in the .csv incomparable.
	  Metrics <- data.table::data.table(
	    RunNumber = Run,
	    Total_Weekly_MAPE = Weekly_MAPE[Store == "Total" & Dept == "Total", Weekly_MAPE],
	    Total_Monthly_MAPE = Monthly_MAPE[Store == "Total" & Dept == "Total", Monthly_MAPE],
	    Tuning[Run],
	    RunTime = as.numeric(difftime(EndTime, StartTime, units = "mins")))

	  # Append to file (not overwrite)
	  data.table::fwrite(Metrics, file = file.path(Path, "Walmart_CARMA_Metrics.csv"), append = TRUE)

	  # Remove objects (clear space before new runs). Weekly_MAE and Monthly_MAE
	  # are columns, not objects, so they are not listed here (rm() would warn).
	  rm(CatBoostResults, Results, temp, Weekly_MAPE, Monthly_MAPE)

	  # Garbage collection because of GPU
	  gc()
	}