jpfitzinger/HierarchicalFeatureRegression_BostonHousePrice_Example.R Secret

## HierarchicalFeatureRegression_BostonHousePrice_Example.R
library(tidyverse) # data wrangling
library(rsample)   # data splitting
library(ggpubr)    # plotting
library(tidyfit)   # statistical modeling
library(yardstick) # performance evaluation

# activate progress bars
progressr::handlers(global = TRUE)

# Boston house price data is available in the MASS package
data <- MASS::Boston

# function to split the data set into random training and testing samples
splitting_function <- function(data, seed) {
  set.seed(seed)
  # use 50% of the data for testing
  splits <- initial_split(data, prop = 0.5)
  # get training and testing samples
  data_train <- training(splits)
  data_test <- testing(splits)
  # return a tibble
  bind_rows(
    mutate(data_train, DATASET = "TRAIN"),
    mutate(data_test, DATASET = "TEST")
  )
}

# the model_df is used for fitting and contains 50 random train-test splits
model_df <- map_dfr(1:50, ~splitting_function(data, .), .id = "ID")

# models can be fit using tidyfit::regress
# https://tidyfit.residualmetrics.com

# !!IMPORTANT!!
# The below includes some large models + cross-validation + 50 samples.
# Unless working on a very powerful machine it is better to fit models
# in smaller groups and using 'bind_rows()' to combine them afterwards.

linear_models <- model_df |>
  # fit on training data
  filter(DATASET == "TRAIN") |>
  # when grouping by ID models will be fit separately for each group
  group_by(ID) |>
  # model fitting
  regress(
    # regress 'medv' on all variables incl. interactions
    medv ~ .*.,
    # the HFR model
    HFR = m("hfr", kappa = seq(0, 1, by = 0.01)),
    # some comparison methods
    `Linear Regression` = m("lm"),
    `Bayes. Model Averaging` = m("bma", iter = 10000),
    `Forward Selection` = m("subset", method = "forward", IC = "AIC"),
    `Spike & Slab Regression` = m("spikeslab", niter = 10000),
    `Elastic Net` = m("enet"),
    `Adaptive Lasso` = m("adalasso"),
    `Gradient Boosting` = m("boost", mstop = c(50, 100, 200)),
    `Principal Components` = m("pcr"),
    `Partial Least Squares` = m("plsr"),
    `Genetic Algorithm` = m("genetic", populationSize = 1000, numGenerations = 50,
                            statistic = "AIC", maxVariables = 20),
    # additional args for CV routine to determine hyperparameters
    .cv = "vfold_cv", .cv_args = list(v = 10),
    # ignore DATASET column
    .mask = "DATASET")

# fit the random forecast separately since it constructs interaction internally
nonlinear_models <- model_df |>
  filter(DATASET == "TRAIN") |>
  group_by(ID) |>
  regress(medv ~ ., `Random Forest` = m("rf"), .cv = "vfold_cv", .cv_args = list(v = 10),
          .mask = "DATASET")

fitted_models <- bind_rows(linear_models, nonlinear_models)

# generate predictions
predict_df <- predict(fitted_models, filter(model_df, DATASET == "TEST"))
eval_df <- rmse(predict_df, truth, prediction) |>
  mutate(model = str_wrap(model, 10))

# arrange in descending order for the plot
order <- eval_df |>
  group_by(model) |>
  summarise(.estimate = median(.estimate)) |>
  arrange(desc(.estimate)) |>
  pull(model)

# plot
eval_df |>
  ggerrorplot("model", ".estimate", desc_stat = "median_q1q3", order = order,
              error.plot = "errorbar", size = 1.5, width = .25) |>
  ggpar(ggtheme = theme_bw(), ylab = "Out-of-sample RMSE", xlab = FALSE, palette = "jco",
        legend.title = "", legend = "bottom")
	library(tidyverse) # data wrangling
	library(rsample) # data splitting
	library(ggpubr) # plotting
	library(tidyfit) # statistical modeling
	library(yardstick) # performance evaluation

	# activate progress bars
	progressr::handlers(global = TRUE)

	# Boston house price data is available in the MASS package
	data <- MASS::Boston

	# function to split the data set into random training and testing samples
	splitting_function <- function(data, seed) {
	set.seed(seed)
	# use 50% of the data for testing
	splits <- initial_split(data, prop = 0.5)
	# get training and testing samples
	data_train <- training(splits)
	data_test <- testing(splits)
	# return a tibble
	bind_rows(
	mutate(data_train, DATASET = "TRAIN"),
	mutate(data_test, DATASET = "TEST")
	)
	}

	# the model_df is used for fitting and contains 50 random train-test splits
	model_df <- map_dfr(1:50, ~splitting_function(data, .), .id = "ID")

	# models can be fit using tidyfit::regress
	# https://tidyfit.residualmetrics.com

	# !!IMPORTANT!!
	# The below includes some large models + cross-validation + 50 samples.
	# Unless working on a very powerful machine it is better to fit models
	# in smaller groups and using 'bind_rows()' to combine them afterwards.

	linear_models <- model_df \|>
	# fit on training data
	filter(DATASET == "TRAIN") \|>
	# when grouping by ID models will be fit separately for each group
	group_by(ID) \|>
	# model fitting
	regress(
	# regress 'medv' on all variables incl. interactions
	medv ~ .*.,
	# the HFR model
	HFR = m("hfr", kappa = seq(0, 1, by = 0.01)),
	# some comparison methods
	`Linear Regression` = m("lm"),
	`Bayes. Model Averaging` = m("bma", iter = 10000),
	`Forward Selection` = m("subset", method = "forward", IC = "AIC"),
	`Spike & Slab Regression` = m("spikeslab", niter = 10000),
	`Elastic Net` = m("enet"),
	`Adaptive Lasso` = m("adalasso"),
	`Gradient Boosting` = m("boost", mstop = c(50, 100, 200)),
	`Principal Components` = m("pcr"),
	`Partial Least Squares` = m("plsr"),
	`Genetic Algorithm` = m("genetic", populationSize = 1000, numGenerations = 50,
	statistic = "AIC", maxVariables = 20),
	# additional args for CV routine to determine hyperparameters
	.cv = "vfold_cv", .cv_args = list(v = 10),
	# ignore DATASET column
	.mask = "DATASET")

	# fit the random forecast separately since it constructs interaction internally
	nonlinear_models <- model_df \|>
	filter(DATASET == "TRAIN") \|>
	group_by(ID) \|>
	regress(medv ~ ., `Random Forest` = m("rf"), .cv = "vfold_cv", .cv_args = list(v = 10),
	.mask = "DATASET")

	fitted_models <- bind_rows(linear_models, nonlinear_models)

	# generate predictions
	predict_df <- predict(fitted_models, filter(model_df, DATASET == "TEST"))
	eval_df <- rmse(predict_df, truth, prediction) \|>
	mutate(model = str_wrap(model, 10))

	# arrange in descending order for the plot
	order <- eval_df \|>
	group_by(model) \|>
	summarise(.estimate = median(.estimate)) \|>
	arrange(desc(.estimate)) \|>
	pull(model)

	# plot
	eval_df \|>
	ggerrorplot("model", ".estimate", desc_stat = "median_q1q3", order = order,
	error.plot = "errorbar", size = 1.5, width = .25) \|>
	ggpar(ggtheme = theme_bw(), ylab = "Out-of-sample RMSE", xlab = FALSE, palette = "jco",
	legend.title = "", legend = "bottom")