Skip to content

Instantly share code, notes, and snippets.

@jpfitzinger
Last active February 24, 2024 21:34
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jpfitzinger/84e834f8aafb1e9dd5ce4049286e8d3c to your computer and use it in GitHub Desktop.
Save jpfitzinger/84e834f8aafb1e9dd5ce4049286e8d3c to your computer and use it in GitHub Desktop.
tidyfit prediction workflow in R to compare the hierarchical feature regression to other linear and nonlinear regression methods
library(tidyverse) # data wrangling
library(rsample) # data splitting
library(ggpubr) # plotting
library(tidyfit) # statistical modeling
library(yardstick) # performance evaluation
# activate progress bars
progressr::handlers(global = TRUE)
# Boston house price data is available in the MASS package
data <- MASS::Boston
# function to split the data set into random training and testing samples
splitting_function <- function(data, seed) {
set.seed(seed)
# use 50% of the data for testing
splits <- initial_split(data, prop = 0.5)
# get training and testing samples
data_train <- training(splits)
data_test <- testing(splits)
# return a tibble
bind_rows(
mutate(data_train, DATASET = "TRAIN"),
mutate(data_test, DATASET = "TEST")
)
}
# the model_df is used for fitting and contains 50 random train-test splits
model_df <- map_dfr(1:50, ~splitting_function(data, .), .id = "ID")
# models can be fit using tidyfit::regress
# https://tidyfit.residualmetrics.com
# !!IMPORTANT!!
# The below includes some large models + cross-validation + 50 samples.
# Unless working on a very powerful machine it is better to fit models
# in smaller groups and using 'bind_rows()' to combine them afterwards.
linear_models <- model_df |>
# fit on training data
filter(DATASET == "TRAIN") |>
# when grouping by ID models will be fit separately for each group
group_by(ID) |>
# model fitting
regress(
# regress 'medv' on all variables incl. interactions
medv ~ .*.,
# the HFR model
HFR = m("hfr", kappa = seq(0, 1, by = 0.01)),
# some comparison methods
`Linear Regression` = m("lm"),
`Bayes. Model Averaging` = m("bma", iter = 10000),
`Forward Selection` = m("subset", method = "forward", IC = "AIC"),
`Spike & Slab Regression` = m("spikeslab", niter = 10000),
`Elastic Net` = m("enet"),
`Adaptive Lasso` = m("adalasso"),
`Gradient Boosting` = m("boost", mstop = c(50, 100, 200)),
`Principal Components` = m("pcr"),
`Partial Least Squares` = m("plsr"),
`Genetic Algorithm` = m("genetic", populationSize = 1000, numGenerations = 50,
statistic = "AIC", maxVariables = 20),
# additional args for CV routine to determine hyperparameters
.cv = "vfold_cv", .cv_args = list(v = 10),
# ignore DATASET column
.mask = "DATASET")
# fit the random forecast separately since it constructs interaction internally
nonlinear_models <- model_df |>
filter(DATASET == "TRAIN") |>
group_by(ID) |>
regress(medv ~ ., `Random Forest` = m("rf"), .cv = "vfold_cv", .cv_args = list(v = 10),
.mask = "DATASET")
fitted_models <- bind_rows(linear_models, nonlinear_models)
# generate predictions
predict_df <- predict(fitted_models, filter(model_df, DATASET == "TEST"))
eval_df <- rmse(predict_df, truth, prediction) |>
mutate(model = str_wrap(model, 10))
# arrange in descending order for the plot
order <- eval_df |>
group_by(model) |>
summarise(.estimate = median(.estimate)) |>
arrange(desc(.estimate)) |>
pull(model)
# plot
eval_df |>
ggerrorplot("model", ".estimate", desc_stat = "median_q1q3", order = order,
error.plot = "errorbar", size = 1.5, width = .25) |>
ggpar(ggtheme = theme_bw(), ylab = "Out-of-sample RMSE", xlab = FALSE, palette = "jco",
legend.title = "", legend = "bottom")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment