tidyfit prediction workflow in R comparing the hierarchical feature regression (HFR) to other linear and nonlinear regression methods
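All packages used below are available from CRAN. A minimal setup sketch (an assumption, not part of the original script; note that tidyfit dispatches to a backend package for each method, e.g. the hfr package for m("hfr"), which may need to be installed separately):

# install the packages used in this script
install.packages(c("tidyverse", "rsample", "ggpubr", "tidyfit",
                   "yardstick", "progressr", "MASS"))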
library(tidyverse)   # data wrangling
library(rsample)     # data splitting
library(ggpubr)      # plotting
library(tidyfit)     # statistical modeling
library(yardstick)   # performance evaluation

# activate progress bars
progressr::handlers(global = TRUE)

# Boston house price data is available in the MASS package
data <- MASS::Boston
# function to split the data set into random training and testing samples
splitting_function <- function(data, seed) {
  set.seed(seed)
  # use 50% of the data for testing
  splits <- initial_split(data, prop = 0.5)
  # get training and testing samples
  data_train <- training(splits)
  data_test <- testing(splits)
  # return a single tibble with both samples labeled
  bind_rows(
    mutate(data_train, DATASET = "TRAIN"),
    mutate(data_test, DATASET = "TEST")
  )
}

# the model_df is used for fitting and contains 50 random train-test splits
model_df <- map_dfr(1:50, ~splitting_function(data, .), .id = "ID")
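# each split is labeled by the ID column ("1" to "50"); since MASS::Boston
# has 506 rows, every split holds 253 training and 253 testing rows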
# models can be fit using tidyfit::regress
# https://tidyfit.residualmetrics.com

# !!IMPORTANT!!
# The code below fits some large models, with cross-validation, across 50
# samples. Unless working on a very powerful machine, it is better to fit
# the models in smaller batches and combine them afterwards with
# 'bind_rows()' (see the sketch after this pipeline).
linear_models <- model_df |>
  # fit on the training data
  filter(DATASET == "TRAIN") |>
  # when grouping by ID, models will be fit separately for each group
  group_by(ID) |>
  # model fitting
  regress(
    # regress 'medv' on all variables incl. pairwise interactions
    medv ~ .*.,
    # the HFR model
    HFR = m("hfr", kappa = seq(0, 1, by = 0.01)),
    # some comparison methods
    `Linear Regression` = m("lm"),
    `Bayes. Model Averaging` = m("bma", iter = 10000),
    `Forward Selection` = m("subset", method = "forward", IC = "AIC"),
    `Spike & Slab Regression` = m("spikeslab", niter = 10000),
    `Elastic Net` = m("enet"),
    `Adaptive Lasso` = m("adalasso"),
    `Gradient Boosting` = m("boost", mstop = c(50, 100, 200)),
    `Principal Components` = m("pcr"),
    `Partial Least Squares` = m("plsr"),
    `Genetic Algorithm` = m("genetic", populationSize = 1000, numGenerations = 50,
                            statistic = "AIC", maxVariables = 20),
    # additional args for the CV routine that determines hyperparameters
    .cv = "vfold_cv", .cv_args = list(v = 10),
    # ignore the DATASET column
    .mask = "DATASET")
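# sketch of the smaller-batch pattern mentioned above (left commented out so
# the full pipeline above remains the one that runs): fit a subset of the
# methods per regress() call and combine the results with bind_rows(); the
# assignment of methods to batches is arbitrary
# batch_1 <- model_df |>
#   filter(DATASET == "TRAIN") |>
#   group_by(ID) |>
#   regress(medv ~ .*., `Linear Regression` = m("lm"), `Elastic Net` = m("enet"),
#           .cv = "vfold_cv", .cv_args = list(v = 10), .mask = "DATASET")
# # ...fit the remaining methods in batch_2, batch_3, etc., then combine:
# linear_models <- bind_rows(batch_1, batch_2, batch_3)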
# fit the random forest separately since it constructs interactions internally
nonlinear_models <- model_df |>
  filter(DATASET == "TRAIN") |>
  group_by(ID) |>
  regress(medv ~ ., `Random Forest` = m("rf"),
          .cv = "vfold_cv", .cv_args = list(v = 10),
          .mask = "DATASET")
fitted_models <- bind_rows(linear_models, nonlinear_models)

# generate predictions on the testing data
predict_df <- predict(fitted_models, filter(model_df, DATASET == "TEST"))
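# predict() returns the stacked test-set predictions with a 'prediction'
# column and (since 'medv' is present in the new data) a 'truth' column,
# grouped by model and ID, so rmse() below yields one estimate per model
# and split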
# compute the out-of-sample RMSE and wrap long model names for plotting
eval_df <- rmse(predict_df, truth, prediction) |>
  mutate(model = str_wrap(model, 10))
# arrange the models in descending order of median RMSE for the plot
order <- eval_df |>
  group_by(model) |>
  summarise(.estimate = median(.estimate)) |>
  arrange(desc(.estimate)) |>
  pull(model)

# plot
eval_df |>
  ggerrorplot("model", ".estimate", desc_stat = "median_q1q3", order = order,
              error.plot = "errorbar", size = 1.5, width = .25) |>
  ggpar(ggtheme = theme_bw(), ylab = "Out-of-sample RMSE", xlab = FALSE, palette = "jco",
        legend.title = "", legend = "bottom")
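# the chart can be written to disk with ggsave, which defaults to the last
# displayed plot (the file name is arbitrary, not part of the original script)
ggsave("rmse_comparison.png", width = 9, height = 5)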