---
title: "Workflow Examples"
output: html_notebook
---
## Train-test workflow examples

Workflow examples of using `tidymodels`, plus two utility functions: a simple table of metrics across multiple models, and an effect plot for a logistic regression model. Two train-test examples are planned: the first (linear regression) is worked below; the second (logistic regression) is still TBD.
### Linear regression `tidymodel` workflow

```{r linear-train-test-split}
# https://www.tidyverse.org/blog/2018/11/parsnip-0-0-1/
library(tidyverse)
library(broom)
library(parsnip)
library(tidymodels)
library(patchwork)

set.seed(123)
pol <- rio::import("http://www.stat.columbia.edu/~gelman/arm/examples/pollution/pollution.dta")
# Step 1: Data resampling -------------------------------------------------------
# create training and test datasets. First create a data split object from your
# dataset, e.g. `initial_split(leads_df, strata = purchased)`, and then pass the
# object into `training()` and `testing()`
input_data <- pol # pol[-37, ] # optionally drop row 37
# Split the dataset, stratified on the outcome; `prop` is the share of rows that
# goes to the training set
pol_split <- initial_split(input_data, prop = 3/10, strata = mort)
# Set training and test datasets
pol_train <- training(pol_split)
pol_test <- testing(pol_split)

# Step 2: Model specification --------------------------------------------------
# specify model with parsnip (e.g., `logistic_reg()` or `linear_reg()`) then
# `set_engine()` followed by `set_mode()`.
lm_mort_model <- linear_reg() %>%
  set_engine('lm') %>%
  set_mode('regression')

# Step 3: Feature engineering --------------------------------------------------
# build feature engineering pipeline by specifying a recipe object that accepts
# a formula and the training dataset. Add preprocessing steps like `step_corr()`,
# `step_normalize()` and `step_dummy()`.
mort_recipe <- recipe(mort ~ nox + hc + so2, data = pol_train) %>%
  step_log(all_predictors())
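
# A fuller recipe sketch using the steps mentioned above (illustrative only --
# an assumption added for reference, not used downstream in this notebook)
# mort_recipe_full <- recipe(mort ~ ., data = pol_train) %>%
#   step_corr(all_numeric_predictors(), threshold = 0.9) %>%
#   step_normalize(all_numeric_predictors())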

# Step 4: Recipe training -------------------------------------------------------
# train the feature engineering steps on the training data by passing the `recipe`
# object to `prep()` and setting the `training` parameter to the training dataset
mort_recipe_prep <- prep(mort_recipe, training = pol_train)

# Step 5: Preprocess training data ----------------------------------------------
# apply the trained `recipe` to the training data and save the results for model
# fitting with `bake(new_data = NULL)`. Then also apply the trained `recipe` to
# the test dataset and save the results for model evaluation.
mort_training_prep <- bake(mort_recipe_prep, new_data = NULL)
mort_testing_prep <- bake(mort_recipe_prep, new_data = pol_test)

# Step 6: Model fitting and predictions -----------------------------------------
# train the linear regression model from step 2 with `fit()` using the preprocessed
# training dataset, then obtain model predictions on the preprocessed test dataset
# with `predict()`, which returns a `.pred` column. (For classification models,
# use `type = 'class'` for predicted classes and `type = 'prob'` for estimated
# probabilities.)
lm_mort_fit <- lm_mort_model %>%
  # last_fit(mort ~ nox + hc + so2, split = pol_split)
  fit(mort ~ nox + hc + so2, data = mort_training_prep)
mort_predictions <- predict(lm_mort_fit, new_data = mort_testing_prep)

# Step 7: Combine prediction results --------------------------------------------
# combine predictions into a results dataset for the `yardstick` metric functions.
# Starting from the split test dataset from step 1 or the preprocessed test dataset
# from step 5, select the outcome variable and then `bind_cols()` the predictions
# from step 6 (here a single `.pred` column; for classifiers, the class and
# probability predictions). This produces a model results dataframe with all the
# columns the `yardstick` metric functions require.
pol_test_results <- mort_testing_prep %>% # pol_test %>% # optionally use the un-processed data
  select(mort, hc, nox, so2) %>%
  bind_cols(mort_predictions)

# Step 8: Model evaluation ------------------------------------------------------
# Using the model results data from step 7, you can calculate performance metrics
# (for this regression, e.g. RMSE and R-squared; for a classifier, a confusion
# matrix, sensitivity, specificity, etc.). The advantage of this workflow is that
# you're able to use feature engineering and incorporate all predictor variables.
ggplot(pol_test_results, aes(x = mort, y = .pred)) +
  geom_point(alpha = 0.5) +
  geom_abline(color = 'blue', linetype = 2) +
  coord_obs_pred() +
  labs(title = "Observed vs Predicted Mortality Rates",
       subtitle = "Testing Dataset",
       x = 'Observed Mortality Rate', y = 'Predicted Mortality Rate') +
  theme_minimal()
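
# A quick numeric check with `yardstick` regression metrics (a sketch added for
# illustration; the choice of metrics is an assumption, not part of the original gist)
mort_metrics <- metric_set(rmse, rsq, mae)
mort_metrics(pol_test_results, truth = mort, estimate = .pred)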
```

### Logistic regression `tidymodel` workflow

TBD!
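
In the meantime, a minimal sketch of the same step 1-8 pattern for a classifier is below. It assumes a hypothetical `leads_df` dataframe with a binary factor outcome `purchased` (the dataset named in the step 1 comment above); none of these object names come from the original gist, so the chunk is not evaluated.

```{r logistic-workflow-sketch, eval=FALSE}
# Step 1: split, stratified on the outcome so class balance is preserved
leads_split <- initial_split(leads_df, prop = 3/4, strata = purchased)
leads_train <- training(leads_split)
leads_test <- testing(leads_split)

# Step 2: model specification
logit_model <- logistic_reg() %>%
  set_engine('glm') %>%
  set_mode('classification')

# Steps 3-5: recipe, prep, bake
leads_recipe_prep <- recipe(purchased ~ ., data = leads_train) %>%
  step_normalize(all_numeric_predictors()) %>%
  step_dummy(all_nominal_predictors()) %>%
  prep(training = leads_train)
leads_training_prep <- bake(leads_recipe_prep, new_data = NULL)
leads_testing_prep <- bake(leads_recipe_prep, new_data = leads_test)

# Step 6: fit, then predict classes and probabilities
logit_fit <- logit_model %>%
  fit(purchased ~ ., data = leads_training_prep)
class_preds <- predict(logit_fit, new_data = leads_testing_prep, type = 'class')
prob_preds <- predict(logit_fit, new_data = leads_testing_prep, type = 'prob')

# Step 7: results dataframe for yardstick
leads_results <- leads_testing_prep %>%
  select(purchased) %>%
  bind_cols(class_preds, prob_preds)

# Step 8: evaluation (the probability column name follows the event level of
# the outcome factor, e.g. `.pred_yes`)
conf_mat(leads_results, truth = purchased, estimate = .pred_class)
roc_auc(leads_results, truth = purchased, .pred_yes)
```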

## Utility functions

#### Compare model outputs in a simple table format

```{r compare-models-example}
library(knitr) # for kable(); not attached by the libraries loaded above

compare_models <- function(models){
  # number of fitted models supplied
  n_models <- length(models)
  map_df(.x = models,
         .f = function(fit){
           # recover the model formula as a single string
           formula <- capture.output(fit$terms)[1]
           glance(fit) %>%
             mutate(
               logLik = as.numeric(logLik),
               formula = formula)
         }) %>%
    # one glance() row per model; number them in the order supplied
    mutate(model_num = rep(1:n_models)) %>%
    relocate(model_num, .before = everything())
}

# Demo: the same fit passed twice, just to show the table's shape
fit.mort <- lm(mort ~ nox + hc + so2, data = mort_training_prep)
comparison1 <- compare_models(list(fit.mort, fit.mort))
comparison1 %>%
  dplyr::select(model_num, logLik, AIC, BIC, deviance, df.residual) %>%
  arrange(BIC) %>%
  kable(., "simple")
```

#### Effect plot function for logistic regression fitting

```{r effect-plot-function}
logistic_effect_plot_summary <- function(fit, model_name = "Model 1", print = TRUE){
  # helper: dashed vertical reference line (default at 0)
  bvert <- function(v = 0) geom_vline(xintercept = v, linetype = "dashed", color = "gray60")
  # recover the model formula as a single string for the subtitle
  formula <- capture.output(fit$terms)[1]
  model_summary <- summary(fit)
  # tidy coefficients with approximate 95% intervals (estimate +/- 2 SE),
  # plus exponentiated versions for the odds-ratio scale
  model_tidy <- tidy(fit) %>%
    mutate(loCI = estimate - 2 * std.error,
           upCI = estimate + 2 * std.error) %>%
    mutate(across(c(estimate, std.error, loCI, upCI),
                  .fns = list("exp" = ~exp(.))))
  # raw_effect_plot <- model_tidy %>%
  #   filter(term != "(Intercept)") %>%
  #   ggplot(aes(x = estimate, y = term)) +
  #   geom_pointrange(aes(xmin = loCI, xmax = upCI),
  #                   size = 0.2, linewidth = 0.4) +
  #   bvert() +
  #   theme_minimal() +
  #   labs(x = "Log-Odds (β)", y = "", #color = "Model",
  #        title = paste0("Coefficient Estimate β of ", model_name),
  #        subtitle = formula)
  # odds-ratio plot: exponentiated coefficients with a reference line at 1
  effect_plot <- model_tidy %>%
    filter(term != "(Intercept)") %>%
    ggplot(aes(x = exp(estimate), y = term)) +
    geom_pointrange(aes(xmin = exp(loCI), xmax = exp(upCI)),
                    size = 0.2, linewidth = 0.4) +
    bvert(1) + scale_x_log10() +
    theme_minimal() +
    labs(x = "Odds-Ratio exp(β)", y = "", #color = "Model",
         title = paste0("Odds-Ratio exp(β) for ", model_name),
         subtitle = formula
    )
  # if(print == TRUE){print(effect_plot)}
  return(list(model_summary = model_summary,
              model_tidy = model_tidy,
              effect_plot = effect_plot))
}
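
# Self-contained check of the function (assumption: a throwaway logistic fit on
# mtcars, purely to illustrate usage -- not one of the gist's models)
m_demo <- glm(am ~ wt + hp, data = mtcars, family = binomial)
m_demo_summary <- logistic_effect_plot_summary(m_demo, model_name = "mtcars demo")
m_demo_summary$effect_plot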

## Insert your models here
m4s <- logistic_effect_plot_summary(m4); m4s$effect_plot
m9s <- logistic_effect_plot_summary(m9); m9s$effect_plot
m4s$effect_plot / m9s$effect_plot
```