---
title: "Workflow Examples"
output: html_notebook
---
## Train-test workflow examples

Workflow examples of using `tidymodels`, plus two utility functions: a simple table of metrics across multiple models, and an effect plot for a logistic regression model. Two train-test examples are planned: the first (linear regression) is worked below; the second (logistic regression) is still TBD.
### Linear regression `tidymodel` workflow

```{r linear-train-test-split}
# https://www.tidyverse.org/blog/2018/11/parsnip-0-0-1/
library(tidyverse)
library(broom)
library(parsnip)
library(tidymodels)
library(patchwork)

set.seed(123)
pol <- rio::import("http://www.stat.columbia.edu/~gelman/arm/examples/pollution/pollution.dta")
# Step 1: Data resampling -------------------------------------------------------
# create training and test datasets. First create a data split object from your
# dataset, e.g. `initial_split(leads_df, strata = purchased)`, and then pass the
# object into `training()` and `testing()`
input_data <- pol # pol[-37, ] # optionally drop row 37
# Split the dataset, stratified on the outcome; `prop` is the share of rows that
# goes to the training set
pol_split <- initial_split(input_data, prop = 3/10, strata = mort)
# Set training and test datasets
pol_train <- training(pol_split)
pol_test <- testing(pol_split)

# Step 2: Model specification --------------------------------------------------
# specify model with parsnip (e.g., `logistic_reg()` or `linear_reg()`) then
# `set_engine()` followed by `set_mode()`.
lm_mort_model <- linear_reg() %>%
  set_engine('lm') %>%
  set_mode('regression')

# Step 3: Feature engineering --------------------------------------------------
# build feature engineering pipeline by specifying a recipe object that accepts
# a formula and the training dataset. Add preprocessing steps like `step_corr()`,
# `step_normalize()` and `step_dummy()`.
mort_recipe <- recipe(mort ~ nox + hc + so2, data = pol_train) %>%
  step_log(all_predictors())
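
# A fuller recipe sketch using the steps mentioned above (illustrative only --
# an assumption added for reference, not used downstream in this notebook)
# mort_recipe_full <- recipe(mort ~ ., data = pol_train) %>%
#   step_corr(all_numeric_predictors(), threshold = 0.9) %>%
#   step_normalize(all_numeric_predictors())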

# Step 4: Recipe training -------------------------------------------------------
# train the feature engineering steps on the training data by passing the `recipe`
# object to `prep()` and setting the `training` parameter to the training dataset
mort_recipe_prep <- prep(mort_recipe, training = pol_train)

# Step 5: Preprocess training data ----------------------------------------------
# apply the trained `recipe` to the training data and save the results for model
# fitting with `bake(new_data = NULL)`. Then also apply the trained `recipe` to
# the test dataset and save the results for model evaluation.
mort_training_prep <- bake(mort_recipe_prep, new_data = NULL)
mort_testing_prep <- bake(mort_recipe_prep, new_data = pol_test)

# Step 6: Model fitting and predictions -----------------------------------------
# train the linear regression model from step 2 with `fit()` using the preprocessed
# training dataset, then obtain model predictions on the preprocessed test dataset
# with `predict()`, which returns a `.pred` column. (For classification models,
# use `type = 'class'` for predicted classes and `type = 'prob'` for estimated
# probabilities.)
lm_mort_fit <- lm_mort_model %>%
  # last_fit(mort ~ nox + hc + so2, split = pol_split)
  fit(mort ~ nox + hc + so2, data = mort_training_prep)
mort_predictions <- predict(lm_mort_fit, new_data = mort_testing_prep)

# Step 7: Combine prediction results --------------------------------------------
# combine predictions into a results dataset for the `yardstick` metric functions.
# Starting from the split test dataset from step 1 or the preprocessed test dataset
# from step 5, select the outcome variable and then `bind_cols()` the predictions
# from step 6 (here a single `.pred` column; for classifiers, the class and
# probability predictions). This produces a model results dataframe with all the
# columns the `yardstick` metric functions require.
pol_test_results <- mort_testing_prep %>% # pol_test %>% # optionally use the un-processed data
  select(mort, hc, nox, so2) %>%
  bind_cols(mort_predictions)

# Step 8: Model evaluation ------------------------------------------------------
# Using the model results data from step 7, you can calculate performance metrics
# (for this regression, e.g. RMSE and R-squared; for a classifier, a confusion
# matrix, sensitivity, specificity, etc.). The advantage of this workflow is that
# you're able to use feature engineering and incorporate all predictor variables.
ggplot(pol_test_results, aes(x = mort, y = .pred)) +
  geom_point(alpha = 0.5) +
  geom_abline(color = 'blue', linetype = 2) +
  coord_obs_pred() +
  labs(title = "Observed vs Predicted Mortality Rates",
       subtitle = "Testing Dataset",
       x = 'Observed Mortality Rate', y = 'Predicted Mortality Rate') +
  theme_minimal()
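
# A quick numeric check with `yardstick` regression metrics (a sketch added for
# illustration; the choice of metrics is an assumption, not part of the original gist)
mort_metrics <- metric_set(rmse, rsq, mae)
mort_metrics(pol_test_results, truth = mort, estimate = .pred)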
```

### Logistic regression `tidymodel` workflow

TBD!
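
In the meantime, a minimal sketch of the same step 1-8 pattern for a classifier is below. It assumes a hypothetical `leads_df` dataframe with a binary factor outcome `purchased` (the dataset named in the step 1 comment above); none of these object names come from the original gist, so the chunk is not evaluated.

```{r logistic-workflow-sketch, eval=FALSE}
# Step 1: split, stratified on the outcome so class balance is preserved
leads_split <- initial_split(leads_df, prop = 3/4, strata = purchased)
leads_train <- training(leads_split)
leads_test <- testing(leads_split)

# Step 2: model specification
logit_model <- logistic_reg() %>%
  set_engine('glm') %>%
  set_mode('classification')

# Steps 3-5: recipe, prep, bake
leads_recipe_prep <- recipe(purchased ~ ., data = leads_train) %>%
  step_normalize(all_numeric_predictors()) %>%
  step_dummy(all_nominal_predictors()) %>%
  prep(training = leads_train)
leads_training_prep <- bake(leads_recipe_prep, new_data = NULL)
leads_testing_prep <- bake(leads_recipe_prep, new_data = leads_test)

# Step 6: fit, then predict classes and probabilities
logit_fit <- logit_model %>%
  fit(purchased ~ ., data = leads_training_prep)
class_preds <- predict(logit_fit, new_data = leads_testing_prep, type = 'class')
prob_preds <- predict(logit_fit, new_data = leads_testing_prep, type = 'prob')

# Step 7: results dataframe for yardstick
leads_results <- leads_testing_prep %>%
  select(purchased) %>%
  bind_cols(class_preds, prob_preds)

# Step 8: evaluation (the probability column name follows the event level of
# the outcome factor, e.g. `.pred_yes`)
conf_mat(leads_results, truth = purchased, estimate = .pred_class)
roc_auc(leads_results, truth = purchased, .pred_yes)
```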

## Utility functions

#### Compare model outputs in a simple table format

```{r compare-models-example}
library(knitr) # for kable(); not attached by the libraries loaded above

compare_models <- function(models){
  # number of fitted models supplied
  n_models <- length(models)
  map_df(.x = models,
         .f = function(fit){
           # recover the model formula as a single string
           formula <- capture.output(fit$terms)[1]
           glance(fit) %>%
             mutate(
               logLik = as.numeric(logLik),
               formula = formula)
         }) %>%
    # one glance() row per model; number them in the order supplied
    mutate(model_num = rep(1:n_models)) %>%
    relocate(model_num, .before = everything())
}

# Demo: the same fit passed twice, just to show the table's shape
fit.mort <- lm(mort ~ nox + hc + so2, data = mort_training_prep)
comparison1 <- compare_models(list(fit.mort, fit.mort))
comparison1 %>%
  dplyr::select(model_num, logLik, AIC, BIC, deviance, df.residual) %>%
  arrange(BIC) %>%
  kable(., "simple")
```

#### Effect plot function for logistic regression fitting

```{r effect-plot-function}
logistic_effect_plot_summary <- function(fit, model_name = "Model 1", print = TRUE){
  # helper: dashed vertical reference line (default at 0)
  bvert <- function(v = 0) geom_vline(xintercept = v, linetype = "dashed", color = "gray60")
  # recover the model formula as a single string for the subtitle
  formula <- capture.output(fit$terms)[1]
  model_summary <- summary(fit)
  # tidy coefficients with approximate 95% intervals (estimate +/- 2 SE),
  # plus exponentiated versions for the odds-ratio scale
  model_tidy <- tidy(fit) %>%
    mutate(loCI = estimate - 2 * std.error,
           upCI = estimate + 2 * std.error) %>%
    mutate(across(c(estimate, std.error, loCI, upCI),
                  .fns = list("exp" = ~exp(.))))
  # raw_effect_plot <- model_tidy %>%
  #   filter(term != "(Intercept)") %>%
  #   ggplot(aes(x = estimate, y = term)) +
  #   geom_pointrange(aes(xmin = loCI, xmax = upCI),
  #                   size = 0.2, linewidth = 0.4) +
  #   bvert() +
  #   theme_minimal() +
  #   labs(x = "Log-Odds (β)", y = "", #color = "Model",
  #        title = paste0("Coefficient Estimate β of ", model_name),
  #        subtitle = formula)
  # odds-ratio plot: exponentiated coefficients with a reference line at 1
  effect_plot <- model_tidy %>%
    filter(term != "(Intercept)") %>%
    ggplot(aes(x = exp(estimate), y = term)) +
    geom_pointrange(aes(xmin = exp(loCI), xmax = exp(upCI)),
                    size = 0.2, linewidth = 0.4) +
    bvert(1) + scale_x_log10() +
    theme_minimal() +
    labs(x = "Odds-Ratio exp(β)", y = "", #color = "Model",
         title = paste0("Odds-Ratio exp(β) for ", model_name),
         subtitle = formula
    )
  # if(print == TRUE){print(effect_plot)}
  return(list(model_summary = model_summary,
              model_tidy = model_tidy,
              effect_plot = effect_plot))
}
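
# Self-contained check of the function (assumption: a throwaway logistic fit on
# mtcars, purely to illustrate usage -- not one of the gist's models)
m_demo <- glm(am ~ wt + hp, data = mtcars, family = binomial)
m_demo_summary <- logistic_effect_plot_summary(m_demo, model_name = "mtcars demo")
m_demo_summary$effect_plot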

## Insert your models here
m4s <- logistic_effect_plot_summary(m4); m4s$effect_plot
m9s <- logistic_effect_plot_summary(m9); m9s$effect_plot
m4s$effect_plot / m9s$effect_plot
```