## topepo/regression_diag_plots.R

## regression_diag_plots.R
# Attach the modelling packages; `rules` is attached for the cubist_rules()
# model specification used further down.
library(tidymodels)
library(rules)
# Resolve function-name conflicts in favour of the tidymodels packages.
tidymodels_prefer()
# Use the black-and-white ggplot2 theme for every plot drawn below.
theme_set(theme_bw())

# Register the doMC parallel backend. Ask for at most 20 workers, but never
# more than the number of cores this machine reports; `na.rm = TRUE` keeps the
# cap of 20 when detectCores() cannot determine the count and returns NA.
library(doMC)
registerDoMC(cores = min(20L, parallel::detectCores(), na.rm = TRUE))

# ------------------------------------------------------------------------------


data(Chicago)

# Split in time order rather than at random: the training rows are passed to
# sliding_period() below, which needs `date` to be increasing, and a random
# split would also scatter test days among the training days. Sorting first
# makes the ordering explicit; initial_time_split() then keeps the leading
# rows for training and the trailing rows for testing. No seed is needed
# here because this split does not sample.
chi_split <-
  Chicago %>%
  arrange(date) %>%
  initial_time_split()
chi_train <- training(chi_split)
chi_test <- testing(chi_split)

# Seed kept from the original script so the RNG state seen by later steps is
# unchanged. NOTE(review): sliding_period() is driven by the index, so this
# seed presumably only matters downstream -- confirm.
set.seed(2)
# Sliding resamples over calendar months of `date`, advancing one period at a
# time, with the assessment window allowed to reach 100 periods ahead.
chi_folds <- sliding_period(
  chi_train,
  date,
  "month",
  assess_stop = 100,
  step = 1
)
# Print the resampling object for inspection.
chi_folds

# ------------------------------------------------------------------------------

# Model specification: a Cubist rule-based model with 25 committees and 9
# neighbors, fitted through the "Cubist" engine.
cubist_spec <- set_engine(
  cubist_rules(committees = 25, neighbors = 9),
  "Cubist"
)

# Preprocessing: `ridership` is the outcome and every other column starts out
# as a predictor. Date and holiday steps are applied to `date`, after which
# `date` itself is re-assigned to the "id" role.
chi_rec <- recipe(ridership ~ ., data = chi_train)
chi_rec <- step_date(chi_rec, date)
chi_rec <- step_holiday(chi_rec, date)
chi_rec <- update_role(chi_rec, date, new_role = "id")

# Bundle the model specification and the recipe into one workflow.
cubist_wflow <- add_recipe(add_model(workflow(), cubist_spec), chi_rec)

# ------------------------------------------------------------------------------

# Ask the resampling routine to keep its predictions; they are plotted below.
ctrl_rs <- control_resamples(save_pred = TRUE)

# Fit the workflow across the resamples in `chi_folds`.
cubist_res <- fit_resamples(
  cubist_wflow,
  resamples = chi_folds,
  control = ctrl_rs
)

# Augment the resampling results (the plots below read `.pred` and `.resid`
# from this table).
# NOTE(review): the name says "in sample", but these predictions come from the
# resampling results saved via save_pred = TRUE -- confirm the naming.
cubist_in_sample_predictions <- mutate(
  augment(cubist_res),
  # for demo, add the day of the week
  day = lubridate::wday(date, label = TRUE, abbr = FALSE)
)

# Final fit driven by the initial train/test split.
cubist_test_res <- last_fit(cubist_wflow, split = chi_split)

# Augmented predictions from the final fit.
cubist_test_predictions <- augment(cubist_test_res)

# ------------------------------------------------------------------------------

# Observed ridership (x) against predicted ridership (y), with a dashed
# reference line drawn by geom_abline().
ggplot(cubist_in_sample_predictions, aes(ridership, .pred)) +
  geom_abline(linetype = 2) +
  geom_point(alpha = 0.3) +
  coord_obs_pred() +
  ggtitle("Observed vs predicted")

# Residuals against predicted values, with a dashed horizontal line at zero.
ggplot(cubist_in_sample_predictions, aes(.pred, .resid)) +
  geom_hline(yintercept = 0, linetype = 2) +
  geom_point(alpha = 0.3) +
  ggtitle("Residuals vs predicted")

# ------------------------------------------------------------------------------

# Quantile-quantile plot of the residuals with a dashed reference line.
ggplot(cubist_in_sample_predictions, aes(sample = .resid)) +
  stat_qq_line(linetype = 2) +
  stat_qq(alpha = 0.2) +
  ggtitle("Normal probability plot")

# ------------------------------------------------------------------------------

# Residuals against a numeric predictor (Clark_Lake). The y aesthetic is
# `.resid`, so the title names residuals rather than the observed outcome.
cubist_in_sample_predictions %>%
  ggplot(aes(x = Clark_Lake, y = .resid)) +
  geom_point(alpha = .3) +
  ggtitle("Residuals vs numeric predictor")

# Residuals by day of the week; reorder() arranges the days on the y axis
# according to their residuals.
cubist_in_sample_predictions %>%
  ggplot(aes(y = reorder(day, .resid), x = .resid)) +
  geom_point(alpha = .3) +
  ylab("Day") +
  ggtitle("Residuals vs factor predictor")
	# NOTE(review): from here on the file repeats the script above verbatim
	# (tab-indented) -- confirm the duplication is intentional.
	# Attach the modelling packages; `rules` is attached for the cubist_rules()
	# model specification used further down.
	library(tidymodels)
	library(rules)
	# Resolve function-name conflicts in favour of the tidymodels packages.
	tidymodels_prefer()
	# Use the black-and-white ggplot2 theme for every plot drawn below.
	theme_set(theme_bw())

	# Register the doMC parallel backend. Ask for at most 20 workers, but never
	# more than the number of cores this machine reports; `na.rm = TRUE` keeps
	# the cap of 20 when detectCores() cannot determine the count and returns NA.
	library(doMC)
	registerDoMC(cores = min(20L, parallel::detectCores(), na.rm = TRUE))

	# ------------------------------------------------------------------------------


	data(Chicago)

	# Split in time order rather than at random: the training rows are passed to
	# sliding_period() below, which needs `date` to be increasing, and a random
	# split would also scatter test days among the training days. Sorting first
	# makes the ordering explicit; initial_time_split() then keeps the leading
	# rows for training and the trailing rows for testing. No seed is needed
	# here because this split does not sample.
	chi_split <-
	  Chicago %>%
	  arrange(date) %>%
	  initial_time_split()
	chi_train <- training(chi_split)
	chi_test <- testing(chi_split)

	# Seed kept from the original script so the RNG state seen by later steps is
	# unchanged. NOTE(review): sliding_period() is driven by the index, so this
	# seed presumably only matters downstream -- confirm.
	set.seed(2)
	# Sliding resamples over calendar months of `date`, advancing one period at
	# a time, with the assessment window allowed to reach 100 periods ahead.
	chi_folds <- sliding_period(
	  chi_train,
	  date,
	  "month",
	  assess_stop = 100,
	  step = 1
	)
	# Print the resampling object for inspection.
	chi_folds

	# ------------------------------------------------------------------------------

	# Model specification: a Cubist rule-based model with 25 committees and 9
	# neighbors, fitted through the "Cubist" engine.
	cubist_spec <- set_engine(
	  cubist_rules(committees = 25, neighbors = 9),
	  "Cubist"
	)

	# Preprocessing: `ridership` is the outcome and every other column starts
	# out as a predictor. Date and holiday steps are applied to `date`, after
	# which `date` itself is re-assigned to the "id" role.
	chi_rec <- recipe(ridership ~ ., data = chi_train)
	chi_rec <- step_date(chi_rec, date)
	chi_rec <- step_holiday(chi_rec, date)
	chi_rec <- update_role(chi_rec, date, new_role = "id")

	# Bundle the model specification and the recipe into one workflow.
	cubist_wflow <- add_recipe(add_model(workflow(), cubist_spec), chi_rec)

	# ------------------------------------------------------------------------------

	# Ask the resampling routine to keep its predictions; they are plotted below.
	ctrl_rs <- control_resamples(save_pred = TRUE)

	# Fit the workflow across the resamples in `chi_folds`.
	cubist_res <- fit_resamples(
	  cubist_wflow,
	  resamples = chi_folds,
	  control = ctrl_rs
	)

	# Augment the resampling results (the plots below read `.pred` and `.resid`
	# from this table).
	# NOTE(review): the name says "in sample", but these predictions come from
	# the resampling results saved via save_pred = TRUE -- confirm the naming.
	cubist_in_sample_predictions <- mutate(
	  augment(cubist_res),
	  # for demo, add the day of the week
	  day = lubridate::wday(date, label = TRUE, abbr = FALSE)
	)

	# Final fit driven by the initial train/test split.
	cubist_test_res <- last_fit(cubist_wflow, split = chi_split)

	# Augmented predictions from the final fit.
	cubist_test_predictions <- augment(cubist_test_res)

	# ------------------------------------------------------------------------------

	# Observed ridership (x) against predicted ridership (y), with a dashed
	# reference line drawn by geom_abline().
	ggplot(cubist_in_sample_predictions, aes(ridership, .pred)) +
	  geom_abline(linetype = 2) +
	  geom_point(alpha = 0.3) +
	  coord_obs_pred() +
	  ggtitle("Observed vs predicted")

	# Residuals against predicted values, with a dashed horizontal line at zero.
	ggplot(cubist_in_sample_predictions, aes(.pred, .resid)) +
	  geom_hline(yintercept = 0, linetype = 2) +
	  geom_point(alpha = 0.3) +
	  ggtitle("Residuals vs predicted")

	# ------------------------------------------------------------------------------

	# Quantile-quantile plot of the residuals with a dashed reference line.
	ggplot(cubist_in_sample_predictions, aes(sample = .resid)) +
	  stat_qq_line(linetype = 2) +
	  stat_qq(alpha = 0.2) +
	  ggtitle("Normal probability plot")

	# ------------------------------------------------------------------------------

	# Residuals against a numeric predictor (Clark_Lake). The y aesthetic is
	# `.resid`, so the title names residuals rather than the observed outcome.
	cubist_in_sample_predictions %>%
	  ggplot(aes(x = Clark_Lake, y = .resid)) +
	  geom_point(alpha = .3) +
	  ggtitle("Residuals vs numeric predictor")

	# Residuals by day of the week; reorder() arranges the days on the y axis
	# according to their residuals.
	cubist_in_sample_predictions %>%
	  ggplot(aes(y = reorder(day, .resid), x = .resid)) +
	  geom_point(alpha = .3) +
	  ylab("Day") +
	  ggtitle("Residuals vs factor predictor")