## topepo/two_class_diag_plots.R

## two_class_diag_plots.R
# Attach tidymodels, let its functions win any name conflicts, and make
# theme_bw() the default ggplot2 theme for the plots in this script.
library(tidymodels)
tidymodels_prefer()
theme_set(theme_bw())

# Register a doMC parallel backend (presumably picked up by the resampling
# step further down -- TODO confirm for the installed tune version).
# Ask for at most 20 workers, but never more than the machine reports, so a
# smaller machine is not oversubscribed. detectCores() may return NA; with
# na.rm = TRUE the request then falls back to the original value of 20.
library(doMC)
registerDoMC(cores = min(20L, parallel::detectCores(), na.rm = TRUE))

# ------------------------------------------------------------------------------

# Load the example data and create a reproducible training/test partition.
# Each random step is seeded separately so it can be re-run on its own.
data(ad_data)

set.seed(1)
ad_split <- ad_data %>% initial_split()
ad_train <- ad_split %>% training()
ad_test <- ad_split %>% testing()

# Five repeats of V-fold cross-validation, built from the training rows only.
set.seed(2)
ad_folds <- ad_train %>% vfold_cv(repeats = 5)

# ------------------------------------------------------------------------------

# Boosted classification trees using the C5.0 engine.
boost_spec <- set_mode(set_engine(boost_tree(), "C5.0"), "classification")

# Keep the predictions from every resample so they can be plotted later.
ctrl_rs <- control_resamples(save_pred = TRUE)

boost_res <- fit_resamples(
  boost_spec,
  Class ~ .,
  resamples = ad_folds,
  control = ctrl_rs
)

# Training-set rows with the saved resampling predictions attached.
# NOTE(review): despite the object name, these look like holdout predictions
# from the resamples rather than re-predictions of the fitted data -- confirm.
boost_in_sample_predictions <- augment(boost_res)

# Final fit driven by the initial split, with predictions attached.
boost_test_res <- last_fit(boost_spec, Class ~ ., split = ad_split)

boost_test_predictions <- augment(boost_test_res)

# ------------------------------------------------------------------------------

# Shared settings for the probability plots below.
prob_bins <- 0.025       # histogram bin width on the probability scale
prob_eps <- 0.001        # probabilities are clamped this far from 0 and 1
prob_breaks <- 2:9 / 10  # axis breaks used on the logit-scaled axes

# ------------------------------------------------------------------------------

# Histogram of the predicted probability of the "Impaired" class, one panel
# per true class.
#
# The axis range is set with coord_cartesian() instead of xlim(). xlim() sets
# scale limits, and ggplot2 removes (with only a warning) any bar whose edges
# fall outside them. With `binwidth` alone, bins are centered on multiples of
# the bin width, so the outermost bars -- predictions near 0 and 1 -- reach
# past [0, 1] and were being dropped. Zooming the coordinate system keeps
# every bar.
boost_in_sample_predictions %>%
  ggplot(aes(x = .pred_Impaired)) +
  geom_histogram(binwidth = prob_bins, col = "white") +
  facet_wrap(~ Class, labeller = labeller(Class = label_both), ncol = 1) +
  ggtitle("Predicted probabilities versus true class") +
  coord_cartesian(xlim = c(0, 1))

# Plot the confusion matrix of the true class against the predicted class.
autoplot(
  conf_mat(boost_in_sample_predictions, truth = Class, estimate = .pred_class)
)

# ------------------------------------------------------------------------------

# Predicted probability (logit-scaled y axis) against the `p_tau` column, one
# panel per true class.
#
# The logit is infinite at 0 and 1, so the probabilities are first squeezed
# into [prob_eps, 1 - prob_eps].
boost_in_sample_predictions %>%
  mutate(
    .pred_Impaired = pmin(pmax(.pred_Impaired, prob_eps), 1 - prob_eps)
  ) %>%
  ggplot(aes(x = p_tau, y = .pred_Impaired)) +
  geom_point() +
  # We should make a custom transformation that handles probs at 0 and 1
  scale_y_continuous(trans = scales::logit_trans(), breaks = prob_breaks) +
  facet_wrap(~ Class, labeller = labeller(Class = label_both), ncol = 1) +
  ggtitle("Predicted probabilities versus numeric variable")

# Predicted probability (logit-scaled x axis) for each level of `Genotype`,
# one panel per true class.
#
# As the logit is infinite at 0 and 1, probabilities are squeezed into
# [prob_eps, 1 - prob_eps] before plotting.
boost_in_sample_predictions %>%
  mutate(
    .pred_Impaired = pmin(pmax(.pred_Impaired, prob_eps), 1 - prob_eps)
  ) %>%
  ggplot(aes(y = Genotype, x = .pred_Impaired)) +
  geom_point() +
  scale_x_continuous(trans = scales::logit_trans(), breaks = prob_breaks) +
  facet_wrap(~ Class, labeller = labeller(Class = label_both), ncol = 1) +
  ggtitle("Predicted probabilities versus factor variable")

# ------------------------------------------------------------------------------

# ROC curve for the "Impaired" class probability.
autoplot(
  roc_curve(boost_in_sample_predictions, truth = Class, .pred_Impaired)
)

# Precision-recall curve for the same probability column.
autoplot(
  pr_curve(boost_in_sample_predictions, truth = Class, .pred_Impaired)
)
	# Attach tidymodels, let its functions win any name conflicts, and make
	# theme_bw() the default ggplot2 theme for the plots in this script.
	library(tidymodels)
	tidymodels_prefer()
	theme_set(theme_bw())

	# Register a doMC parallel backend (presumably picked up by the resampling
	# step further down -- TODO confirm for the installed tune version).
	# Ask for at most 20 workers, but never more than the machine reports, so a
	# smaller machine is not oversubscribed. detectCores() may return NA; with
	# na.rm = TRUE the request then falls back to the original value of 20.
	library(doMC)
	registerDoMC(cores = min(20L, parallel::detectCores(), na.rm = TRUE))

	# ------------------------------------------------------------------------------

	# Load the example data and create a reproducible training/test partition.
	# Each random step is seeded separately so it can be re-run on its own.
	data(ad_data)

	set.seed(1)
	ad_split <- ad_data %>% initial_split()
	ad_train <- ad_split %>% training()
	ad_test <- ad_split %>% testing()

	# Five repeats of V-fold cross-validation, built from the training rows only.
	set.seed(2)
	ad_folds <- ad_train %>% vfold_cv(repeats = 5)

	# ------------------------------------------------------------------------------

	# Boosted classification trees using the C5.0 engine.
	boost_spec <- set_mode(set_engine(boost_tree(), "C5.0"), "classification")

	# Keep the predictions from every resample so they can be plotted later.
	ctrl_rs <- control_resamples(save_pred = TRUE)

	boost_res <- fit_resamples(
	  boost_spec,
	  Class ~ .,
	  resamples = ad_folds,
	  control = ctrl_rs
	)

	# Training-set rows with the saved resampling predictions attached.
	# NOTE(review): despite the object name, these look like holdout predictions
	# from the resamples rather than re-predictions of the fitted data -- confirm.
	boost_in_sample_predictions <- augment(boost_res)

	# Final fit driven by the initial split, with predictions attached.
	boost_test_res <- last_fit(boost_spec, Class ~ ., split = ad_split)

	boost_test_predictions <- augment(boost_test_res)

	# ------------------------------------------------------------------------------

	# Shared settings for the probability plots below.
	prob_bins <- 0.025       # histogram bin width on the probability scale
	prob_eps <- 0.001        # probabilities are clamped this far from 0 and 1
	prob_breaks <- 2:9 / 10  # axis breaks used on the logit-scaled axes

	# ------------------------------------------------------------------------------

	# Histogram of the predicted probability of the "Impaired" class, one panel
	# per true class.
	#
	# The axis range is set with coord_cartesian() instead of xlim(). xlim() sets
	# scale limits, and ggplot2 removes (with only a warning) any bar whose edges
	# fall outside them. With `binwidth` alone, bins are centered on multiples of
	# the bin width, so the outermost bars -- predictions near 0 and 1 -- reach
	# past [0, 1] and were being dropped. Zooming the coordinate system keeps
	# every bar.
	boost_in_sample_predictions %>%
	  ggplot(aes(x = .pred_Impaired)) +
	  geom_histogram(binwidth = prob_bins, col = "white") +
	  facet_wrap(~ Class, labeller = labeller(Class = label_both), ncol = 1) +
	  ggtitle("Predicted probabilities versus true class") +
	  coord_cartesian(xlim = c(0, 1))

	# Plot the confusion matrix of the true class against the predicted class.
	autoplot(
	  conf_mat(boost_in_sample_predictions, truth = Class, estimate = .pred_class)
	)

	# ------------------------------------------------------------------------------

	# Predicted probability (logit-scaled y axis) against the `p_tau` column, one
	# panel per true class.
	#
	# The logit is infinite at 0 and 1, so the probabilities are first squeezed
	# into [prob_eps, 1 - prob_eps].
	boost_in_sample_predictions %>%
	  mutate(
	    .pred_Impaired = pmin(pmax(.pred_Impaired, prob_eps), 1 - prob_eps)
	  ) %>%
	  ggplot(aes(x = p_tau, y = .pred_Impaired)) +
	  geom_point() +
	  # We should make a custom transformation that handles probs at 0 and 1
	  scale_y_continuous(trans = scales::logit_trans(), breaks = prob_breaks) +
	  facet_wrap(~ Class, labeller = labeller(Class = label_both), ncol = 1) +
	  ggtitle("Predicted probabilities versus numeric variable")

	# Predicted probability (logit-scaled x axis) for each level of `Genotype`,
	# one panel per true class.
	#
	# As the logit is infinite at 0 and 1, probabilities are squeezed into
	# [prob_eps, 1 - prob_eps] before plotting.
	boost_in_sample_predictions %>%
	  mutate(
	    .pred_Impaired = pmin(pmax(.pred_Impaired, prob_eps), 1 - prob_eps)
	  ) %>%
	  ggplot(aes(y = Genotype, x = .pred_Impaired)) +
	  geom_point() +
	  scale_x_continuous(trans = scales::logit_trans(), breaks = prob_breaks) +
	  facet_wrap(~ Class, labeller = labeller(Class = label_both), ncol = 1) +
	  ggtitle("Predicted probabilities versus factor variable")

	# ------------------------------------------------------------------------------

	# ROC curve for the "Impaired" class probability.
	autoplot(
	  roc_curve(boost_in_sample_predictions, truth = Class, .pred_Impaired)
	)

	# Precision-recall curve for the same probability column.
	autoplot(
	  pr_curve(boost_in_sample_predictions, truth = Class, .pred_Impaired)
	)