Skip to content

Instantly share code, notes, and snippets.

@topepo
Created May 6, 2021 22:00
Show Gist options
  • Save topepo/8e02eecf8a94bce915014371eae93217 to your computer and use it in GitHub Desktop.
Save topepo/8e02eecf8a94bce915014371eae93217 to your computer and use it in GitHub Desktop.
two class diagnostic plots for shinymodels
library(tidymodels)
tidymodels_prefer()
theme_set(theme_bw())
library(doMC)
registerDoMC(cores = 20)
# ------------------------------------------------------------------------------
data(ad_data)
set.seed(1)
ad_split <- initial_split(ad_data)
ad_train <- training(ad_split)
ad_test <- testing(ad_split)
set.seed(2)
ad_folds <- vfold_cv(ad_train, repeats = 5)
# ------------------------------------------------------------------------------
boost_spec <-
boost_tree() %>%
set_engine("C5.0") %>%
set_mode("classification")
ctrl_rs <- control_resamples(save_pred = TRUE)
boost_res <-
boost_spec %>%
fit_resamples(Class ~ ., resamples = ad_folds, control = ctrl_rs)
boost_in_sample_predictions <- augment(boost_res)
boost_test_res <-
boost_spec %>%
last_fit(Class ~ ., split = ad_split)
boost_test_predictions <- augment(boost_test_res)
# ------------------------------------------------------------------------------
prob_breaks <- (2:9)/10
prob_eps <- 0.001
prob_bins <- 0.025
# ------------------------------------------------------------------------------
boost_in_sample_predictions %>%
ggplot(aes(x = .pred_Impaired)) +
geom_histogram(binwidth = prob_bins, col = "white") +
facet_wrap(~ Class, labeller = labeller(Class = label_both), ncol = 1) +
ggtitle("Predicted probabilities versus true class") +
xlim(0:1)
boost_in_sample_predictions %>%
conf_mat(truth = Class, estimate = .pred_class) %>%
autoplot()
# ------------------------------------------------------------------------------
boost_in_sample_predictions %>%
mutate(
.pred_Impaired =
case_when(
.pred_Impaired > 1 - prob_eps ~ 1 - prob_eps,
.pred_Impaired < prob_eps ~ prob_eps,
TRUE ~ .pred_Impaired
)
) %>%
ggplot(aes(x = p_tau, y = .pred_Impaired)) +
geom_point()+
facet_wrap(~ Class, labeller = labeller(Class = label_both), ncol = 1) +
ggtitle("Predicted probabilities versus numeric variable") +
# We should make a custom transformation that handles probs at 0 and 1
scale_y_continuous(trans = scales::logit_trans(), breaks = prob_breaks)
boost_in_sample_predictions %>%
mutate(
.pred_Impaired =
case_when(
.pred_Impaired > 1 - prob_eps ~ 1 - prob_eps,
.pred_Impaired < prob_eps ~ prob_eps,
TRUE ~ .pred_Impaired
)
) %>%
ggplot(aes(y = Genotype, x = .pred_Impaired)) +
geom_point() +
facet_wrap(~ Class, labeller = labeller(Class = label_both), ncol = 1) +
ggtitle("Predicted probabilities versus factor variable") +
scale_x_continuous(trans = scales::logit_trans(), breaks = prob_breaks)
# ------------------------------------------------------------------------------
boost_in_sample_predictions %>%
roc_curve(truth = Class, .pred_Impaired) %>%
autoplot()
boost_in_sample_predictions %>%
pr_curve(truth = Class, .pred_Impaired) %>%
autoplot()
@topepo
Copy link
Author

topepo commented May 27, 2021

library(tidymodels)
#> Registered S3 method overwritten by 'tune':
#>   method                   from   
#>   required_pkgs.model_spec parsnip
tidymodels_prefer()
theme_set(theme_bw())

library(doMC)
#> Loading required package: foreach
#> 
#> Attaching package: 'foreach'
#> The following objects are masked from 'package:purrr':
#> 
#>     accumulate, when
#> Loading required package: iterators
#> Loading required package: parallel
registerDoMC(cores = 20)
data(ad_data)
set.seed(1)
ad_split <- initial_split(ad_data)
ad_train <- training(ad_split)
ad_test <- testing(ad_split)

set.seed(2)
ad_folds <- vfold_cv(ad_train, repeats = 5)
boost_spec <-
  boost_tree() %>%
  set_engine("C5.0") %>%
  set_mode("classification")

ctrl_rs <- control_resamples(save_pred = TRUE)

boost_res <-
  boost_spec %>%
  fit_resamples(Class ~ ., resamples = ad_folds, control = ctrl_rs)

boost_in_sample_predictions <- augment(boost_res)

boost_test_res <-
  boost_spec %>%
  last_fit(Class ~ ., split = ad_split)

boost_test_predictions <-  augment(boost_test_res)
prob_breaks <- (2:9)/10
prob_eps <- 0.001
prob_bins <- 0.025
boost_in_sample_predictions %>%
  ggplot(aes(x = .pred_Impaired)) +
  geom_histogram(binwidth = prob_bins, col = "white") +
  facet_wrap(~ Class, labeller = labeller(Class = label_both), ncol = 1) +
  ggtitle("Predicted probabilities versus true class") +
  xlim(0:1)
#> Warning: Removed 4 rows containing missing values (geom_bar).

boost_in_sample_predictions %>%
  conf_mat(truth = Class, estimate = .pred_class) %>%
  autoplot()

boost_in_sample_predictions %>%
  mutate(
    .pred_Impaired =
      case_when(
        .pred_Impaired > 1 - prob_eps ~ 1 - prob_eps,
        .pred_Impaired  <    prob_eps ~     prob_eps,
        TRUE ~ .pred_Impaired
      )
  ) %>%
  ggplot(aes(x = p_tau, y = .pred_Impaired)) +
  geom_point()+
  facet_wrap(~ Class, labeller = labeller(Class = label_both), ncol = 1) +
  ggtitle("Predicted probabilities versus numeric variable") +
  # We should make a custom transformation that handles probs at 0 and 1
  scale_y_continuous(trans = scales::logit_trans(), breaks = prob_breaks)

boost_in_sample_predictions %>%
  mutate(
    .pred_Impaired =
      case_when(
        .pred_Impaired > 1 - prob_eps ~ 1 - prob_eps,
        .pred_Impaired  <    prob_eps ~     prob_eps,
        TRUE ~ .pred_Impaired
      )
  ) %>%
  ggplot(aes(y = Genotype, x = .pred_Impaired)) +
  geom_point() +
  facet_wrap(~ Class, labeller = labeller(Class = label_both), ncol = 1) +
  ggtitle("Predicted probabilities versus factor variable") +
  scale_x_continuous(trans = scales::logit_trans(), breaks = prob_breaks)

boost_in_sample_predictions %>%
  roc_curve(truth = Class, .pred_Impaired) %>%
  autoplot()

boost_in_sample_predictions %>%
  pr_curve(truth = Class, .pred_Impaired) %>%
  autoplot()

Created on 2021-05-27 by the reprex package (v1.0.0.9000)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment