Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
library(tidyverse)

# load in source_rmd()
devtools::source_gist("https://gist.github.com/noamross/a549ee50e8a4fd68b8b1")
#> Sourcing https://gist.githubusercontent.com/noamross/a549ee50e8a4fd68b8b1/raw/40960d8280438b7e2e8d6502f3c1d4ad348caeb6/source_rmd.R
#> SHA-1 hash of file is 624ae941e51dee522994e014928448170cedf1a3
# Also could have used solution here: 
# https://stackoverflow.com/questions/10966109/how-to-source-r-markdown-file-like-sourcemyfile-r

# Source code from blog post
source_rmd("https://raw.githubusercontent.com/brshallo/brshallo/master/content/post/2020-11-23-remember-resampling-techniques-change-the-base-rates-of-your-predictions.Rmd", skip_plots = TRUE)
#> processing file: https://raw.githubusercontent.com/brshallo/brshallo/master/content/post/2020-11-23-remember-resampling-techniques-change-the-base-rates-of-your-predictions.Rmd
#> output file: C:\Users\BSHALLOW\AppData\Local\Temp\RtmpwPXH3e\file48706e763d0e.R

classadjust <- function(condprobs, wrongprob, trueprob) {
  a <- condprobs / (wrongprob / trueprob)
  comp_cond <- 1 - condprobs
  comp_wrong <- 1 - wrongprob
  comp_true <- 1 - trueprob
  b <- comp_cond / (comp_wrong / comp_true)
  return(a / (a + b))
}

offset_intercept <- function(true_baserate, sample_baserate){
  log((base_rate / (1 - base_rate)) * ((1 - sample_baserate) / sample_baserate))
}

lodds_to_prob <- function(x) exp(x) / (exp(x) + 1)

base_rate <- summarise(train, prob = sum(target) / n()) %>% pull(prob)

offset <- offset_intercept(base_rate, 0.5)

# Different approaches to rescaling predictions (platt scaling, offset, adjust) 
test_preds_scaling_approaches <- test %>% 
  # preds when no resampling
  modelr::spread_predictions(mod_5_95, mod_50_50) %>% 
  mutate(pred = mod_50_50) %>%
  # preds when resampling then platt scaling
  spread_predictions(mod_50_50_rescaled_calibrated) %>%
  select(-pred) %>% 
  # preds when using intercept offset
  mutate(mod50_offset = mod_50_50 + offset) %>% 
  mutate(across(contains("mod"), list(pred = convert_lodds))) %>% 
  # preds when adjusting
  mutate(mod50_adjust_pred = classadjust(mod_50_50_pred, 0.50, base_rate)) %>% 
  rename(
    formula_adjusted = mod50_adjust_pred,
    offset = mod50_offset_pred,
    platt_scaled = mod_50_50_rescaled_calibrated_pred,
    unaltered = mod_5_95_pred
  ) %>% 
  # `feature` just represented the probability of an event being TRUE, so
  # can just convert to probability and now serves as the actual
  mutate(actual = lodds_to_prob(feature))

# All plotted and compared against preds when not doing any adjustment
test_preds_scaling_approaches %>% 
  ggplot(aes(x = feature))+
  geom_line(aes(y = formula_adjusted, colour = "adjusted (after downsample)"))+
  geom_line(aes(y = offset, colour = "offset (after downsample)"), linetype = "dashed")+
  geom_line(aes(y = platt_scaled, colour = "platt scaled (after downsample)"))+
  geom_line(aes(y = unaltered, colour = "unaltered (no downsampling)"), linetype = "dashed")+
  geom_line(aes(y = actual, colour = "actual probability"))+
  labs(y = "predicted probability", 
       colour = NULL,
       caption = 
         "In this univariate example 'platt scaled' produces the same results as when doing no resampling.
       \nIs closer to actual probability at tail compared to offset/adjusted approach (in this case).")+
  theme(legend.position = "bottom")+
  guides(colour = guide_legend(ncol = 2))

SIDENOTE

Seems like an approach I've used for visualizing relationships between likelihoods and categories my Seems that this approach may not be perfect at the tails of the distribution...

actuals_data <- test_preds_scaling_approaches %>% 
  mutate(ntile = ntile(feature, 50)) %>% 
  group_by(ntile) %>% 
  mutate(feature = median(feature)) %>% 
  group_by(feature) %>% 
  summarise(actual_observed = sum(target) / n())
#> `summarise()` ungrouping output (override with `.groups` argument)

actuals_data %>% 
  ggplot(aes(x = feature, y = actual_observed))+
  geom_line()+
  geom_smooth()+
  geom_line(aes(y = actual, colour = "actual"), data = test_preds_scaling_approaches)
#> `geom_smooth()` using method = 'loess' and formula 'y ~ x'

Have used this or similar binning approaches in other places, e.g. https://stats.stackexchange.com/a/391125/193123

Created on 2020-12-04 by the reprex package (v0.3.0)

@brshallo

This comment has been minimized.

Copy link
Owner Author

@brshallo brshallo commented Dec 2, 2020

Perhaps should also add reprex on isotonic regression...

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment