@brshallo
Last active August 21, 2021 11:29

library(tidyverse)

# load in source_rmd()
devtools::source_gist("https://gist.github.com/noamross/a549ee50e8a4fd68b8b1")
#> Sourcing https://gist.githubusercontent.com/noamross/a549ee50e8a4fd68b8b1/raw/40960d8280438b7e2e8d6502f3c1d4ad348caeb6/source_rmd.R
#> SHA-1 hash of file is 624ae941e51dee522994e014928448170cedf1a3
# Also could have used solution here: 
# https://stackoverflow.com/questions/10966109/how-to-source-r-markdown-file-like-sourcemyfile-r

# Source code from blog post
source_rmd("https://raw.githubusercontent.com/brshallo/brshallo/master/content/post/2020-11-23-remember-resampling-techniques-change-the-base-rates-of-your-predictions.Rmd", skip_plots = TRUE)
#> processing file: https://raw.githubusercontent.com/brshallo/brshallo/master/content/post/2020-11-23-remember-resampling-techniques-change-the-base-rates-of-your-predictions.Rmd
#> output file: C:\Users\BSHALLOW\AppData\Local\Temp\RtmpwPXH3e\file48706e763d0e.R

# Adjust predicted probabilities from a model trained on resampled data
# back to the true base rate (`wrongprob` = base rate in the resampled
# training data, `trueprob` = base rate in the population)
classadjust <- function(condprobs, wrongprob, trueprob) {
  # rescale the event and non-event probabilities by their sampling ratios
  a <- condprobs / (wrongprob / trueprob)
  comp_cond <- 1 - condprobs
  comp_wrong <- 1 - wrongprob
  comp_true <- 1 - trueprob
  b <- comp_cond / (comp_wrong / comp_true)
  return(a / (a + b))
}
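
# Sanity check (my own numbers, not from the post): a prediction of 0.5
# from a model trained on 50/50 downsampled data should map back to the
# true base rate, here taken to be 0.05
classadjust(0.5, wrongprob = 0.5, trueprob = 0.05)
#> [1] 0.05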

# Intercept offset that shifts log-odds predictions from the sample base
# rate back to the true base rate
offset_intercept <- function(true_baserate, sample_baserate){
  log((true_baserate / (1 - true_baserate)) * ((1 - sample_baserate) / sample_baserate))
}

# convert log-odds to a probability (inverse logit, equivalent to plogis())
lodds_to_prob <- function(x) exp(x) / (exp(x) + 1)

# base rate of the target in the training data
base_rate <- summarise(train, prob = sum(target) / n()) %>% pull(prob)

offset <- offset_intercept(base_rate, 0.5)
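
# Sanity check (my own, assuming a true base rate of 0.05): a log-odds
# prediction of 0 (i.e. probability 0.5 on the 50/50 sample) plus the
# offset should recover the true base rate
lodds_to_prob(0 + offset_intercept(0.05, 0.5))
#> [1] 0.05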

# Different approaches to rescaling predictions (platt scaling, offset, adjust) 
test_preds_scaling_approaches <- test %>% 
  # preds from the unresampled (5/95) model and the 50/50 downsampled model
  modelr::spread_predictions(mod_5_95, mod_50_50) %>% 
  # `pred` column is the input the platt-scaling model was fit on
  mutate(pred = mod_50_50) %>%
  # preds when resampling then platt scaling
  modelr::spread_predictions(mod_50_50_rescaled_calibrated) %>%
  select(-pred) %>% 
  # preds when using intercept offset
  mutate(mod50_offset = mod_50_50 + offset) %>% 
  mutate(across(contains("mod"), list(pred = convert_lodds))) %>% 
  # preds when adjusting
  mutate(mod50_adjust_pred = classadjust(mod_50_50_pred, 0.50, base_rate)) %>% 
  rename(
    formula_adjusted = mod50_adjust_pred,
    offset = mod50_offset_pred,
    platt_scaled = mod_50_50_rescaled_calibrated_pred,
    unaltered = mod_5_95_pred
  ) %>% 
  # `feature` was constructed as the log-odds of the event occurring, so
  # converting it to a probability gives the actual probability to compare against
  mutate(actual = lodds_to_prob(feature))

# All plotted and compared against preds when not doing any adjustment
test_preds_scaling_approaches %>% 
  ggplot(aes(x = feature))+
  geom_line(aes(y = formula_adjusted, colour = "adjusted (after downsample)"))+
  geom_line(aes(y = offset, colour = "offset (after downsample)"), linetype = "dashed")+
  geom_line(aes(y = platt_scaled, colour = "platt scaled (after downsample)"))+
  geom_line(aes(y = unaltered, colour = "unaltered (no downsampling)"), linetype = "dashed")+
  geom_line(aes(y = actual, colour = "actual probability"))+
  labs(y = "predicted probability", 
       colour = NULL,
       caption = 
         "In this univariate example 'platt scaled' produces the same results as when doing no resampling.
       \nIs closer to actual probability at tail compared to offset/adjusted approach (in this case).")+
  theme(legend.position = "bottom")+
  guides(colour = guide_legend(ncol = 2))

SIDENOTE

Below is an approach I've used for visualizing relationships between likelihoods and categories. It seems this approach may not be perfect at the tails of the distribution...

actuals_data <- test_preds_scaling_approaches %>% 
  # bin `feature` into 50 quantile bins, using each bin's median as its x value
  mutate(ntile = ntile(feature, 50)) %>% 
  group_by(ntile) %>% 
  mutate(feature = median(feature)) %>% 
  group_by(feature) %>% 
  # observed event rate within each bin
  summarise(actual_observed = sum(target) / n())
#> `summarise()` ungrouping output (override with `.groups` argument)

actuals_data %>% 
  ggplot(aes(x = feature, y = actual_observed))+
  geom_line()+
  geom_smooth()+
  geom_line(aes(y = actual, colour = "actual"), data = test_preds_scaling_approaches)
#> `geom_smooth()` using method = 'loess' and formula 'y ~ x'

I've used this or similar binning approaches in other places, e.g. https://stats.stackexchange.com/a/391125/193123

Created on 2020-12-04 by the reprex package (v0.3.0)

@brshallo commented Dec 2, 2020

Perhaps should also add reprex on isotonic regression...
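
A minimal sketch of what that might look like (my own toy data, not from the post): fit a monotone calibration map with stats::isoreg() and apply it to new scores via the fitted step function.

set.seed(123)
# toy calibration set: raw scores plus miscalibrated 0/1 outcomes
cal_pred <- runif(1000)
cal_target <- rbinom(1000, 1, cal_pred^2)

# isotonic regression of outcome on raw score (monotone non-decreasing fit)
iso_fit <- isoreg(cal_pred, cal_target)

# convert the fit to a step function that can calibrate new predictions
iso_calibrate <- as.stepfun(iso_fit)
iso_calibrate(c(0.1, 0.5, 0.9))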
