Skip to content

Instantly share code, notes, and snippets.

@statwonk
Created June 25, 2019 22:49
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save statwonk/33fc1b2e6b004d93750b295ea47c097a to your computer and use it in GitHub Desktop.
Save statwonk/33fc1b2e6b004d93750b295ea47c097a to your computer and use it in GitHub Desktop.
An example of checking the statistical coverage properties of an algorithm.
library(tidyverse)
tibble(boot = seq_len(1e2)) %>%
mutate(prediction_intervals = map(boot, function(i) {
rpois(100, 10) -> x
(sum(x[6:100]) -> total_oracle)
# Given the first five, what can we say about the sum of 95 more?
x[1:5] -> first_five
sum(first_five)*19 # a crude guess
seq(1e1, 3e3, 1e2) %>%
tibble(possible_sum_of_remaining_95 = .) %>%
mutate(lower = map_dbl(possible_sum_of_remaining_95, function(x) {
1 - pbinom(sum(first_five) - 1, sum(first_five) + x, prob = 5/100)
}),
upper = map_dbl(possible_sum_of_remaining_95, function(x) {
pbinom(sum(first_five), sum(first_five) + x, 5/100)
})) %>%
filter(lower > 0.01/2,
upper > 0.01/2) %>%
summarise(lower = possible_sum_of_remaining_95[lower == min(lower)],
upper = possible_sum_of_remaining_95[upper == min(upper)]) %>%
mutate(actual = total_oracle) %>%
select(lower, actual, upper)
})) %>%
tidyr::unnest() %>%
ggplot(aes(x = factor(boot))) +
geom_errorbar(aes(ymin = lower, ymax = upper,
color = forcats::fct_rev(ifelse(lower <= actual & actual <= upper,
"Contained by interval", "Not contained"))),
size = 0.5) +
scale_color_discrete(name = "", guide = guide_legend(override.aes = list(size = 10))) +
geom_point(aes(y = actual)) +
coord_flip() +
theme_bw(16) +
xlab("Bootstrap") + ylab("Units in future 95 periods") +
theme(axis.text.y = element_text(size = 6), axis.text.x = element_text(size = 15),
panel.grid = element_blank(),
legend.position = "top") +
ggtitle("99% prediction interval for units observed in the next 95 periods\ngiven five periods already observed")
# Verify coverage
tibble(boot = seq_len(1e4)) %>%
mutate(prediction_intervals = parallel::mclapply(boot, function(i) {
rpois(100, 10) -> x
(sum(x[6:100]) -> total_oracle)
# Given the first five, what can we say about the sum of 95 more?
x[1:5] -> first_five
seq(1e1, 5e3, 1) %>%
tibble(possible_sum_of_remaining_95 = .) %>%
mutate(lower = map_dbl(possible_sum_of_remaining_95, function(x) {
1 - pbinom(sum(first_five) - 1, sum(first_five) + x, prob = 5/100)
}),
upper = map_dbl(possible_sum_of_remaining_95, function(x) {
pbinom(sum(first_five), sum(first_five) + x, 5/100)
})) -> x
x %>% filter(lower >= 0.01/2) %>%
summarise(lower = possible_sum_of_remaining_95[lower == min(lower)]) %>%
mutate(actual = total_oracle) %>%
select(lower, actual) %>%
bind_cols(
x %>% filter(upper >= 0.01/2) %>%
summarise(upper = possible_sum_of_remaining_95[upper == min(upper)]) %>%
select(upper)
)
}, mc.cores = 4)) %>%
tidyr::unnest() %>%
mutate(contained = forcats::fct_rev(ifelse(lower <= actual & actual <= upper,
"Contained by interval", "Not contained"))) %>%
count(contained) %>%
mutate(pct = n/sum(n))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment