Created
June 25, 2019 22:49
-
-
Save statwonk/33fc1b2e6b004d93750b295ea47c097a to your computer and use it in GitHub Desktop.
An example of checking the statistical coverage properties of an algorithm.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(tidyverse) | |
tibble(boot = seq_len(1e2)) %>% | |
mutate(prediction_intervals = map(boot, function(i) { | |
rpois(100, 10) -> x | |
(sum(x[6:100]) -> total_oracle) | |
# Given the first five, what can we say about the sum of 95 more? | |
x[1:5] -> first_five | |
sum(first_five)*19 # a crude guess | |
seq(1e1, 3e3, 1e2) %>% | |
tibble(possible_sum_of_remaining_95 = .) %>% | |
mutate(lower = map_dbl(possible_sum_of_remaining_95, function(x) { | |
1 - pbinom(sum(first_five) - 1, sum(first_five) + x, prob = 5/100) | |
}), | |
upper = map_dbl(possible_sum_of_remaining_95, function(x) { | |
pbinom(sum(first_five), sum(first_five) + x, 5/100) | |
})) %>% | |
filter(lower > 0.01/2, | |
upper > 0.01/2) %>% | |
summarise(lower = possible_sum_of_remaining_95[lower == min(lower)], | |
upper = possible_sum_of_remaining_95[upper == min(upper)]) %>% | |
mutate(actual = total_oracle) %>% | |
select(lower, actual, upper) | |
})) %>% | |
tidyr::unnest() %>% | |
ggplot(aes(x = factor(boot))) + | |
geom_errorbar(aes(ymin = lower, ymax = upper, | |
color = forcats::fct_rev(ifelse(lower <= actual & actual <= upper, | |
"Contained by interval", "Not contained"))), | |
size = 0.5) + | |
scale_color_discrete(name = "", guide = guide_legend(override.aes = list(size = 10))) + | |
geom_point(aes(y = actual)) + | |
coord_flip() + | |
theme_bw(16) + | |
xlab("Bootstrap") + ylab("Units in future 95 periods") + | |
theme(axis.text.y = element_text(size = 6), axis.text.x = element_text(size = 15), | |
panel.grid = element_blank(), | |
legend.position = "top") + | |
ggtitle("99% prediction interval for units observed in the next 95 periods\ngiven five periods already observed") | |
# Verify coverage | |
tibble(boot = seq_len(1e4)) %>% | |
mutate(prediction_intervals = parallel::mclapply(boot, function(i) { | |
rpois(100, 10) -> x | |
(sum(x[6:100]) -> total_oracle) | |
# Given the first five, what can we say about the sum of 95 more? | |
x[1:5] -> first_five | |
seq(1e1, 5e3, 1) %>% | |
tibble(possible_sum_of_remaining_95 = .) %>% | |
mutate(lower = map_dbl(possible_sum_of_remaining_95, function(x) { | |
1 - pbinom(sum(first_five) - 1, sum(first_five) + x, prob = 5/100) | |
}), | |
upper = map_dbl(possible_sum_of_remaining_95, function(x) { | |
pbinom(sum(first_five), sum(first_five) + x, 5/100) | |
})) -> x | |
x %>% filter(lower >= 0.01/2) %>% | |
summarise(lower = possible_sum_of_remaining_95[lower == min(lower)]) %>% | |
mutate(actual = total_oracle) %>% | |
select(lower, actual) %>% | |
bind_cols( | |
x %>% filter(upper >= 0.01/2) %>% | |
summarise(upper = possible_sum_of_remaining_95[upper == min(upper)]) %>% | |
select(upper) | |
) | |
}, mc.cores = 4)) %>% | |
tidyr::unnest() %>% | |
mutate(contained = forcats::fct_rev(ifelse(lower <= actual & actual <= upper, | |
"Contained by interval", "Not contained"))) %>% | |
count(contained) %>% | |
mutate(pct = n/sum(n)) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment