statwonk/prediction_intervals.R

## prediction_intervals.R
library(tidyverse)

tibble(boot = seq_len(1e2)) %>%
  mutate(prediction_intervals = map(boot, function(i) {
    rpois(100, 10) -> x

    (sum(x[6:100]) -> total_oracle)

    # Given the first five, what can we say about the sum of 95 more?
    x[1:5] -> first_five

    sum(first_five)*19 # a crude guess

    seq(1e1, 3e3, 1e2) %>%
      tibble(possible_sum_of_remaining_95 = .) %>%
      mutate(lower = map_dbl(possible_sum_of_remaining_95, function(x) {
        1 - pbinom(sum(first_five) - 1, sum(first_five) + x, prob = 5/100)
      }),
      upper = map_dbl(possible_sum_of_remaining_95, function(x) {
        pbinom(sum(first_five), sum(first_five) + x, 5/100)
      })) %>%
      filter(lower > 0.01/2,
             upper > 0.01/2) %>%
      summarise(lower = possible_sum_of_remaining_95[lower == min(lower)],
                upper = possible_sum_of_remaining_95[upper == min(upper)]) %>%
      mutate(actual = total_oracle) %>%
      select(lower, actual, upper)
  })) %>%
  tidyr::unnest() %>%
  ggplot(aes(x = factor(boot))) +
  geom_errorbar(aes(ymin = lower, ymax = upper,
                    color = forcats::fct_rev(ifelse(lower <= actual & actual <= upper,
                                                    "Contained by interval", "Not contained"))),
                size = 0.5) +
  scale_color_discrete(name = "", guide = guide_legend(override.aes = list(size = 10))) +
  geom_point(aes(y = actual)) +
  coord_flip() +
  theme_bw(16) +
  xlab("Bootstrap") + ylab("Units in future 95 periods") +
  theme(axis.text.y = element_text(size = 6), axis.text.x = element_text(size = 15),
        panel.grid = element_blank(),
        legend.position = "top") +
  ggtitle("99% prediction interval for units observed in the next 95 periods\ngiven five periods already observed")

# Verify coverage
tibble(boot = seq_len(1e4)) %>%
  mutate(prediction_intervals = parallel::mclapply(boot, function(i) {
    rpois(100, 10) -> x

    (sum(x[6:100]) -> total_oracle)

    # Given the first five, what can we say about the sum of 95 more?
    x[1:5] -> first_five

    seq(1e1, 5e3, 1) %>%
      tibble(possible_sum_of_remaining_95 = .) %>%
      mutate(lower = map_dbl(possible_sum_of_remaining_95, function(x) {
        1 - pbinom(sum(first_five) - 1, sum(first_five) + x, prob = 5/100)
      }),
      upper = map_dbl(possible_sum_of_remaining_95, function(x) {
        pbinom(sum(first_five), sum(first_five) + x, 5/100)
      })) -> x

    x %>% filter(lower >= 0.01/2) %>%
      summarise(lower = possible_sum_of_remaining_95[lower == min(lower)]) %>%
      mutate(actual = total_oracle) %>%
      select(lower, actual) %>%
      bind_cols(
        x %>% filter(upper >= 0.01/2) %>%
          summarise(upper = possible_sum_of_remaining_95[upper == min(upper)]) %>%
          select(upper)
      )
  }, mc.cores = 4)) %>%
  tidyr::unnest() %>%
  mutate(contained = forcats::fct_rev(ifelse(lower <= actual & actual <= upper,
                                             "Contained by interval", "Not contained"))) %>%
  count(contained) %>%
  mutate(pct = n/sum(n))
	library(tidyverse)

	tibble(boot = seq_len(1e2)) %>%
	mutate(prediction_intervals = map(boot, function(i) {
	rpois(100, 10) -> x

	(sum(x[6:100]) -> total_oracle)

	# Given the first five, what can we say about the sum of 95 more?
	x[1:5] -> first_five

	sum(first_five)*19 # a crude guess

	seq(1e1, 3e3, 1e2) %>%
	tibble(possible_sum_of_remaining_95 = .) %>%
	mutate(lower = map_dbl(possible_sum_of_remaining_95, function(x) {
	1 - pbinom(sum(first_five) - 1, sum(first_five) + x, prob = 5/100)
	}),
	upper = map_dbl(possible_sum_of_remaining_95, function(x) {
	pbinom(sum(first_five), sum(first_five) + x, 5/100)
	})) %>%
	filter(lower > 0.01/2,
	upper > 0.01/2) %>%
	summarise(lower = possible_sum_of_remaining_95[lower == min(lower)],
	upper = possible_sum_of_remaining_95[upper == min(upper)]) %>%
	mutate(actual = total_oracle) %>%
	select(lower, actual, upper)
	})) %>%
	tidyr::unnest() %>%
	ggplot(aes(x = factor(boot))) +
	geom_errorbar(aes(ymin = lower, ymax = upper,
	color = forcats::fct_rev(ifelse(lower <= actual & actual <= upper,
	"Contained by interval", "Not contained"))),
	size = 0.5) +
	scale_color_discrete(name = "", guide = guide_legend(override.aes = list(size = 10))) +
	geom_point(aes(y = actual)) +
	coord_flip() +
	theme_bw(16) +
	xlab("Bootstrap") + ylab("Units in future 95 periods") +
	theme(axis.text.y = element_text(size = 6), axis.text.x = element_text(size = 15),
	panel.grid = element_blank(),
	legend.position = "top") +
	ggtitle("99% prediction interval for units observed in the next 95 periods\ngiven five periods already observed")

	# Verify coverage
	tibble(boot = seq_len(1e4)) %>%
	mutate(prediction_intervals = parallel::mclapply(boot, function(i) {
	rpois(100, 10) -> x

	(sum(x[6:100]) -> total_oracle)

	# Given the first five, what can we say about the sum of 95 more?
	x[1:5] -> first_five

	seq(1e1, 5e3, 1) %>%
	tibble(possible_sum_of_remaining_95 = .) %>%
	mutate(lower = map_dbl(possible_sum_of_remaining_95, function(x) {
	1 - pbinom(sum(first_five) - 1, sum(first_five) + x, prob = 5/100)
	}),
	upper = map_dbl(possible_sum_of_remaining_95, function(x) {
	pbinom(sum(first_five), sum(first_five) + x, 5/100)
	})) -> x

	x %>% filter(lower >= 0.01/2) %>%
	summarise(lower = possible_sum_of_remaining_95[lower == min(lower)]) %>%
	mutate(actual = total_oracle) %>%
	select(lower, actual) %>%
	bind_cols(
	x %>% filter(upper >= 0.01/2) %>%
	summarise(upper = possible_sum_of_remaining_95[upper == min(upper)]) %>%
	select(upper)
	)
	}, mc.cores = 4)) %>%
	tidyr::unnest() %>%
	mutate(contained = forcats::fct_rev(ifelse(lower <= actual & actual <= upper,
	"Contained by interval", "Not contained"))) %>%
	count(contained) %>%
	mutate(pct = n/sum(n))