Christopher Peters statwonk

## bootstrapping.R
library(tidyverse)
library(gganimate)

# sample of 100 data points
tibble(x = rexp(1e2)) -> d

# 20 bootstrap samples, you likely want several orders of magnitude more.
tibble(boot_samples = seq_len(2e1)) %>%
  mutate(data = map(boot_samples, ~ d %>% sample_frac(1, replace = TRUE))) %>%
  unnest(cols = c(data)) %>%

## retweets.R
library(fitdistrplus)
library(countreg)
library(tidyverse)
library(rtweet)

get_timeline("statwonk", n = 1e4) %>%
  filter(!is_retweet) %>%
  pull(retweet_count) -> my_retweets

# I know a priori it's negbin,

## rootograms.R
# install.packages("countreg", repos="http://R-Forge.R-project.org")
# https://www.fromthebottomoftheheap.net/2016/06/07/rootograms/
# https://channel9.msdn.com/Events/useR-international-R-User-conferences/useR-International-R-User-2017-Conference/countreg-Tools-for-count-data-regression
library(countreg)

rzinbinom(3e3, size = 4, mu = 20, pi = 0.05) -> x
table(x)
hist(x, col = "orange")

rootogram(glm(x ~ 1, family = "poisson")) # zeros under fit

## durations_to_counts.R
library(tidyverse)
# https://www.kellogg.northwestern.edu/faculty/weber/decs-430/Notes%20on%20the%20Poisson%20and%20exponential%20distributions.pdf
# http://www.mscs.mu.edu/~jsta/issues/11(3)/JSTA11(3)p2.pdf

# Case 1: memoryless intra-arrival times. They're not correlated.
tibble::tibble(
  durations_between = rexp(3e4, rate = 20)
) %>%
  mutate(time = cumsum(durations_between)) %>%
  mutate(time_interval = cut(time, breaks = seq(0, 1e4, 1), labels = FALSE)) %>%

## simple_multilevel.R
library(tidyverse)
library(brms)
library(tidybayes)
library(ggthemes)

# Simulation settings
innings <- 9
games <- 162
number_of_opposing_pitchers <- 20 # purposly set low to illustrate variation
opposing_pitchers <- rnorm(number_of_opposing_pitchers, sd = 0.5) # a set of opposing pitcher effects

## beta-distribution.R
library(tidyverse)
plot.new()
par(bg = "black")
plot(density(rbeta(1e6, 50, 50)), col = "pink3", lwd = 1, ylim = c(0, 8),
     xlab = "", ylab = "", main = "Beta distribution", axes = FALSE,
     col.main = 'white')
map2(c(seq(1, 49, 1), 50),
     c(seq(1, 49, 1), 50),
     function(x, y){ message(x,y);rbeta(1e6, x + 1, y + 1)}) %>%
  map(function(x) { density(x) %>%

## risk_adds_up.R
library(tidyverse)
expand.grid(
  risk = seq(0.001, 0.02, 0.001),
  units_of_exposure = seq_len(24*7)
) %>% as_tibble() %>%
  mutate(total_risk = map2_dbl(risk, units_of_exposure, ~ 1 - (1 - .x)^(.y))) %>%
  ggplot(aes(x = risk, y = units_of_exposure)) +
  geom_raster(aes(fill = total_risk), alpha = 0.90) +
  scale_fill_gradient2(name = "Total chance",
                       low = "white", mid = "white", high = "purple4", midpoint = 0.5,

## outages.R
# https://news.ycombinator.com/item?id=20356610
# chance of the cluster size being >5 when we can cluster events within a one-week period
# and the rate of events is one every 2 months (debatable, but feels right to me)?

# My re-write:
# probability of more than four outages in a given week given a rate of every other month
# _assuming_: totally unrelated infra (none share AWS, etc.)
ppois(4, lambda = 1/2/4.5, lower.tail = FALSE) # ~ 1/7M chance.

# Given that we've observed five in one-week,

## quantile_analysis.R
library(tidyverse)
library(survival)

rweibull_cens <- function(n, shape, scale) {
  # http://statwonk.com/weibull.html
  a_random_death_time <- rweibull(n, shape = shape, scale = scale)
  a_random_censor_time <- rweibull(n, shape = shape, scale = scale)
  observed_time <- pmin(a_random_censor_time, a_random_death_time)
  censor <- observed_time == a_random_death_time
  tibble(time = observed_time, censor = censor)

## minimal_mlm.R
library(brms)
library(tidybayes)

N <- 1e3
number_of_groups <- 12
group_means <- rnorm(number_of_groups)

tibble(obs = seq_len(N)) %>%
  mutate(group = sample(seq_len(number_of_groups), n(), replace = TRUE),
         contribution = group_means[group],
	library(tidyverse)
	library(gganimate)

	# sample of 100 data points
	tibble(x = rexp(1e2)) -> d

	# 20 bootstrap samples, you likely want several orders of magnitude more.
	tibble(boot_samples = seq_len(2e1)) %>%
	mutate(data = map(boot_samples, ~ d %>% sample_frac(1, replace = TRUE))) %>%
	unnest(cols = c(data)) %>%
	library(fitdistrplus)
	library(countreg)
	library(tidyverse)
	library(rtweet)

	get_timeline("statwonk", n = 1e4) %>%
	filter(!is_retweet) %>%
	pull(retweet_count) -> my_retweets

	# I know a priori it's negbin,
	# install.packages("countreg", repos="http://R-Forge.R-project.org")
	# https://www.fromthebottomoftheheap.net/2016/06/07/rootograms/
	# https://channel9.msdn.com/Events/useR-international-R-User-conferences/useR-International-R-User-2017-Conference/countreg-Tools-for-count-data-regression
	library(countreg)

	rzinbinom(3e3, size = 4, mu = 20, pi = 0.05) -> x
	table(x)
	hist(x, col = "orange")

	rootogram(glm(x ~ 1, family = "poisson")) # zeros under fit
	library(tidyverse)
	# https://www.kellogg.northwestern.edu/faculty/weber/decs-430/Notes%20on%20the%20Poisson%20and%20exponential%20distributions.pdf
	# http://www.mscs.mu.edu/~jsta/issues/11(3)/JSTA11(3)p2.pdf

	# Case 1: memoryless intra-arrival times. They're not correlated.
	tibble::tibble(
	durations_between = rexp(3e4, rate = 20)
	) %>%
	mutate(time = cumsum(durations_between)) %>%
	mutate(time_interval = cut(time, breaks = seq(0, 1e4, 1), labels = FALSE)) %>%
	library(tidyverse)
	library(brms)
	library(tidybayes)
	library(ggthemes)

	# Simulation settings
	innings <- 9
	games <- 162
	number_of_opposing_pitchers <- 20 # purposly set low to illustrate variation
	opposing_pitchers <- rnorm(number_of_opposing_pitchers, sd = 0.5) # a set of opposing pitcher effects
	library(tidyverse)
	plot.new()
	par(bg = "black")
	plot(density(rbeta(1e6, 50, 50)), col = "pink3", lwd = 1, ylim = c(0, 8),
	xlab = "", ylab = "", main = "Beta distribution", axes = FALSE,
	col.main = 'white')
	map2(c(seq(1, 49, 1), 50),
	c(seq(1, 49, 1), 50),
	function(x, y){ message(x,y);rbeta(1e6, x + 1, y + 1)}) %>%
	map(function(x) { density(x) %>%
	library(tidyverse)
	expand.grid(
	risk = seq(0.001, 0.02, 0.001),
	units_of_exposure = seq_len(24*7)
	) %>% as_tibble() %>%
	mutate(total_risk = map2_dbl(risk, units_of_exposure, ~ 1 - (1 - .x)^(.y))) %>%
	ggplot(aes(x = risk, y = units_of_exposure)) +
	geom_raster(aes(fill = total_risk), alpha = 0.90) +
	scale_fill_gradient2(name = "Total chance",
	low = "white", mid = "white", high = "purple4", midpoint = 0.5,
	# https://news.ycombinator.com/item?id=20356610
	# chance of the cluster size being >5 when we can cluster events within a one-week period
	# and the rate of events is one every 2 months (debatable, but feels right to me)?

	# My re-write:
	# probability of more than four outages in a given week given a rate of every other month
	# _assuming_: totally unrelated infra (none share AWS, etc.)
	ppois(4, lambda = 1/2/4.5, lower.tail = FALSE) # ~ 1/7M chance.

	# Given that we've observed five in one-week,
	library(tidyverse)
	library(survival)

	rweibull_cens <- function(n, shape, scale) {
	# http://statwonk.com/weibull.html
	a_random_death_time <- rweibull(n, shape = shape, scale = scale)
	a_random_censor_time <- rweibull(n, shape = shape, scale = scale)
	observed_time <- pmin(a_random_censor_time, a_random_death_time)
	censor <- observed_time == a_random_death_time
	tibble(time = observed_time, censor = censor)
	library(brms)
	library(tidybayes)

	N <- 1e3
	number_of_groups <- 12
	group_means <- rnorm(number_of_groups)

	tibble(obs = seq_len(N)) %>%
	mutate(group = sample(seq_len(number_of_groups), n(), replace = TRUE),
	contribution = group_means[group],