Skip to content

Instantly share code, notes, and snippets.

@baogorek
Last active February 12, 2022 13:49
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save baogorek/edb6db76054df5cdad9a488ebd99e651 to your computer and use it in GitHub Desktop.
Save baogorek/edb6db76054df5cdad9a488ebd99e651 to your computer and use it in GitHub Desktop.
Simulation demonstrating the Frequentist Properties of Bayesian Sequential Analysis
library(dplyr)
library(tidyr)
library(ggplot2)
library(brms)
library(qcc)
#set.seed(1569) # First 100
set.seed(8536) # Next 1000
# B <- 100 # First time through
B <- 1000
N_A <- 10
N_B <- 10
beta <- 0
initial_n <- 4
# interleave A and B and then continue with the one that has more
most_frequent <- ifelse(N_A > N_B, "A", "B")
trt <- c(rep(c("A", "B"), times = min(N_A, N_B)), rep(most_frequent, abs(N_A - N_B)))
# Frequentist multiple datasets under the null hypothesis
p_sequences <- list()
df_list <- list() # For use with the bayesian procedure
for (b in 1:B) {
epsilon <- rnorm(N_A + N_B)
y <- 30 + beta * (trt == "B") + epsilon
df <- data.frame(y=y, trt=trt)
df_list <- append(df_list, list(df))
p_values <- c()
n_seq <- initial_n:nrow(df)
for (n in n_seq) {
df_n <- df[1:n, ]
freq_lm <- lm(y ~ trt, data=df_n)
p_values <- c(p_values, as.numeric(summary(freq_lm)$coefficients[, 4][2]))
}
p_sequences <- append(p_sequences, list(p_values))
}
qcc(df_list[[1]]$y, type = 'xbar.one', ylab = 'y', xlab = 'time')
p_df <- as.data.frame(p_sequences)
names(p_df) <- paste0("MC", 1:B)
p_df$n <- n_seq
# Let's see at n=20, the batch analysis, how many were rejected
terminal_p <- p_df %>% filter(n == 20) %>% select(starts_with("MC")) %>% unlist()
mean(terminal_p < .05)
p_long <- p_df %>%
pivot_longer(., cols=starts_with("MC")) %>%
select(mc_iter=name, n=n, p_value=value) %>%
arrange(mc_iter, n)
reject_df <- p_long %>%
group_by(mc_iter) %>%
summarize(min_p = min(p_value),
reject = min(p_value) < .05)
mean(reject_df$reject)
p_long <- p_long %>%
inner_join(reject_df, by="mc_iter") %>%
arrange(mc_iter, n)
reject_df$color <- "grey"
to_display <- sample(reject_df %>% filter(reject) %>% pull(mc_iter), 1)
reject_df[reject_df$mc_iter == to_display[1], "color"] <- "red"
reject_df$alpha <- .05
reject_df[reject_df$mc_iter %in% to_display, "alpha"] <- 1
ggplot(p_long, aes(x=n, y=p_value, col=mc_iter)) +
geom_line(size=1, alpha=.3, show.legend = FALSE) +
geom_hline(yintercept=.05, lty=2, col="red") +
xlim(c(2.5, 20)) +
ylim(c(0, 1)) +
scale_color_manual(values=reject_df$color) +
scale_alpha_manual(values=reject_df$alpha) +
ylab("p-value") +
xlab("n") +
theme(axis.text=element_text(size=12),
axis.title=element_text(size=12))
# Bayesian regression under the null hypothesis -----------
b_prior_str <- "normal(0, .75)"
bayes_seed <- 91589
set.seed(bayes_seed)
cred_sequences <- list()
for (b in 1:B) {
rep_start_time <- Sys.time()
n_seq <- initial_n:nrow(df)
df <- df_list[[b]]
df_seq_list <- lapply(4:nrow(df), function(i) df[1:i, ])
bayes_lm <- brm_multiple(y ~ trt,
data=df_seq_list,
family = gaussian(),
prior = c(
set_prior("normal(30, 2)", class = "Intercept"),
set_prior(b_prior_str, class = "b"),
set_prior("normal(1, 0.1)", class = "sigma")
), combine=FALSE)
post_probs <- c()
for (i in 1:length(bayes_lm)) {
samples_i <- as.numeric(unlist(as_draws_list(bayes_lm[[i]], variable='b_trtB')))
p_mean_above_or_below_0 <- min(mean(samples_i < 0), mean(samples_i > 0))
post_probs <- c(post_probs, p_mean_above_or_below_0)
}
cred_sequences <- append(cred_sequences, list(post_probs))
gc(TRUE)
cat("\n\nFinished rep", b, "which took\n")
print(Sys.time() - rep_start_time)
}
p_df <- as.data.frame(cred_sequences)
names(p_df) <- paste0("MC", 1:B)
p_df$n <- n_seq
# Let's see at n=20, the batch analysis, how many were rejected
terminal_p <- p_df %>% filter(n == 20) %>% select(starts_with("MC")) %>% unlist()
mean(terminal_p * 2 < .05)
p_long <- p_df %>%
pivot_longer(., cols=starts_with("MC")) %>%
select(mc_iter=name, n=n, pr_above_or_below=value) %>%
arrange(mc_iter, n)
# For credible interval interpretation of hypothesis testing, alpha / 2 area
# is to either side of the credible interval
reject_df <- p_long %>%
group_by(mc_iter) %>%
summarize(min_p = min(pr_above_or_below),
reject = min(pr_above_or_below) < .05 / 2)
mean(reject_df$reject)
p_long <- p_long %>%
inner_join(reject_df, by="mc_iter") %>%
arrange(mc_iter, n)
reject_df$color <- "grey"
to_display <- sample(reject_df %>% filter(reject) %>% pull(mc_iter), 2)[2]
reject_df[reject_df$mc_iter == to_display[1], "color"] <- "red"
reject_df$alpha <- .05
reject_df[reject_df$mc_iter %in% to_display, "alpha"] <- 1
# Multiplying pr by 2 bc the credible interval would have .025 on each side
ggplot(p_long, aes(x=n, y=pr_above_or_below * 2, col=mc_iter)) +
geom_line(size=1, alpha=.3, show.legend = FALSE) +
geom_hline(yintercept=.05, lty=2, col="red") +
xlim(c(2.5, 20)) +
ylim(c(0, 1)) +
scale_color_manual(values=reject_df$color) +
scale_alpha_manual(values=reject_df$alpha) +
ylab("Inverted Credible Interval probability") +
xlab("n") +
theme(axis.text=element_text(size=12),
axis.title=element_text(size=12))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment