Last active
February 12, 2022 13:49
-
-
Save baogorek/edb6db76054df5cdad9a488ebd99e651 to your computer and use it in GitHub Desktop.
Simulation demonstrating the Frequentist Properties of Bayesian Sequential Analysis
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(dplyr) | |
library(tidyr) | |
library(ggplot2) | |
library(brms) | |
library(qcc) | |
#set.seed(1569) # First 100 | |
set.seed(8536) # Next 1000 | |
# B <- 100 # First time through | |
B <- 1000 | |
N_A <- 10 | |
N_B <- 10 | |
beta <- 0 | |
initial_n <- 4 | |
# interleave A and B and then continue with the one that has more | |
most_frequent <- ifelse(N_A > N_B, "A", "B") | |
trt <- c(rep(c("A", "B"), times = min(N_A, N_B)), rep(most_frequent, abs(N_A - N_B))) | |
# Frequentist multiple datasets under the null hypothesis | |
p_sequences <- list() | |
df_list <- list() # For use with the bayesian procedure | |
for (b in 1:B) { | |
epsilon <- rnorm(N_A + N_B) | |
y <- 30 + beta * (trt == "B") + epsilon | |
df <- data.frame(y=y, trt=trt) | |
df_list <- append(df_list, list(df)) | |
p_values <- c() | |
n_seq <- initial_n:nrow(df) | |
for (n in n_seq) { | |
df_n <- df[1:n, ] | |
freq_lm <- lm(y ~ trt, data=df_n) | |
p_values <- c(p_values, as.numeric(summary(freq_lm)$coefficients[, 4][2])) | |
} | |
p_sequences <- append(p_sequences, list(p_values)) | |
} | |
qcc(df_list[[1]]$y, type = 'xbar.one', ylab = 'y', xlab = 'time') | |
p_df <- as.data.frame(p_sequences) | |
names(p_df) <- paste0("MC", 1:B) | |
p_df$n <- n_seq | |
# Let's see at n=20, the batch analysis, how many were rejected | |
terminal_p <- p_df %>% filter(n == 20) %>% select(starts_with("MC")) %>% unlist() | |
mean(terminal_p < .05) | |
p_long <- p_df %>% | |
pivot_longer(., cols=starts_with("MC")) %>% | |
select(mc_iter=name, n=n, p_value=value) %>% | |
arrange(mc_iter, n) | |
reject_df <- p_long %>% | |
group_by(mc_iter) %>% | |
summarize(min_p = min(p_value), | |
reject = min(p_value) < .05) | |
mean(reject_df$reject) | |
p_long <- p_long %>% | |
inner_join(reject_df, by="mc_iter") %>% | |
arrange(mc_iter, n) | |
reject_df$color <- "grey" | |
to_display <- sample(reject_df %>% filter(reject) %>% pull(mc_iter), 1) | |
reject_df[reject_df$mc_iter == to_display[1], "color"] <- "red" | |
reject_df$alpha <- .05 | |
reject_df[reject_df$mc_iter %in% to_display, "alpha"] <- 1 | |
ggplot(p_long, aes(x=n, y=p_value, col=mc_iter)) + | |
geom_line(size=1, alpha=.3, show.legend = FALSE) + | |
geom_hline(yintercept=.05, lty=2, col="red") + | |
xlim(c(2.5, 20)) + | |
ylim(c(0, 1)) + | |
scale_color_manual(values=reject_df$color) + | |
scale_alpha_manual(values=reject_df$alpha) + | |
ylab("p-value") + | |
xlab("n") + | |
theme(axis.text=element_text(size=12), | |
axis.title=element_text(size=12)) | |
# Bayesian regression under the null hypothesis ----------- | |
b_prior_str <- "normal(0, .75)" | |
bayes_seed <- 91589 | |
set.seed(bayes_seed) | |
cred_sequences <- list() | |
for (b in 1:B) { | |
rep_start_time <- Sys.time() | |
n_seq <- initial_n:nrow(df) | |
df <- df_list[[b]] | |
df_seq_list <- lapply(4:nrow(df), function(i) df[1:i, ]) | |
bayes_lm <- brm_multiple(y ~ trt, | |
data=df_seq_list, | |
family = gaussian(), | |
prior = c( | |
set_prior("normal(30, 2)", class = "Intercept"), | |
set_prior(b_prior_str, class = "b"), | |
set_prior("normal(1, 0.1)", class = "sigma") | |
), combine=FALSE) | |
post_probs <- c() | |
for (i in 1:length(bayes_lm)) { | |
samples_i <- as.numeric(unlist(as_draws_list(bayes_lm[[i]], variable='b_trtB'))) | |
p_mean_above_or_below_0 <- min(mean(samples_i < 0), mean(samples_i > 0)) | |
post_probs <- c(post_probs, p_mean_above_or_below_0) | |
} | |
cred_sequences <- append(cred_sequences, list(post_probs)) | |
gc(TRUE) | |
cat("\n\nFinished rep", b, "which took\n") | |
print(Sys.time() - rep_start_time) | |
} | |
p_df <- as.data.frame(cred_sequences) | |
names(p_df) <- paste0("MC", 1:B) | |
p_df$n <- n_seq | |
# Let's see at n=20, the batch analysis, how many were rejected | |
terminal_p <- p_df %>% filter(n == 20) %>% select(starts_with("MC")) %>% unlist() | |
mean(terminal_p * 2 < .05) | |
p_long <- p_df %>% | |
pivot_longer(., cols=starts_with("MC")) %>% | |
select(mc_iter=name, n=n, pr_above_or_below=value) %>% | |
arrange(mc_iter, n) | |
# For credible interval interpretation of hypothesis testing, alpha / 2 area | |
# is to either side of the credible interval | |
reject_df <- p_long %>% | |
group_by(mc_iter) %>% | |
summarize(min_p = min(pr_above_or_below), | |
reject = min(pr_above_or_below) < .05 / 2) | |
mean(reject_df$reject) | |
p_long <- p_long %>% | |
inner_join(reject_df, by="mc_iter") %>% | |
arrange(mc_iter, n) | |
reject_df$color <- "grey" | |
to_display <- sample(reject_df %>% filter(reject) %>% pull(mc_iter), 2)[2] | |
reject_df[reject_df$mc_iter == to_display[1], "color"] <- "red" | |
reject_df$alpha <- .05 | |
reject_df[reject_df$mc_iter %in% to_display, "alpha"] <- 1 | |
# Multiplying pr by 2 bc the credible interval would have .025 on each side | |
ggplot(p_long, aes(x=n, y=pr_above_or_below * 2, col=mc_iter)) + | |
geom_line(size=1, alpha=.3, show.legend = FALSE) + | |
geom_hline(yintercept=.05, lty=2, col="red") + | |
xlim(c(2.5, 20)) + | |
ylim(c(0, 1)) + | |
scale_color_manual(values=reject_df$color) + | |
scale_alpha_manual(values=reject_df$alpha) + | |
ylab("Inverted Credible Interval probability") + | |
xlab("n") + | |
theme(axis.text=element_text(size=12), | |
axis.title=element_text(size=12)) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment