Last active
September 15, 2019 11:11
-
-
Save steveharoz/49f90b81ccb1fcae9bedd635f0b6aa72 to your computer and use it in GitHub Desktop.
peeking
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(tidyverse) | |
#' Simulate optional stopping ("peeking") across one or more dependent variables
#'
#' Draws `initial_size` standard-normal observations for each of `DVs`
#' dependent variables, runs a one-sample t-test on each, and stops as soon as
#' any p-value drops below `alpha`. Otherwise `amount_added_after_peeking` new
#' observations are appended to every DV and the tests are re-run, for at most
#' `number_of_peeks` additional looks after the initial one.
#'
#' @param number_of_peeks Maximum number of extra looks after the first test
#'   (0 means the data are tested exactly once).
#' @param DVs Number of dependent variables simulated and tested (>= 1).
#' @param initial_size Observations per DV before the first test (>= 2).
#' @param amount_added_after_peeking Observations appended to each DV after
#'   every non-significant look.
#' @param alpha Significance threshold that ends data collection.
#' @return The smallest p-value across the DVs at the final look.
simulate_peeking_and_DVs <- function(number_of_peeks = 5, DVs = 1, initial_size = 10,
                                     amount_added_after_peeking = 5, alpha = 0.05) {
  # fail early with a readable message instead of a cryptic indexing error
  if (!isTRUE(DVs >= 1)) {
    stop("`DVs` must be a single number >= 1", call. = FALSE)
  }
  if (!isTRUE(initial_size >= 2)) {
    stop("`initial_size` must be a single number >= 2", call. = FALSE)
  }

  # simulate the initially collected data: one null-distributed sample per DV
  # (samples are drawn DV by DV, in order)
  data <- lapply(seq_len(DVs), function(dv) rnorm(initial_size))

  looks <- 0
  repeat {
    looks <- looks + 1

    # one-sample t-test on each DV; vapply() always yields a numeric vector,
    # so no special case is needed for a single DV
    ps <- vapply(data, function(d) t.test(d)$p.value, numeric(1))

    # stop once any DV is significant or the allowed extra looks are used up
    if (min(ps) < alpha || looks > number_of_peeks) {
      break
    }

    # nothing significant yet: collect more data for every DV
    data <- lapply(data, function(d) c(d, rnorm(amount_added_after_peeking)))
  }

  # smallest p-value from the final look
  min(ps)
}
# number of simulated experiments per parameter combination
SIMULATION_COUNT <- 2000

# simulate runs with data peeking and multiple DVs:
# for every combination of peek count and number of DVs, `alpha` is the share
# of simulated experiments whose smallest p-value falls below 0.05
peeking_data <- expand.grid(
  peeks = c(0:5, 10),
  DVs = 1:4,
  initial_size = 20,
  amount_added_after_peeking = 5
) %>%
  rowwise() %>%
  mutate(
    alpha = mean(replicate(
      SIMULATION_COUNT,
      simulate_peeking_and_DVs(peeks, DVs, initial_size, amount_added_after_peeking) < 0.05
    ))
  ) %>%
  # drop the row-wise grouping so it does not leak into later operations
  ungroup()

# plot the estimated false-positive rate against the number of peeks,
# one line per number of dependent variables
ggplot(peeking_data) +
  aes(x = peeks, y = alpha, color = factor(DVs)) +
  # reference line at the nominal 5% false-positive rate
  geom_hline(yintercept = 0.05, alpha = 0.5) +
  # NOTE: ggplot2 >= 3.4.0 prefers `linewidth` over `size` for lines;
  # `size` is kept here so the script also runs on older ggplot2 versions
  geom_line(size = 1) +
  #geom_smooth(aes(fill=factor(DVs)), method = "lm") +
  scale_x_continuous(breaks = c(0:5, 10), expand = c(0, 0)) +
  scale_y_continuous(breaks = c(0, .05, .1, .2, .3, .4, .5), expand = c(0, 0)) +
  expand_limits(y = c(0, .5)) +
  guides(color = guide_legend(reverse = TRUE)) +
  theme_classic(18) +
  theme(plot.subtitle = element_text(family = "Segoe UI light")) +
  labs(
    title = "The impact of peeking at data to determine if data collection should continue or if \nmore dependent variables should be checked",
    subtitle = "Likelihood of getting p<0.05 despite sampling from a null distribution",
    y = NULL,
    color = "Dependent variables", fill = "Dependent variables")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment