Skip to content

Instantly share code, notes, and snippets.

@steveharoz
Last active September 15, 2019 11:11
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save steveharoz/49f90b81ccb1fcae9bedd635f0b6aa72 to your computer and use it in GitHub Desktop.
Save steveharoz/49f90b81ccb1fcae9bedd635f0b6aa72 to your computer and use it in GitHub Desktop.
peeking
library(tidyverse)
# Simulate one "experiment" run with optional stopping on null data.
#
# Every dependent variable (DV) is drawn from a standard normal distribution,
# so the true mean is 0 and any p < 0.05 is a false positive. After each
# round of one-sample t-tests the run stops if any DV is significant;
# otherwise more observations are added to every DV and the tests are rerun.
#
# Arguments:
#   number_of_peeks            - how many times more data may be added after
#                                the first test (0 = a single, honest test)
#   DVs                        - number of dependent variables tested (>= 1)
#   initial_size               - observations per DV before the first test
#   amount_added_after_peeking - observations added per DV after each
#                                non-significant peek
#
# Returns: the smallest p-value across DVs at the moment the run stopped.
simulate_peeking_and_DVs <- function(number_of_peeks = 5, DVs = 1, initial_size = 10, amount_added_after_peeking = 5) {
  # Fail fast with a readable message instead of an opaque subscript or
  # `if` error deeper in the function.
  if (!is.numeric(DVs) || length(DVs) != 1 || is.na(DVs) || DVs < 1) {
    stop("`DVs` must be a single number >= 1", call. = FALSE)
  }
  if (!is.numeric(number_of_peeks) || length(number_of_peeks) != 1 ||
        is.na(number_of_peeks)) {
    stop("`number_of_peeks` must be a single non-missing number", call. = FALSE)
  }

  # as.integer() truncates exactly like `1:DVs` did, while seq_len() stays
  # safe at the boundaries.
  dv_index <- seq_len(as.integer(DVs))

  # simulate some collected data (one vector per DV, drawn in DV order)
  data <- lapply(dv_index, function(dv) rnorm(initial_size))

  iteration <- 0
  repeat {
    iteration <- iteration + 1

    # run a t-test on each DV; vapply guarantees a plain numeric vector for
    # any number of DVs, so no single-DV special case is needed
    ps <- vapply(data, function(d) t.test(d)$p.value, numeric(1))

    # if any DV's test passes the significance threshold, or if max data
    # peeks reached, stop
    if (min(ps) < 0.05 || iteration > number_of_peeks) {
      break
    }

    # collect more data for each DV (same draw order as the initial sample)
    data <- lapply(data, function(d) c(d, rnorm(amount_added_after_peeking)))
  }

  # the smallest p-value is what a researcher reporting "the DV that worked"
  # would see
  min(ps)
}
# Number of simulated experiments per parameter combination; the estimated
# false-positive rate is the share of these runs that end with p < 0.05.
SIMULATION_COUNT <- 2000

# simulate runs with data peeking and multiple DVs:
# one row per (peeks, DVs) combination, with `alpha` holding the observed
# proportion of simulated runs whose smallest p-value fell below 0.05.
peeking_data <- expand.grid(
  peeks = c(0:5, 10),
  DVs = 1:4,
  initial_size = 20,
  amount_added_after_peeking = 5
) %>%
  # the simulation takes scalar arguments, so evaluate it once per row
  rowwise() %>%
  mutate(
    alpha = mean(replicate(
      SIMULATION_COUNT,
      simulate_peeking_and_DVs(peeks, DVs, initial_size, amount_added_after_peeking) < 0.05
    ))
  ) %>%
  # drop the row-wise grouping so later verbs operate on whole columns again
  ungroup()
# Plot the estimated false-positive rate against the number of peeks, with
# one line per number of dependent variables.
ggplot(peeking_data) +
  aes(x = peeks, y = alpha, color = factor(DVs)) +
  # reference line at the nominal 5% false-positive rate
  geom_hline(yintercept = 0.05, alpha = 0.5) +
  # NOTE(review): ggplot2 >= 3.4.0 prefers `linewidth` for line geoms; `size`
  # is kept here so the script still runs on older ggplot2 installs.
  geom_line(size = 1) +
  #geom_smooth(aes(fill=factor(DVs)), method = "lm") +
  # put ticks exactly at the simulated peek counts and remove axis padding
  scale_x_continuous(breaks = c(0:5, 10), expand = c(0, 0)) +
  scale_y_continuous(breaks = c(0, .05, .1, .2, .3, .4, .5), expand = c(0, 0)) +
  # always show the full 0-0.5 range, even if the simulated rates are lower
  expand_limits(y = c(0, .5)) +
  # reverse the legend order (highest DV count listed first)
  guides(color = guide_legend(reverse = TRUE)) +
  theme_classic(18) +
  theme(plot.subtitle = element_text(family = "Segoe UI light")) +
  labs(
    title = "The impact of peeking at data to determine if data collection should continue or if \nmore dependent variables should be checked",
    subtitle = "Likelihood of getting p<0.05 despite sampling from a null distribution",
    y = NULL,
    color = "Dependent variables", fill = "Dependent variables")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment