steveharoz/Peeking and DVs.PNG

## Peeking and DVs.PNG

      
    Raw
  

              Peeking and DVs.PNG
            
          
## peeking.PNG

      
    Raw
  

              peeking.PNG
            
          
## peeking.R
library(tidyverse)

# Simulate optional stopping ("peeking") combined with checking several
# dependent variables (DVs), with all data drawn from a null distribution.
#
# Every DV starts with `initial_size` standard-normal draws. Each DV is tested
# with a one-sample t-test; if no p-value is below 0.05 and peeks remain,
# `amount_added_after_peeking` new draws are appended to every DV and all tests
# are rerun.
#
# Arguments:
#   number_of_peeks            - how many times more data may be collected after
#                                a non-significant test; for a non-negative
#                                whole number the data are tested at most
#                                `number_of_peeks + 1` times
#   DVs                        - number of dependent variables (at least 1)
#   initial_size               - observations per DV before the first test
#   amount_added_after_peeking - observations appended to each DV after a
#                                non-significant peek
#
# Returns the smallest p-value across DVs from the final round of tests.
simulate_peeking_and_DVs <- function(number_of_peeks = 5,
                                     DVs = 1,
                                     initial_size = 10,
                                     amount_added_after_peeking = 5) {
  # fail early with a clear message instead of an obscure indexing error
  stopifnot(is.numeric(DVs), length(DVs) == 1, DVs >= 1)

  # one vector of observations per DV, drawn in DV order
  data <- lapply(seq_len(DVs), function(dv) rnorm(initial_size))

  iteration <- 0
  repeat {
    iteration <- iteration + 1

    # one p-value per DV; vapply always yields a numeric vector, so a single DV
    # needs no special case
    ps <- vapply(data, function(d) t.test(d)$p.value, numeric(1))

    # stop once any DV is significant or the allowed peeks are used up
    if (min(ps) < 0.05 || iteration > number_of_peeks) {
      break
    }

    # otherwise collect more data for every DV (same DV order as above)
    data <- lapply(data, function(d) c(d, rnorm(amount_added_after_peeking)))
  }

  min(ps)
}


# number of simulated experiments per parameter combination
SIMULATION_COUNT <- 2000

# Estimate the false-positive rate for every combination of peek count and
# number of DVs: each row of the grid runs SIMULATION_COUNT null experiments,
# and `alpha` is the share whose smallest p-value fell below 0.05.
peeking_data <- expand.grid(
  peeks = c(0:5, 10),
  DVs = 1:4,
  initial_size = 20,
  amount_added_after_peeking = 5
) %>%
  rowwise() %>%
  mutate(
    alpha = mean(replicate(
      SIMULATION_COUNT,
      simulate_peeking_and_DVs(peeks, DVs, initial_size, amount_added_after_peeking) < 0.05
    ))
  ) %>%
  # drop the row-wise grouping so later verbs see an ordinary table
  ungroup()

# Plot the simulated false-positive rate against the number of peeks, one line
# per number of DVs, with a reference line at the nominal 0.05 level.
ggplot(peeking_data) +
  aes(x = peeks, y = alpha, color = factor(DVs)) +
  geom_hline(yintercept = 0.05, alpha = 0.5) +
  # NOTE(review): ggplot2 >= 3.4.0 prefers `linewidth` for line thickness;
  # `size` is kept so the script still behaves the same on older versions
  geom_line(size = 1) +
  #geom_smooth(aes(fill=factor(DVs)), method = "lm") +
  scale_x_continuous(breaks = c(0:5, 10), expand = c(0, 0)) +
  scale_y_continuous(breaks = c(0, .05, .1, .2, .3, .4, .5), expand = c(0, 0)) +
  expand_limits(y = c(0, .5)) +
  # TRUE rather than T: T is an ordinary variable that can be reassigned
  guides(color = guide_legend(reverse = TRUE)) +
  theme_classic(18) +
  theme(plot.subtitle = element_text(family = "Segoe UI light")) +
  labs(
    title = "The impact of peeking at data to determine if data collection should continue or if \nmore dependent variables should be checked",
    subtitle = "Likelihood of getting p<0.05 despite sampling from a null distribution",
    y = NULL,
    color = "Dependent variables", fill = "Dependent variables")
	library(tidyverse)

	# Simulate optional stopping ("peeking") combined with checking several
	# dependent variables (DVs), with all data drawn from a null distribution.
	#
	# Every DV starts with `initial_size` standard-normal draws. Each DV is tested
	# with a one-sample t-test; if no p-value is below 0.05 and peeks remain,
	# `amount_added_after_peeking` new draws are appended to every DV and all tests
	# are rerun.
	#
	# Arguments:
	#   number_of_peeks            - how many times more data may be collected after
	#                                a non-significant test; for a non-negative
	#                                whole number the data are tested at most
	#                                `number_of_peeks + 1` times
	#   DVs                        - number of dependent variables (at least 1)
	#   initial_size               - observations per DV before the first test
	#   amount_added_after_peeking - observations appended to each DV after a
	#                                non-significant peek
	#
	# Returns the smallest p-value across DVs from the final round of tests.
	simulate_peeking_and_DVs <- function(number_of_peeks = 5,
	                                     DVs = 1,
	                                     initial_size = 10,
	                                     amount_added_after_peeking = 5) {
	  # fail early with a clear message instead of an obscure indexing error
	  stopifnot(is.numeric(DVs), length(DVs) == 1, DVs >= 1)

	  # one vector of observations per DV, drawn in DV order
	  data <- lapply(seq_len(DVs), function(dv) rnorm(initial_size))

	  iteration <- 0
	  repeat {
	    iteration <- iteration + 1

	    # one p-value per DV; vapply always yields a numeric vector, so a single DV
	    # needs no special case
	    ps <- vapply(data, function(d) t.test(d)$p.value, numeric(1))

	    # stop once any DV is significant or the allowed peeks are used up
	    if (min(ps) < 0.05 || iteration > number_of_peeks) {
	      break
	    }

	    # otherwise collect more data for every DV (same DV order as above)
	    data <- lapply(data, function(d) c(d, rnorm(amount_added_after_peeking)))
	  }

	  min(ps)
	}



	# number of simulated experiments per parameter combination
	SIMULATION_COUNT <- 2000

	# Estimate the false-positive rate for every combination of peek count and
	# number of DVs: each row of the grid runs SIMULATION_COUNT null experiments,
	# and `alpha` is the share whose smallest p-value fell below 0.05.
	peeking_data <- expand.grid(
	  peeks = c(0:5, 10),
	  DVs = 1:4,
	  initial_size = 20,
	  amount_added_after_peeking = 5
	) %>%
	  rowwise() %>%
	  mutate(
	    alpha = mean(replicate(
	      SIMULATION_COUNT,
	      simulate_peeking_and_DVs(peeks, DVs, initial_size, amount_added_after_peeking) < 0.05
	    ))
	  ) %>%
	  # drop the row-wise grouping so later verbs see an ordinary table
	  ungroup()

	# Plot the simulated false-positive rate against the number of peeks, one line
	# per number of DVs, with a reference line at the nominal 0.05 level.
	ggplot(peeking_data) +
	  aes(x = peeks, y = alpha, color = factor(DVs)) +
	  geom_hline(yintercept = 0.05, alpha = 0.5) +
	  # NOTE(review): ggplot2 >= 3.4.0 prefers `linewidth` for line thickness;
	  # `size` is kept so the script still behaves the same on older versions
	  geom_line(size = 1) +
	  #geom_smooth(aes(fill=factor(DVs)), method = "lm") +
	  scale_x_continuous(breaks = c(0:5, 10), expand = c(0, 0)) +
	  scale_y_continuous(breaks = c(0, .05, .1, .2, .3, .4, .5), expand = c(0, 0)) +
	  expand_limits(y = c(0, .5)) +
	  # TRUE rather than T: T is an ordinary variable that can be reassigned
	  guides(color = guide_legend(reverse = TRUE)) +
	  theme_classic(18) +
	  theme(plot.subtitle = element_text(family = "Segoe UI light")) +
	  labs(
	    title = "The impact of peeking at data to determine if data collection should continue or if \nmore dependent variables should be checked",
	    subtitle = "Likelihood of getting p<0.05 despite sampling from a null distribution",
	    y = NULL,
	    color = "Dependent variables", fill = "Dependent variables")