Lakens/gist:7f516efbcc47de9d40a0

## gistfile1.r
#                               phack

# Requires
# {psych}

# See Also http://rynesherman.com/blog/phack-an-r-function-for-examining-the-effects-of-p-hacking/

phack <- function(initialN=50, hackrate=10, grp1M=0, grp2M=0, grp1SD=1, grp2SD=1, maxN=100, alpha=.05, alternative="greater", graph=TRUE, sims=100000) {
  require(psych)
  outmat <- matrix(NA, nrow=sims, ncol=6)
  for(i in 1:sims) {
    grp1 <- rnorm(initialN, grp1M, grp1SD)
    grp2 <- rnorm(initialN, grp2M, grp2SD)
    initial.r <- cor(c(grp1,grp2), c(rep(1,length(grp1)),rep(0,length(grp2))))
    hackcount <- 0
    initialp <- t.test(grp1, grp2, alternative=alternative)$p.value
    currp <- initialp
    while(currp > alpha & maxN > length(grp1)) {
      hackcount <- hackcount + 1
      grp1 <- c(grp1, rnorm(hackrate, grp1M, grp1SD))
      grp2 <- c(grp2, rnorm(hackrate, grp2M, grp2SD))
      currp <- t.test(grp1, grp2, alternative=alternative)$p.value
    }
    Nadded <- length(grp1) - initialN
    est.r <- cor(c(grp1, grp2), c(rep(1, length(grp1)),rep(0, length(grp2))))
    outmat[i,] <- c(initialp, hackcount, currp, Nadded, initial.r, est.r)
  }
  outmat <- data.frame(outmat)
  colnames(outmat) <- c("Initial.p", "Hackcount", "Final.p", "NAdded", "Initial.r", "Final.r")
  InitialSigProb <- sum(outmat$Initial.p <= alpha) / sims
  AvgHacks <- mean(outmat$Hackcount)
  FinalSigProb <- sum(outmat$Final.p <= alpha) / sims
  AvgNAdded <- mean(outmat$NAdded)
  AvgTotN <- mean(initialN + outmat$NAdded)
  StopProb <- sum((initialN + outmat$NAdded)==maxN) / sims
  cat("Proportion of Original Samples Statistically Significant =", InitialSigProb, "\n")
  cat("Proportion of Samples Statistically Significant After Hacking =", FinalSigProb, "\n")
  cat("Probability of Stopping Before Reaching Significance =", StopProb, "\n")
  cat("Average Number of Hacks Before Significant/Stopping =", AvgHacks, "\n")
  cat("Average N Added Before Significant/Stopping =", AvgNAdded, "\n")
  cat("Average Total N", AvgTotN, "\n")
  cat("Estimated r without hacking", round(fisherz2r(mean(fisherz(outmat$Initial.r))),2), "\n")
  cat("Estimated r with hacking", round(fisherz2r(mean(fisherz(outmat$Final.r))),2), "\n")
  cat("Estimated r with hacking", round(fisherz2r(mean(fisherz(outmat$Final.r[outmat$Final.p<alpha]))),2), "(non-significant results not included)", "\n")
  if(graph==TRUE) {
    op <- par(mfrow=c(2,1), las=1, font.main=1)
    hist(outmat$Initial.p[outmat$Initial.p<alpha], xlab="p-values", main="P-curve for Initial Study",
         sub=paste("Number of Significant Studies = ", InitialSigProb*sims, " (", InitialSigProb, ")", sep=""), col="cyan", freq=FALSE)
    lines(density(outmat$Initial.p[outmat$Initial.p<alpha]))
    hist(outmat$Final.p[outmat$Final.p<alpha], xlab="p-values", main="P-curve for p-Hacked Study",
         sub=paste("Number of Significant Studies = ", FinalSigProb*sims, " (", FinalSigProb, ")", sep=""), col="cyan", freq=FALSE)
    lines(density(outmat$Final.p[outmat$Final.p<alpha]))
  }
  return(outmat)
}

#P-hack 100000 studies
res <- phack(initialN=50, hackrate=10, grp1M=0, grp2M=0, grp1SD=1, grp2SD=1, maxN=100, alpha=.05, alternative="two.sided", graph=TRUE, sims=100000)

phack<-(res$Final.p)
phack<-phack[phack<0.05]
length(phack)
hist(phack, main="Collecting 50 participants,\nanalyze data after every participants\nuntil 100 participants are collected", xlab=("Observed p-value"), breaks =10)

#100000 studies with multiple testing

nSims <- 100000 #number of simulated experiments
pmult <-numeric(nSims) #set up empty container for all simulated p-values

for(i in 1:nSims){ #for each simulated experiment
  x<-rnorm(n = 23, mean = 100, sd = 20) #produce  simulated participants
  y1<-rnorm(n = 23, mean = 100, sd = 20) #produce  simulated participants
  y2<-rnorm(n = 23, mean = 100, sd = 20) #produce  simulated participants
  y3<-rnorm(n = 23, mean = 100, sd = 20) #produce  simulated participants
  y4<-rnorm(n = 23, mean = 100, sd = 20) #produce  simulated participants
  y5<-rnorm(n = 23, mean = 100, sd = 20) #produce  simulated participants
  t<-t.test(x,y1) #perform the t-test
  if(t$p.value<0.05) {
    pmult[i]<-t$p.value #get the p-value and store it
  }
  if(t$p.value>0.05) {
    t<-t.test(x,y2) #perform the second t-test
    if(t$p.value<0.05) {
      pmult[i]<-t$p.value #get the p-value and store it
    }
  }
  if(t$p.value>0.05) {
    t<-t.test(x,y3) #perform the third t-test
    if(t$p.value<0.05) {
      pmult[i]<-t$p.value #get the p-value and store it
    }
  }
  if(t$p.value>0.05) {
    t<-t.test(x,y4) #perform the third t-test
    if(t$p.value<0.05) {
      pmult[i]<-t$p.value #get the p-value and store it
    }
  }
  if(t$p.value>0.05) {
    t<-t.test(x,y5) #perform the fifth t-test
    pmult[i]<-t$p.value #get the p-value (regardless of whether significant and store it
  }
}
pmult<-pmult[pmult<0.05]
length(pmult)
hist(pmult, main="Testing exp. condition against 1/5 control conditions", xlab=("Observed p-value"), breaks =10)


#100000 normal studies 50% power
nSims <- 100000 #number of simulated experiments
p <-numeric(nSims) #set up empty container for all simulated p-values

for(i in 1:nSims){ #for each simulated experiment
  x<-rnorm(n = 49, mean = 100, sd = 20) #produce  simulated participants
  y<-rnorm(n = 49, mean = 108, sd = 20) #produce  simulated participants
  t<-t.test(x,y) #perform the t-test
  p[i]<-t$p.value #get the p-value and store it
}
p<-p[p<0.05]


#plots
hist(p, main="100000 studies with 50% power\nonly true effects", xlab=("Observed p-value"), breaks =10)
ptot<-c(p,phack, pmult)
hist(ptot, main="100000 studies with 50% power and\n200000 studies with p-hacking", xlab=("Observed p-value"), breaks =10)
	# phack

	# Requires
	# {psych}

	# See Also http://rynesherman.com/blog/phack-an-r-function-for-examining-the-effects-of-p-hacking/

	phack <- function(initialN=50, hackrate=10, grp1M=0, grp2M=0, grp1SD=1, grp2SD=1, maxN=100, alpha=.05, alternative="greater", graph=TRUE, sims=100000) {
	require(psych)
	outmat <- matrix(NA, nrow=sims, ncol=6)
	for(i in 1:sims) {
	grp1 <- rnorm(initialN, grp1M, grp1SD)
	grp2 <- rnorm(initialN, grp2M, grp2SD)
	initial.r <- cor(c(grp1,grp2), c(rep(1,length(grp1)),rep(0,length(grp2))))
	hackcount <- 0
	initialp <- t.test(grp1, grp2, alternative=alternative)$p.value
	currp <- initialp
	while(currp > alpha & maxN > length(grp1)) {
	hackcount <- hackcount + 1
	grp1 <- c(grp1, rnorm(hackrate, grp1M, grp1SD))
	grp2 <- c(grp2, rnorm(hackrate, grp2M, grp2SD))
	currp <- t.test(grp1, grp2, alternative=alternative)$p.value
	}
	Nadded <- length(grp1) - initialN
	est.r <- cor(c(grp1, grp2), c(rep(1, length(grp1)),rep(0, length(grp2))))
	outmat[i,] <- c(initialp, hackcount, currp, Nadded, initial.r, est.r)
	}
	outmat <- data.frame(outmat)
	colnames(outmat) <- c("Initial.p", "Hackcount", "Final.p", "NAdded", "Initial.r", "Final.r")
	InitialSigProb <- sum(outmat$Initial.p <= alpha) / sims
	AvgHacks <- mean(outmat$Hackcount)
	FinalSigProb <- sum(outmat$Final.p <= alpha) / sims
	AvgNAdded <- mean(outmat$NAdded)
	AvgTotN <- mean(initialN + outmat$NAdded)
	StopProb <- sum((initialN + outmat$NAdded)==maxN) / sims
	cat("Proportion of Original Samples Statistically Significant =", InitialSigProb, "\n")
	cat("Proportion of Samples Statistically Significant After Hacking =", FinalSigProb, "\n")
	cat("Probability of Stopping Before Reaching Significance =", StopProb, "\n")
	cat("Average Number of Hacks Before Significant/Stopping =", AvgHacks, "\n")
	cat("Average N Added Before Significant/Stopping =", AvgNAdded, "\n")
	cat("Average Total N", AvgTotN, "\n")
	cat("Estimated r without hacking", round(fisherz2r(mean(fisherz(outmat$Initial.r))),2), "\n")
	cat("Estimated r with hacking", round(fisherz2r(mean(fisherz(outmat$Final.r))),2), "\n")
	cat("Estimated r with hacking", round(fisherz2r(mean(fisherz(outmat$Final.r[outmat$Final.p<alpha]))),2), "(non-significant results not included)", "\n")
	if(graph==TRUE) {
	op <- par(mfrow=c(2,1), las=1, font.main=1)
	hist(outmat$Initial.p[outmat$Initial.p<alpha], xlab="p-values", main="P-curve for Initial Study",
	sub=paste("Number of Significant Studies = ", InitialSigProb*sims, " (", InitialSigProb, ")", sep=""), col="cyan", freq=FALSE)
	lines(density(outmat$Initial.p[outmat$Initial.p<alpha]))
	hist(outmat$Final.p[outmat$Final.p<alpha], xlab="p-values", main="P-curve for p-Hacked Study",
	sub=paste("Number of Significant Studies = ", FinalSigProb*sims, " (", FinalSigProb, ")", sep=""), col="cyan", freq=FALSE)
	lines(density(outmat$Final.p[outmat$Final.p<alpha]))
	}
	return(outmat)
	}

	#P-hack 100000 studies
	res <- phack(initialN=50, hackrate=10, grp1M=0, grp2M=0, grp1SD=1, grp2SD=1, maxN=100, alpha=.05, alternative="two.sided", graph=TRUE, sims=100000)

	phack<-(res$Final.p)
	phack<-phack[phack<0.05]
	length(phack)
	hist(phack, main="Collecting 50 participants,\nanalyze data after every participants\nuntil 100 participants are collected", xlab=("Observed p-value"), breaks =10)

	#100000 studies with multiple testing

	nSims <- 100000 #number of simulated experiments
	pmult <-numeric(nSims) #set up empty container for all simulated p-values

	for(i in 1:nSims){ #for each simulated experiment
	x<-rnorm(n = 23, mean = 100, sd = 20) #produce simulated participants
	y1<-rnorm(n = 23, mean = 100, sd = 20) #produce simulated participants
	y2<-rnorm(n = 23, mean = 100, sd = 20) #produce simulated participants
	y3<-rnorm(n = 23, mean = 100, sd = 20) #produce simulated participants
	y4<-rnorm(n = 23, mean = 100, sd = 20) #produce simulated participants
	y5<-rnorm(n = 23, mean = 100, sd = 20) #produce simulated participants
	t<-t.test(x,y1) #perform the t-test
	if(t$p.value<0.05) {
	pmult[i]<-t$p.value #get the p-value and store it
	}
	if(t$p.value>0.05) {
	t<-t.test(x,y2) #perform the second t-test
	if(t$p.value<0.05) {
	pmult[i]<-t$p.value #get the p-value and store it
	}
	}
	if(t$p.value>0.05) {
	t<-t.test(x,y3) #perform the third t-test
	if(t$p.value<0.05) {
	pmult[i]<-t$p.value #get the p-value and store it
	}
	}
	if(t$p.value>0.05) {
	t<-t.test(x,y4) #perform the third t-test
	if(t$p.value<0.05) {
	pmult[i]<-t$p.value #get the p-value and store it
	}
	}
	if(t$p.value>0.05) {
	t<-t.test(x,y5) #perform the fifth t-test
	pmult[i]<-t$p.value #get the p-value (regardless of whether significant and store it
	}
	}
	pmult<-pmult[pmult<0.05]
	length(pmult)
	hist(pmult, main="Testing exp. condition against 1/5 control conditions", xlab=("Observed p-value"), breaks =10)


	#100000 normal studies 50% power
	nSims <- 100000 #number of simulated experiments
	p <-numeric(nSims) #set up empty container for all simulated p-values

	for(i in 1:nSims){ #for each simulated experiment
	x<-rnorm(n = 49, mean = 100, sd = 20) #produce simulated participants
	y<-rnorm(n = 49, mean = 108, sd = 20) #produce simulated participants
	t<-t.test(x,y) #perform the t-test
	p[i]<-t$p.value #get the p-value and store it
	}
	p<-p[p<0.05]



	#plots
	hist(p, main="100000 studies with 50% power\nonly true effects", xlab=("Observed p-value"), breaks =10)
	ptot<-c(p,phack, pmult)
	hist(ptot, main="100000 studies with 50% power and\n200000 studies with p-hacking", xlab=("Observed p-value"), breaks =10)