Skip to content

Instantly share code, notes, and snippets.

@Lakens
Last active August 29, 2015 14:07
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Lakens/7f516efbcc47de9d40a0 to your computer and use it in GitHub Desktop.
Save Lakens/7f516efbcc47de9d40a0 to your computer and use it in GitHub Desktop.
PcurveTrueAndPhackedStudiesFig1_Fig2
# phack
# Requires
# {psych}
# See Also http://rynesherman.com/blog/phack-an-r-function-for-examining-the-effects-of-p-hacking/
phack <- function(initialN=50, hackrate=10, grp1M=0, grp2M=0, grp1SD=1, grp2SD=1, maxN=100, alpha=.05, alternative="greater", graph=TRUE, sims=100000) {
require(psych)
outmat <- matrix(NA, nrow=sims, ncol=6)
for(i in 1:sims) {
grp1 <- rnorm(initialN, grp1M, grp1SD)
grp2 <- rnorm(initialN, grp2M, grp2SD)
initial.r <- cor(c(grp1,grp2), c(rep(1,length(grp1)),rep(0,length(grp2))))
hackcount <- 0
initialp <- t.test(grp1, grp2, alternative=alternative)$p.value
currp <- initialp
while(currp > alpha & maxN > length(grp1)) {
hackcount <- hackcount + 1
grp1 <- c(grp1, rnorm(hackrate, grp1M, grp1SD))
grp2 <- c(grp2, rnorm(hackrate, grp2M, grp2SD))
currp <- t.test(grp1, grp2, alternative=alternative)$p.value
}
Nadded <- length(grp1) - initialN
est.r <- cor(c(grp1, grp2), c(rep(1, length(grp1)),rep(0, length(grp2))))
outmat[i,] <- c(initialp, hackcount, currp, Nadded, initial.r, est.r)
}
outmat <- data.frame(outmat)
colnames(outmat) <- c("Initial.p", "Hackcount", "Final.p", "NAdded", "Initial.r", "Final.r")
InitialSigProb <- sum(outmat$Initial.p <= alpha) / sims
AvgHacks <- mean(outmat$Hackcount)
FinalSigProb <- sum(outmat$Final.p <= alpha) / sims
AvgNAdded <- mean(outmat$NAdded)
AvgTotN <- mean(initialN + outmat$NAdded)
StopProb <- sum((initialN + outmat$NAdded)==maxN) / sims
cat("Proportion of Original Samples Statistically Significant =", InitialSigProb, "\n")
cat("Proportion of Samples Statistically Significant After Hacking =", FinalSigProb, "\n")
cat("Probability of Stopping Before Reaching Significance =", StopProb, "\n")
cat("Average Number of Hacks Before Significant/Stopping =", AvgHacks, "\n")
cat("Average N Added Before Significant/Stopping =", AvgNAdded, "\n")
cat("Average Total N", AvgTotN, "\n")
cat("Estimated r without hacking", round(fisherz2r(mean(fisherz(outmat$Initial.r))),2), "\n")
cat("Estimated r with hacking", round(fisherz2r(mean(fisherz(outmat$Final.r))),2), "\n")
cat("Estimated r with hacking", round(fisherz2r(mean(fisherz(outmat$Final.r[outmat$Final.p<alpha]))),2), "(non-significant results not included)", "\n")
if(graph==TRUE) {
op <- par(mfrow=c(2,1), las=1, font.main=1)
hist(outmat$Initial.p[outmat$Initial.p<alpha], xlab="p-values", main="P-curve for Initial Study",
sub=paste("Number of Significant Studies = ", InitialSigProb*sims, " (", InitialSigProb, ")", sep=""), col="cyan", freq=FALSE)
lines(density(outmat$Initial.p[outmat$Initial.p<alpha]))
hist(outmat$Final.p[outmat$Final.p<alpha], xlab="p-values", main="P-curve for p-Hacked Study",
sub=paste("Number of Significant Studies = ", FinalSigProb*sims, " (", FinalSigProb, ")", sep=""), col="cyan", freq=FALSE)
lines(density(outmat$Final.p[outmat$Final.p<alpha]))
}
return(outmat)
}
#P-hack 100000 studies
res <- phack(initialN=50, hackrate=10, grp1M=0, grp2M=0, grp1SD=1, grp2SD=1, maxN=100, alpha=.05, alternative="two.sided", graph=TRUE, sims=100000)
phack<-(res$Final.p)
phack<-phack[phack<0.05]
length(phack)
hist(phack, main="Collecting 50 participants,\nanalyze data after every participants\nuntil 100 participants are collected", xlab=("Observed p-value"), breaks =10)
#100000 studies with multiple testing
nSims <- 100000 #number of simulated experiments
pmult <-numeric(nSims) #set up empty container for all simulated p-values
for(i in 1:nSims){ #for each simulated experiment
x<-rnorm(n = 23, mean = 100, sd = 20) #produce simulated participants
y1<-rnorm(n = 23, mean = 100, sd = 20) #produce simulated participants
y2<-rnorm(n = 23, mean = 100, sd = 20) #produce simulated participants
y3<-rnorm(n = 23, mean = 100, sd = 20) #produce simulated participants
y4<-rnorm(n = 23, mean = 100, sd = 20) #produce simulated participants
y5<-rnorm(n = 23, mean = 100, sd = 20) #produce simulated participants
t<-t.test(x,y1) #perform the t-test
if(t$p.value<0.05) {
pmult[i]<-t$p.value #get the p-value and store it
}
if(t$p.value>0.05) {
t<-t.test(x,y2) #perform the second t-test
if(t$p.value<0.05) {
pmult[i]<-t$p.value #get the p-value and store it
}
}
if(t$p.value>0.05) {
t<-t.test(x,y3) #perform the third t-test
if(t$p.value<0.05) {
pmult[i]<-t$p.value #get the p-value and store it
}
}
if(t$p.value>0.05) {
t<-t.test(x,y4) #perform the third t-test
if(t$p.value<0.05) {
pmult[i]<-t$p.value #get the p-value and store it
}
}
if(t$p.value>0.05) {
t<-t.test(x,y5) #perform the fifth t-test
pmult[i]<-t$p.value #get the p-value (regardless of whether significant and store it
}
}
pmult<-pmult[pmult<0.05]
length(pmult)
hist(pmult, main="Testing exp. condition against 1/5 control conditions", xlab=("Observed p-value"), breaks =10)
#100000 normal studies 50% power
nSims <- 100000 #number of simulated experiments
p <-numeric(nSims) #set up empty container for all simulated p-values
for(i in 1:nSims){ #for each simulated experiment
x<-rnorm(n = 49, mean = 100, sd = 20) #produce simulated participants
y<-rnorm(n = 49, mean = 108, sd = 20) #produce simulated participants
t<-t.test(x,y) #perform the t-test
p[i]<-t$p.value #get the p-value and store it
}
p<-p[p<0.05]
#plots
hist(p, main="100000 studies with 50% power\nonly true effects", xlab=("Observed p-value"), breaks =10)
ptot<-c(p,phack, pmult)
hist(ptot, main="100000 studies with 50% power and\n200000 studies with p-hacking", xlab=("Observed p-value"), breaks =10)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment