fdabl/pcurve.R

## pcurve.R
library('pwr') # for power calculations
library('foreign') # for dreadful spss .sav files
library('psych') # required by phack
source('http://rynesherman.com/phack.r')

SIM <- 10000
set.seed(42)
options(warn=-1) # avoid warning by read.spss
############################################
# Simulating p-hacking
############################################

run_phack <- function() {
    # call it later
    res <- phack(initialN=30, hackrate=3, grp1M=0, grp2M=0, grp1SD=1, grp2SD=1, maxN=120,
                 alpha=0.05, alternative='two.sided', graph=TRUE, sims=3000)
    par(mfrow=c(1,1)) # recover from phack plots
}


############################################
# Distribution of p-values
############################################

testit <- function(d) {
    x <- rnorm(mean=0, sd=1, 100)
    y <- rnorm(mean=d, sd=1, 100)
    t.test(x, y)$p.value
}

simplot <- function(pvalues, effect=TRUE) {
    h <- hist(pvalues)
    h$counts <- h$counts / sum(h$counts)
    title <- sprintf('Distribution of p-values given %s effect', ifelse(effect, 'an', 'no'))
    plot(h, freq=FALSE, col='blue', main=title, xlab='p-values', ylab='percentage')
}


############################################
# Relation between power and size of p-values
############################################
power <- seq(0.1, 0.9, 0.1)

test <- function(power, d=0) {
    n <- pwr.t.test(d=0.43, power=power, sig.level=0.05)$n
    x <- rnorm(mean=0, sd=1, n)
    y <- rnorm(mean=d, sd=1, n)
    t.test(x, y)$p.value
}

pplot <- function(percent, power) {
    barplot(percent, col='blue', xlab='power', ylab='p-values <= 0.02', names.arg=power,
            main='Frequency of p-values <= 0.02 in p-values < 0.05 given H1')
}

##########################################
# P - Curve
# Logic of p-curve:
# Computes the p value of the p value, i.e. what is the probability of obtaining a p value
# as extreme or more as the one observed, given that the p values are uniformly distributed (H0).

# e.g. p = 0.01 => ppvalue of 0.2
# generally: ppvalue = p/0.05
# We use Fisher's method to get a chisq distribution for skew.
#
# Practical Example:
# p-curve analysis on a recent PLoS One paper demonstrating publication bias http://goo.gl/lxAkKx
#################################

# TODO: test against 33% Power [relies on non-central distributions]

fisher <- function(pvalues, right=TRUE) {
    # test for right-skew per default
    # -2 times the sum of the natural log of each of k
    # uniform distributions is distributed X² with df = 2k
    ppvalues <- sapply(pvalues, function(p) p/0.05)

    if (!right)
        ppvalues <- 1 - ppvalues

    df <- 2 * length(ppvalues)
    x <- -2 * sum(sapply(ppvalues, log))
    p <- 1 - pchisq(x, df)
    return(list(p=p, df=df, value=x))
}


curveplot <- function(pvalues) {
    h <- hist(pvalues)
    h$counts <- h$counts / sum(h$counts) # use percentages
    title <- sprintf('Distribution of %i p-values', length(pvalues))
    plot(h, col='blue', xlab='p-values', ylab='Percentage', main=title)
}


############################################
# ####################    Main
############################################
wait <- function(info) {
    info <- sprintf('Press [enter] to %s \n', info)
    readline(prompt=info)
}

# phacking
wait('simulate phacking')
cat('running simulations ... \n')
run_phack()
cat('\n')


# distribution
cat('running simulations (like, all the time) ... \n')
effect <- replicate(SIM, testit(0.4))
noeffect <- replicate(SIM, testit(0))

wait('see the distribution of p-values given no effect')
simplot(noeffect, effect=FALSE)

wait('see the distribution of p-values given an effect')
simplot(effect[effect <= 0.05])


# relation power and p-values
wait('see the relation of power and p-values')
cat('needs more simulations (you know the drill) ... \n\n')

effect <- sapply(power, function(p) replicate(SIM / length(power), test(p, d=0.43)))
signp <- effect[effect <= 0.05]
smallp <- function(col) sum(col <= 0.02) / length(smallp)
percent <- apply(effect, 2, smallp)
pplot(percent, power)


# p-curve analysis
wait('see a p-curve analysis of 347 p-values')

data <- read.spss('plos.sav')
pvalues <- data$p
pvalues <- pvalues[!is.na(pvalues)] # remove NA
pvalues <- pvalues[pvalues <= 0.05]

f <- fisher(pvalues)
info <- sprintf('Heavily right-skewed, with Chi^2(%s) = %.2f, p = %.2f\n', f$df, f$value, f$p)
cat(info)
curveplot(pvalues)
	library('pwr') # for power calculations
	library('foreign') # for dreadful spss .sav files
	library('psych') # required by phack
	source('http://rynesherman.com/phack.r')

	SIM <- 10000
	set.seed(42)
	options(warn=-1) # avoid warning by read.spss
	############################################
	# Simulating p-hacking
	############################################

	run_phack <- function() {
	# call it later
	res <- phack(initialN=30, hackrate=3, grp1M=0, grp2M=0, grp1SD=1, grp2SD=1, maxN=120,
	alpha=0.05, alternative='two.sided', graph=TRUE, sims=3000)
	par(mfrow=c(1,1)) # recover from phack plots
	}


	############################################
	# Distribution of p-values
	############################################

	testit <- function(d) {
	x <- rnorm(mean=0, sd=1, 100)
	y <- rnorm(mean=d, sd=1, 100)
	t.test(x, y)$p.value
	}

	simplot <- function(pvalues, effect=TRUE) {
	h <- hist(pvalues)
	h$counts <- h$counts / sum(h$counts)
	title <- sprintf('Distribution of p-values given %s effect', ifelse(effect, 'an', 'no'))
	plot(h, freq=FALSE, col='blue', main=title, xlab='p-values', ylab='percentage')
	}


	############################################
	# Relation between power and size of p-values
	############################################
	power <- seq(0.1, 0.9, 0.1)

	test <- function(power, d=0) {
	n <- pwr.t.test(d=0.43, power=power, sig.level=0.05)$n
	x <- rnorm(mean=0, sd=1, n)
	y <- rnorm(mean=d, sd=1, n)
	t.test(x, y)$p.value
	}

	pplot <- function(percent, power) {
	barplot(percent, col='blue', xlab='power', ylab='p-values <= 0.02', names.arg=power,
	main='Frequency of p-values <= 0.02 in p-values < 0.05 given H1')
	}

	##########################################
	# P - Curve
	# Logic of p-curve:
	# Computes the p value of the p value, i.e. what is the probability of obtaining a p value
	# as extreme or more as the one observed, given that the p values are uniformly distributed (H0).

	# e.g. p = 0.01 => ppvalue of 0.2
	# generally: ppvalue = p/0.05
	# We use Fisher's method to get a chisq distribution for skew.
	#
	# Practical Example:
	# p-curve analysis on a recent PLoS One paper demonstrating publication bias http://goo.gl/lxAkKx
	#################################

	# TODO: test against 33% Power [relies on non-central distributions]

	fisher <- function(pvalues, right=TRUE) {
	# test for right-skew per default
	# -2 times the sum of the natural log of each of k
	# uniform distributions is distributed X² with df = 2k
	ppvalues <- sapply(pvalues, function(p) p/0.05)

	if (!right)
	ppvalues <- 1 - ppvalues

	df <- 2 * length(ppvalues)
	x <- -2 * sum(sapply(ppvalues, log))
	p <- 1 - pchisq(x, df)
	return(list(p=p, df=df, value=x))
	}


	curveplot <- function(pvalues) {
	h <- hist(pvalues)
	h$counts <- h$counts / sum(h$counts) # use percentages
	title <- sprintf('Distribution of %i p-values', length(pvalues))
	plot(h, col='blue', xlab='p-values', ylab='Percentage', main=title)
	}


	############################################
	# #################### Main
	############################################
	wait <- function(info) {
	info <- sprintf('Press [enter] to %s \n', info)
	readline(prompt=info)
	}

	# phacking
	wait('simulate phacking')
	cat('running simulations ... \n')
	run_phack()
	cat('\n')


	# distribution
	cat('running simulations (like, all the time) ... \n')
	effect <- replicate(SIM, testit(0.4))
	noeffect <- replicate(SIM, testit(0))

	wait('see the distribution of p-values given no effect')
	simplot(noeffect, effect=FALSE)

	wait('see the distribution of p-values given an effect')
	simplot(effect[effect <= 0.05])


	# relation power and p-values
	wait('see the relation of power and p-values')
	cat('needs more simulations (you know the drill) ... \n\n')

	effect <- sapply(power, function(p) replicate(SIM / length(power), test(p, d=0.43)))
	signp <- effect[effect <= 0.05]
	smallp <- function(col) sum(col <= 0.02) / length(smallp)
	percent <- apply(effect, 2, smallp)
	pplot(percent, power)


	# p-curve analysis
	wait('see a p-curve analysis of 347 p-values')

	data <- read.spss('plos.sav')
	pvalues <- data$p
	pvalues <- pvalues[!is.na(pvalues)] # remove NA
	pvalues <- pvalues[pvalues <= 0.05]

	f <- fisher(pvalues)
	info <- sprintf('Heavily right-skewed, with Chi^2(%s) = %.2f, p = %.2f\n', f$df, f$value, f$p)
	cat(info)
	curveplot(pvalues)