Sleepingwell/gist:5076345

## gistfile1.r
pre <- scan()
51.11
64.44
73.33
77.78
66.67
84.44
68.89
57.78
64.44
60.00
64.44
88.89
75.56
66.67
86.67
77.78
75.56
91.11

post <- scan()
82.22
84.44
53.33
77.78
77.78
91.11
68.89
62.22
64.44
42.22
60.00
80.00
77.78
68.89
80.00
80.00
91.11
95.56

# function that calculates the bootstrap, the Mann-Whitney-Wilcoxon, permutation and T tests
# args:
#   pre:   The pre-test scores
#   post:  The post-test scores
#   R:     The number of replicates for the bootstrap and permutation tests
#
# Note that there are many MANY possible refinements of the bootstrap we could use which
# may be more powerful that the ones used here. I have just used a couple to demonstrate
# the ideas.
#
# The other approach to using the bootstrap for hypothesis testing is to calculate a
# confidence interval for the test statistic and see whether it contains zero, which I have
# not pursued here as I don't find it as appealing (and because of the duality between
# CIs and confidence intervals, the results are usually very similar - if not identical - for
# some method of constructing an interval).
demo.all.methods <- function(pre, post, R) {
    # bootstrap resamples
    stat <- mean(post) - mean(pre)
    post.under.null <- post - stat
    boots <- sapply(1:R, function(...) mean(sample(post.under.null, replace=T)) - mean(sample(pre, replace=T)))

    # permutations
    all.res <- c(pre, post)
    perm.sampler <- function(..., replace) {
        perm <- sample(all.res, replace=replace)
        mean(rev(perm)[1:length(post)]) - mean(perm[1:length(pre)])
    }

    perms <- sapply(1:R, perm.sampler, replace=F)
    perm.boots <- sapply(1:R, perm.sampler, replace=T)

    # data for the Wilcox and T tests.
    dat <- cbind(res=c(pre, post), group=rep(0:1, c(length(pre), length(post))))

    c(
        bootstrap=sum(boots-stat > 0) / R, # bootstrap assuming different pre and post distributions, sampling under the null
        bootstrap2=sum((boots+stat) <= 0) / R, # bootstrap assuming different pre and post distributions, sampling directly from each
        permutation=sum(perms > stat) / R, # a permutation test
        bootstrap.permutation=sum(perm.boots > stat) / R, # a permutation test using the bootstrap - imples the same pre and post distributions under the null
        wilcox=wilcox.test(res~group, data=dat, alternative="less")$p.value, # Mann–Whitney–Wilcoxon
        t.test.not.equal.var=t.test(res~group, data=dat, alternative="less")$p.value, # T-test (unequal variances)
        t.test.equal.var=t.test(res~group, data=dat, alternative="less", var.equal=T)$p.value # T-test (equal variances)
    )
}


# function for comparing the power of the various methods for the given data. Works by setting the mean
# of the post-test scores equal to the mean of the pre-test scores and using the bootstrap to determine
# the proportion of times that a difference is detected, then progressively increasing the mean of the
# post-test scores and repeating.
# args:
#   pre:     The pre-test scores.
#   post:    The post-test scores.
#   difs:    Vector of proportions of the mean of the pre-test score to add to the post-test scores.
#   alpha:   The one-sided significance level of the test.
#   R:       The nubmer of boostrap replications to use in for the bootstrap and permutation tests.
#   R.tests: The number of boostrap replications to use in for the power the calculations.
power.test <- function(pre, post, difs, alpha, R, R.power) {
    mean.pre <- mean(pre)
    mean.dif <- mean(post)-mean.pre
    post <- post-mean.dif # set the means to be equal
    tmp <- as.data.frame(sapply(difs, function(dif) {
        post <- post + dif*mean.pre
        reps <- t(sapply(1:R.power, function(...) demo.all.methods(sample(pre, replace=T), sample(post, replace=T), R)))
        apply(reps<alpha, 2, sum) / R.power
    }))
    names(tmp) <- paste('p', difs, sep='')
    tmp
}


# number of replicates to use in evaluation tests
R.power <- 1000
# number of replicates to use in the bootstrap and permutation methods.
R <- 1000
# significance level to use in significance tests in power calculations
alpha <- 0.05
# proportions of pre-test means to add to post-test data in power calculations
difs <- c(seq(0, 0.1, 0.01), seq(0.15, 0.25, 0.025))
# hopefully we'll all get the same results.
set.seed(42)
# wilcox is noisy because we have replicates - tell it to shut up
options(warn=-1)

res <- demo.all.methods(pre, post, R)
powers <- power.test(pre, post, difs, alpha, R, R.power)

cat("p-values:\n")
print(res)

cat("bootstrap power calculations\n")
print(powers)

#-----------------------------------------------------------------------------------------------------------------------
# Results
#-----------------------------------------------------------------------------------------------------------------------
#p-values:
#bootstrap             0.3160000
#bootstrap2            0.2680000
#permutation           0.2770000
#bootstrap.permutation 0.2700000
#wilcox                0.1871320
#t.test.not.equal.var  0.2907173
#t.test.equal.var      0.2906348


#Power calculations (using bootstrap):
#post-mean % of pre-mean    100   101   102   103   104   105   106   107   108   109   110   115   117.5  120    122.5 125
#bootstrap                  0.062 0.091 0.122 0.158 0.223 0.268 0.292 0.383 0.448 0.493 0.597 0.836 0.922  0.974  0.991 0.992
#bootstrap2                 0.059 0.089 0.114 0.150 0.214 0.253 0.280 0.375 0.421 0.485 0.588 0.833 0.916  0.976  0.990 0.993
#permutation                0.051 0.077 0.100 0.139 0.203 0.237 0.259 0.353 0.406 0.460 0.553 0.807 0.902  0.966  0.985 0.990
#bootstrap.permutation      0.054 0.087 0.111 0.143 0.205 0.238 0.269 0.365 0.415 0.466 0.565 0.825 0.905  0.966  0.984 0.991
#wilcox                     0.053 0.158 0.157 0.163 0.331 0.329 0.284 0.391 0.395 0.378 0.582 0.773 0.894  0.970  0.988 0.997
#t.test.not.equal.var       0.052 0.080 0.106 0.137 0.204 0.235 0.258 0.355 0.410 0.457 0.554 0.810 0.907  0.965  0.986 0.991
#t.test.equal.var           0.052 0.080 0.106 0.138 0.204 0.235 0.258 0.355 0.412 0.459 0.556 0.811 0.907  0.966  0.986 0.991
	pre <- scan()
	51.11
	64.44
	73.33
	77.78
	66.67
	84.44
	68.89
	57.78
	64.44
	60.00
	64.44
	88.89
	75.56
	66.67
	86.67
	77.78
	75.56
	91.11

	post <- scan()
	82.22
	84.44
	53.33
	77.78
	77.78
	91.11
	68.89
	62.22
	64.44
	42.22
	60.00
	80.00
	77.78
	68.89
	80.00
	80.00
	91.11
	95.56

	# function that calculates the bootstrap, the Mann-Whitney-Wilcoxon, permutation and T tests
	# args:
	# pre: The pre-test scores
	# post: The post-test scores
	# R: The number of replicates for the bootstrap and permutation tests
	#
	# Note that there are many MANY possible refinements of the bootstrap we could use which
	# may be more powerful that the ones used here. I have just used a couple to demonstrate
	# the ideas.
	#
	# The other approach to using the bootstrap for hypothesis testing is to calculate a
	# confidence interval for the test statistic and see whether it contains zero, which I have
	# not pursued here as I don't find it as appealing (and because of the duality between
	# CIs and confidence intervals, the results are usually very similar - if not identical - for
	# some method of constructing an interval).
	demo.all.methods <- function(pre, post, R) {
	# bootstrap resamples
	stat <- mean(post) - mean(pre)
	post.under.null <- post - stat
	boots <- sapply(1:R, function(...) mean(sample(post.under.null, replace=T)) - mean(sample(pre, replace=T)))

	# permutations
	all.res <- c(pre, post)
	perm.sampler <- function(..., replace) {
	perm <- sample(all.res, replace=replace)
	mean(rev(perm)[1:length(post)]) - mean(perm[1:length(pre)])
	}

	perms <- sapply(1:R, perm.sampler, replace=F)
	perm.boots <- sapply(1:R, perm.sampler, replace=T)

	# data for the Wilcox and T tests.
	dat <- cbind(res=c(pre, post), group=rep(0:1, c(length(pre), length(post))))

	c(
	bootstrap=sum(boots-stat > 0) / R, # bootstrap assuming different pre and post distributions, sampling under the null
	bootstrap2=sum((boots+stat) <= 0) / R, # bootstrap assuming different pre and post distributions, sampling directly from each
	permutation=sum(perms > stat) / R, # a permutation test
	bootstrap.permutation=sum(perm.boots > stat) / R, # a permutation test using the bootstrap - imples the same pre and post distributions under the null
	wilcox=wilcox.test(res~group, data=dat, alternative="less")$p.value, # Mann–Whitney–Wilcoxon
	t.test.not.equal.var=t.test(res~group, data=dat, alternative="less")$p.value, # T-test (unequal variances)
	t.test.equal.var=t.test(res~group, data=dat, alternative="less", var.equal=T)$p.value # T-test (equal variances)
	)
	}



	# function for comparing the power of the various methods for the given data. Works by setting the mean
	# of the post-test scores equal to the mean of the pre-test scores and using the bootstrap to determine
	# the proportion of times that a difference is detected, then progressively increasing the mean of the
	# post-test scores and repeating.
	# args:
	# pre: The pre-test scores.
	# post: The post-test scores.
	# difs: Vector of proportions of the mean of the pre-test score to add to the post-test scores.
	# alpha: The one-sided significance level of the test.
	# R: The nubmer of boostrap replications to use in for the bootstrap and permutation tests.
	# R.tests: The number of boostrap replications to use in for the power the calculations.
	power.test <- function(pre, post, difs, alpha, R, R.power) {
	mean.pre <- mean(pre)
	mean.dif <- mean(post)-mean.pre
	post <- post-mean.dif # set the means to be equal
	tmp <- as.data.frame(sapply(difs, function(dif) {
	post <- post + dif*mean.pre
	reps <- t(sapply(1:R.power, function(...) demo.all.methods(sample(pre, replace=T), sample(post, replace=T), R)))
	apply(reps<alpha, 2, sum) / R.power
	}))
	names(tmp) <- paste('p', difs, sep='')
	tmp
	}



	# number of replicates to use in evaluation tests
	R.power <- 1000
	# number of replicates to use in the bootstrap and permutation methods.
	R <- 1000
	# significance level to use in significance tests in power calculations
	alpha <- 0.05
	# proportions of pre-test means to add to post-test data in power calculations
	difs <- c(seq(0, 0.1, 0.01), seq(0.15, 0.25, 0.025))
	# hopefully we'll all get the same results.
	set.seed(42)
	# wilcox is noisy because we have replicates - tell it to shut up
	options(warn=-1)

	res <- demo.all.methods(pre, post, R)
	powers <- power.test(pre, post, difs, alpha, R, R.power)

	cat("p-values:\n")
	print(res)

	cat("bootstrap power calculations\n")
	print(powers)

	#-----------------------------------------------------------------------------------------------------------------------
	# Results
	#-----------------------------------------------------------------------------------------------------------------------
	#p-values:
	#bootstrap 0.3160000
	#bootstrap2 0.2680000
	#permutation 0.2770000
	#bootstrap.permutation 0.2700000
	#wilcox 0.1871320
	#t.test.not.equal.var 0.2907173
	#t.test.equal.var 0.2906348



	#Power calculations (using bootstrap):
	#post-mean % of pre-mean 100 101 102 103 104 105 106 107 108 109 110 115 117.5 120 122.5 125
	#bootstrap 0.062 0.091 0.122 0.158 0.223 0.268 0.292 0.383 0.448 0.493 0.597 0.836 0.922 0.974 0.991 0.992
	#bootstrap2 0.059 0.089 0.114 0.150 0.214 0.253 0.280 0.375 0.421 0.485 0.588 0.833 0.916 0.976 0.990 0.993
	#permutation 0.051 0.077 0.100 0.139 0.203 0.237 0.259 0.353 0.406 0.460 0.553 0.807 0.902 0.966 0.985 0.990
	#bootstrap.permutation 0.054 0.087 0.111 0.143 0.205 0.238 0.269 0.365 0.415 0.466 0.565 0.825 0.905 0.966 0.984 0.991
	#wilcox 0.053 0.158 0.157 0.163 0.331 0.329 0.284 0.391 0.395 0.378 0.582 0.773 0.894 0.970 0.988 0.997
	#t.test.not.equal.var 0.052 0.080 0.106 0.137 0.204 0.235 0.258 0.355 0.410 0.457 0.554 0.810 0.907 0.965 0.986 0.991
	#t.test.equal.var 0.052 0.080 0.106 0.138 0.204 0.235 0.258 0.355 0.412 0.459 0.556 0.811 0.907 0.966 0.986 0.991