adelavega/Boostrapping and Permuation testing

## Boostrapping and Permuation testing
##############################################################
##### Bootstrap of group difference                      #####
##############################################################

### First, we generate two samples (N = 25) with a difference of 0.5
### Second, test using parametric t.test
### Then we define a function to test the difference between the two samples
### using indices that boot will provide

## Then we run boot. Importantly, define the strata of your group labels
## so boot w/ resample appropriately (only within group w/ replacement)

## Then we run it w/ only 2 resamples to investigate (turn print indices on)

## Then we do it for real w/ 10000 resamples, investigate the bias of our staticis
## get a 95% confidence interval, and plot the density of our bootstrapped statistic

## If our bootstrapped difference does not include zero, then its significant

library(boot)

# make up data
dat <- stack(list(a = rnorm(25, mean=10), b = rnorm(25, mean=10.5)))
t.test(values ~ ind, data=dat, var.equal=T)

# define bootstrap statistic
# comment out print statements before bootstrapping
bootTwoSampleDiff <- function(data, indices){
  newdata <- data[indices,]
  # print(indices)
  return(diff = mean(subset(dat, ind==levels(dat$ind)[1])$value) - mean(subset(dat, ind==levels(dat$ind)[2])$value))
}

# examine indices
boot(data=dat, statistic=bootTwoSampleDiff, R=2,  strata=dat$ind)

# bootstrap!
resamples <- 10000
results1 <- boot(data=dat, statistic=bootTwoSampleDiff, R=resamples, strata=strata=dat$ind)
results1
boot.ci(results1)
plot(density(results1$t))

##############################################################
##### Permutation test                                   #####
##############################################################

### Here instead of bootstrapping the statistics we do a permutation test
### to generate the distribution of the null hypothesis
### Similar to above but instead of will compare our real estimate
### to the null distribution we generated to see what the probability
### is that our estimate came from the null (two-tailed p-value)

str(dat)

# define permutation test statistic
# comment out print statements before permutating
permDiff <- function(data, indices){
#   print(indices)
#   print(length(unique(indices)))
  indices <- factor(indices <= 25, labels=c("b","a"))
  newdata <- cbind.data.frame(indices, values=data$values)
  return(diff = mean(newdata$values[newdata$indices=="b"]) - mean(newdata$values[newdata$indices=="a"]))
}

# examine indices
boot(data=dat, statistic=permDiff, R=2, sim="permutation")

# permutate!
resamples <- 10000
results2 <- boot(data=dat, statistic=permDiff, R=resamples, sim="permutation")
plot(density(results2$t))

# calculate real estimate
est <- mean(dat$values[dat$ind=="b"]) - mean(dat$values[dat$ind=="a"])

# what is the probability our estimate came from the null distribution?
# two-tailed p-value
p.value <- mean(results2$t > est) + mean(results2$t < -1*est)
p.value
	##############################################################
	##### Bootstrap of group difference #####
	##############################################################

	### First, we generate two samples (N = 25) with a difference of 0.5
	### Second, test using parametric t.test
	### Then we define a function to test the difference between the two samples
	### using indices that boot will provide

	## Then we run boot. Importantly, define the strata of your group labels
	## so boot w/ resample appropriately (only within group w/ replacement)

	## Then we run it w/ only 2 resamples to investigate (turn print indices on)

	## Then we do it for real w/ 10000 resamples, investigate the bias of our staticis
	## get a 95% confidence interval, and plot the density of our bootstrapped statistic

	## If our bootstrapped difference does not include zero, then its significant

	library(boot)

	# make up data
	dat <- stack(list(a = rnorm(25, mean=10), b = rnorm(25, mean=10.5)))
	t.test(values ~ ind, data=dat, var.equal=T)

	# define bootstrap statistic
	# comment out print statements before bootstrapping
	bootTwoSampleDiff <- function(data, indices){
	newdata <- data[indices,]
	# print(indices)
	return(diff = mean(subset(dat, ind==levels(dat$ind)[1])$value) - mean(subset(dat, ind==levels(dat$ind)[2])$value))
	}

	# examine indices
	boot(data=dat, statistic=bootTwoSampleDiff, R=2, strata=dat$ind)

	# bootstrap!
	resamples <- 10000
	results1 <- boot(data=dat, statistic=bootTwoSampleDiff, R=resamples, strata=strata=dat$ind)
	results1
	boot.ci(results1)
	plot(density(results1$t))

	##############################################################
	##### Permutation test #####
	##############################################################

	### Here instead of bootstrapping the statistics we do a permutation test
	### to generate the distribution of the null hypothesis
	### Similar to above but instead of will compare our real estimate
	### to the null distribution we generated to see what the probability
	### is that our estimate came from the null (two-tailed p-value)

	str(dat)

	# define permutation test statistic
	# comment out print statements before permutating
	permDiff <- function(data, indices){
	# print(indices)
	# print(length(unique(indices)))
	indices <- factor(indices <= 25, labels=c("b","a"))
	newdata <- cbind.data.frame(indices, values=data$values)
	return(diff = mean(newdata$values[newdata$indices=="b"]) - mean(newdata$values[newdata$indices=="a"]))
	}

	# examine indices
	boot(data=dat, statistic=permDiff, R=2, sim="permutation")

	# permutate!
	resamples <- 10000
	results2 <- boot(data=dat, statistic=permDiff, R=resamples, sim="permutation")
	plot(density(results2$t))

	# calculate real estimate
	est <- mean(dat$values[dat$ind=="b"]) - mean(dat$values[dat$ind=="a"])

	# what is the probability our estimate came from the null distribution?
	# two-tailed p-value
	p.value <- mean(results2$t > est) + mean(results2$t < -1*est)
	p.value