squito/binomial_confidence.R

## binomial_confidence.R

alpha = 0.8
z <- 1.28

normal.theta.conf <- function(x, n) {
  # normal approximation to a binomial.  Generally agreed to be horrible
  phat <- x / n
  t <- z * sqrt(phat * (1 - phat)) / sqrt(n)
  c(max(0,phat - t), min(1, phat + t))
}

cp.theta.conf <- function(x,n) {
  # clopper pearson interval.  guaranteed coverage, but maybe too broad
  binom.test(x,n, conf.level=alpha)$conf.int
}

theta.covered <- function(p, conf.int) {
  all(p > conf.int[1], p < conf.int[2])
}

theta.coverage.test <- function(p, n, T, estimators) {
  data = rbinom(T,n,p)
  coverage = sapply(data, function(x){
    sapply(estimators, function(est){
      conf.int <- est(x,n)
      covered <- theta.covered(p, conf.int)
      conf.length <- conf.int[2] - conf.int[1]
      c(covered, conf.length)
    })
  })
  r = cbind(data, t(coverage))
  colnames(r) = c("x", as.character(sapply(names(estimators), function(x){paste(x, c(".covered", ".length"), sep="")})))


  cs = colSums(r)

  results = list()
  for (i in 1:length(estimators)) {
     obs.coverage = cs[i * 2] / T
     avg.length = cs[i * 2 + 1] / T
     t = c(obs.coverage, avg.length)
     names(t) = c("coverage", "avg length")
     results[[names(estimators)[i]]] = t
  }
  results
}


theta.estimators = c(normal.theta.conf, cp.theta.conf)
names(theta.estimators) = c("normal approx", "clopper pearson")


x <- seq(0.01, 0.1, 0.01)
low.p.coverage <- sapply(x, function(x){sapply(theta.coverage.test(x, 40, 1000, theta.estimators), function(y){y["coverage"]})})

plot(x, low.p.coverage[1,],'l', ylim=c(0,1), xlab="true p", ylab="coverage")
lines(x,low.p.coverage[2,], lty=2)
abline(h=0.8, lty=2)
legend("bottomright", c("normal approx", "clopper pearson"), lty=c(1,2))


normal.delta.conf <- function(x_1, n_1, x_2, n_2){
  # Estimate theta_1 and theta_2
  hat_theta_1 = x_1/n_1
  hat_theta_2 = x_2/n_2

  # Estimate \delta
  hat_delta = hat_theta_1 - hat_theta_2

  # Estimate the standard deviation of hat_delta
  hat_std_hat_delta = sqrt(hat_theta_1 * (1-hat_theta_1)/n_1 + hat_theta_2 * (1-hat_theta_2)/n_2)
  c(hat_delta - z * hat_std_hat_delta, hat_delta + z * hat_std_hat_delta)
}


ag.delta.conf <- function(x1,n1, x2, n2) {
  normal.delta.conf(x1 + 1, n1 + 2, x2 + 1, n2 + 2)
}

delta.estimators <- list(normal.delta.conf, ag.delta.conf)
names(delta.estimators) <- c("normal approx", "+1 smoothing")


delta.coverage.test <- function(p1, n1, p2,n2, T, estimators) {
  data1 = rbinom(T,n1,p1)
  data2 = rbinom(T,n2,p2)
  data = cbind(data1,data2)
  true.delta = p1 - p2
  coverage = sapply(1:T, function(i){
    x1 = data[i,1]
    x2 = data[i,2]
    sapply(estimators, function(est){
      conf.int <- est(x1,n1,x2,n2)
      covered <- theta.covered(true.delta, conf.int)
      conf.length <- conf.int[2] - conf.int[1]
      c(covered, conf.length)
    })
  })
  r = cbind(data, t(coverage))
  colnames(r) = c("x1","x2", as.character(sapply(names(estimators), function(x){paste(x, c(".covered", ".length"), sep="")})))


  cs = colSums(r)

  results = list()
  for (i in 1:length(estimators)) {
    obs.coverage = cs[i * 2 + 1] / T
    avg.length = cs[i * 2 + 2] / T
    t = c(obs.coverage, avg.length)
    names(t) = c("coverage", "avg length")
    results[[names(estimators)[i]]] = t
  }
  results
}


x <- seq(10,100,10)
low.n.delta.coverage <- sapply(x, function(n1) {
  sapply(delta.coverage.test(0.02, n1, 0.02, 2249, 1000, delta.estimators),
         function(y) {y["coverage"]}
  )
})


plot(x, low.n.delta.coverage[1,], 'l', ylim=c(0,1), xlab="n1", ylab="coverage",
     main="Conf Interval coverage for p1 - p2",
     sub="p1 = p2 = 0.02, n2 = 2249")
lines(x, low.n.delta.coverage[2,], lty=2)
abline(h=0.8, lty=3)
legend("bottomright", names(delta.estimators), lty=c(1,2))

	alpha = 0.8
	z <- 1.28

	normal.theta.conf <- function(x, n) {
	# normal approximation to a binomial. Generally agreed to be horrible
	phat <- x / n
	t <- z * sqrt(phat * (1 - phat)) / sqrt(n)
	c(max(0,phat - t), min(1, phat + t))
	}

	cp.theta.conf <- function(x,n) {
	# clopper pearson interval. guaranteed coverage, but maybe too broad
	binom.test(x,n, conf.level=alpha)$conf.int
	}

	theta.covered <- function(p, conf.int) {
	all(p > conf.int[1], p < conf.int[2])
	}

	theta.coverage.test <- function(p, n, T, estimators) {
	data = rbinom(T,n,p)
	coverage = sapply(data, function(x){
	sapply(estimators, function(est){
	conf.int <- est(x,n)
	covered <- theta.covered(p, conf.int)
	conf.length <- conf.int[2] - conf.int[1]
	c(covered, conf.length)
	})
	})
	r = cbind(data, t(coverage))
	colnames(r) = c("x", as.character(sapply(names(estimators), function(x){paste(x, c(".covered", ".length"), sep="")})))


	cs = colSums(r)

	results = list()
	for (i in 1:length(estimators)) {
	obs.coverage = cs[i * 2] / T
	avg.length = cs[i * 2 + 1] / T
	t = c(obs.coverage, avg.length)
	names(t) = c("coverage", "avg length")
	results[[names(estimators)[i]]] = t
	}
	results
	}


	theta.estimators = c(normal.theta.conf, cp.theta.conf)
	names(theta.estimators) = c("normal approx", "clopper pearson")





	x <- seq(0.01, 0.1, 0.01)
	low.p.coverage <- sapply(x, function(x){sapply(theta.coverage.test(x, 40, 1000, theta.estimators), function(y){y["coverage"]})})

	plot(x, low.p.coverage[1,],'l', ylim=c(0,1), xlab="true p", ylab="coverage")
	lines(x,low.p.coverage[2,], lty=2)
	abline(h=0.8, lty=2)
	legend("bottomright", c("normal approx", "clopper pearson"), lty=c(1,2))






	normal.delta.conf <- function(x_1, n_1, x_2, n_2){
	# Estimate theta_1 and theta_2
	hat_theta_1 = x_1/n_1
	hat_theta_2 = x_2/n_2

	# Estimate \delta
	hat_delta = hat_theta_1 - hat_theta_2

	# Estimate the standard deviation of hat_delta
	hat_std_hat_delta = sqrt(hat_theta_1 * (1-hat_theta_1)/n_1 + hat_theta_2 * (1-hat_theta_2)/n_2)
	c(hat_delta - z * hat_std_hat_delta, hat_delta + z * hat_std_hat_delta)
	}


	ag.delta.conf <- function(x1,n1, x2, n2) {
	normal.delta.conf(x1 + 1, n1 + 2, x2 + 1, n2 + 2)
	}

	delta.estimators <- list(normal.delta.conf, ag.delta.conf)
	names(delta.estimators) <- c("normal approx", "+1 smoothing")


	delta.coverage.test <- function(p1, n1, p2,n2, T, estimators) {
	data1 = rbinom(T,n1,p1)
	data2 = rbinom(T,n2,p2)
	data = cbind(data1,data2)
	true.delta = p1 - p2
	coverage = sapply(1:T, function(i){
	x1 = data[i,1]
	x2 = data[i,2]
	sapply(estimators, function(est){
	conf.int <- est(x1,n1,x2,n2)
	covered <- theta.covered(true.delta, conf.int)
	conf.length <- conf.int[2] - conf.int[1]
	c(covered, conf.length)
	})
	})
	r = cbind(data, t(coverage))
	colnames(r) = c("x1","x2", as.character(sapply(names(estimators), function(x){paste(x, c(".covered", ".length"), sep="")})))


	cs = colSums(r)

	results = list()
	for (i in 1:length(estimators)) {
	obs.coverage = cs[i * 2 + 1] / T
	avg.length = cs[i * 2 + 2] / T
	t = c(obs.coverage, avg.length)
	names(t) = c("coverage", "avg length")
	results[[names(estimators)[i]]] = t
	}
	results
	}



	x <- seq(10,100,10)
	low.n.delta.coverage <- sapply(x, function(n1) {
	sapply(delta.coverage.test(0.02, n1, 0.02, 2249, 1000, delta.estimators),
	function(y) {y["coverage"]}
	)
	})


	plot(x, low.n.delta.coverage[1,], 'l', ylim=c(0,1), xlab="n1", ylab="coverage",
	main="Conf Interval coverage for p1 - p2",
	sub="p1 = p2 = 0.02, n2 = 2249")
	lines(x, low.n.delta.coverage[2,], lty=2)
	abline(h=0.8, lty=3)
	legend("bottomright", names(delta.estimators), lty=c(1,2))