Skip to content

Instantly share code, notes, and snippets.

@squito
Created June 17, 2014 14:56
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save squito/1558c80d29fea1360bcd to your computer and use it in GitHub Desktop.
Save squito/1558c80d29fea1360bcd to your computer and use it in GitHub Desktop.
Some hacky R code to explore confidence interval estimation for binomial rates, and the difference between two binomial rates.
alpha = 0.8
z <- 1.28
normal.theta.conf <- function(x, n) {
# normal approximation to a binomial. Generally agreed to be horrible
phat <- x / n
t <- z * sqrt(phat * (1 - phat)) / sqrt(n)
c(max(0,phat - t), min(1, phat + t))
}
cp.theta.conf <- function(x,n) {
# clopper pearson interval. guaranteed coverage, but maybe too broad
binom.test(x,n, conf.level=alpha)$conf.int
}
theta.covered <- function(p, conf.int) {
all(p > conf.int[1], p < conf.int[2])
}
theta.coverage.test <- function(p, n, T, estimators) {
data = rbinom(T,n,p)
coverage = sapply(data, function(x){
sapply(estimators, function(est){
conf.int <- est(x,n)
covered <- theta.covered(p, conf.int)
conf.length <- conf.int[2] - conf.int[1]
c(covered, conf.length)
})
})
r = cbind(data, t(coverage))
colnames(r) = c("x", as.character(sapply(names(estimators), function(x){paste(x, c(".covered", ".length"), sep="")})))
cs = colSums(r)
results = list()
for (i in 1:length(estimators)) {
obs.coverage = cs[i * 2] / T
avg.length = cs[i * 2 + 1] / T
t = c(obs.coverage, avg.length)
names(t) = c("coverage", "avg length")
results[[names(estimators)[i]]] = t
}
results
}
theta.estimators = c(normal.theta.conf, cp.theta.conf)
names(theta.estimators) = c("normal approx", "clopper pearson")
x <- seq(0.01, 0.1, 0.01)
low.p.coverage <- sapply(x, function(x){sapply(theta.coverage.test(x, 40, 1000, theta.estimators), function(y){y["coverage"]})})
plot(x, low.p.coverage[1,],'l', ylim=c(0,1), xlab="true p", ylab="coverage")
lines(x,low.p.coverage[2,], lty=2)
abline(h=0.8, lty=2)
legend("bottomright", c("normal approx", "clopper pearson"), lty=c(1,2))
normal.delta.conf <- function(x_1, n_1, x_2, n_2){
# Estimate theta_1 and theta_2
hat_theta_1 = x_1/n_1
hat_theta_2 = x_2/n_2
# Estimate \delta
hat_delta = hat_theta_1 - hat_theta_2
# Estimate the standard deviation of hat_delta
hat_std_hat_delta = sqrt(hat_theta_1 * (1-hat_theta_1)/n_1 + hat_theta_2 * (1-hat_theta_2)/n_2)
c(hat_delta - z * hat_std_hat_delta, hat_delta + z * hat_std_hat_delta)
}
ag.delta.conf <- function(x1,n1, x2, n2) {
normal.delta.conf(x1 + 1, n1 + 2, x2 + 1, n2 + 2)
}
delta.estimators <- list(normal.delta.conf, ag.delta.conf)
names(delta.estimators) <- c("normal approx", "+1 smoothing")
delta.coverage.test <- function(p1, n1, p2,n2, T, estimators) {
data1 = rbinom(T,n1,p1)
data2 = rbinom(T,n2,p2)
data = cbind(data1,data2)
true.delta = p1 - p2
coverage = sapply(1:T, function(i){
x1 = data[i,1]
x2 = data[i,2]
sapply(estimators, function(est){
conf.int <- est(x1,n1,x2,n2)
covered <- theta.covered(true.delta, conf.int)
conf.length <- conf.int[2] - conf.int[1]
c(covered, conf.length)
})
})
r = cbind(data, t(coverage))
colnames(r) = c("x1","x2", as.character(sapply(names(estimators), function(x){paste(x, c(".covered", ".length"), sep="")})))
cs = colSums(r)
results = list()
for (i in 1:length(estimators)) {
obs.coverage = cs[i * 2 + 1] / T
avg.length = cs[i * 2 + 2] / T
t = c(obs.coverage, avg.length)
names(t) = c("coverage", "avg length")
results[[names(estimators)[i]]] = t
}
results
}
x <- seq(10,100,10)
low.n.delta.coverage <- sapply(x, function(n1) {
sapply(delta.coverage.test(0.02, n1, 0.02, 2249, 1000, delta.estimators),
function(y) {y["coverage"]}
)
})
plot(x, low.n.delta.coverage[1,], 'l', ylim=c(0,1), xlab="n1", ylab="coverage",
main="Conf Interval coverage for p1 - p2",
sub="p1 = p2 = 0.02, n2 = 2249")
lines(x, low.n.delta.coverage[2,], lty=2)
abline(h=0.8, lty=3)
legend("bottomright", names(delta.estimators), lty=c(1,2))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment