Skip to content

Instantly share code, notes, and snippets.

@bschneidr
Last active October 12, 2022 14:30
Show Gist options
  • Save bschneidr/6dd967643c9a829ec59e09a2954f920a to your computer and use it in GitHub Desktop.
Save bschneidr/6dd967643c9a829ec59e09a2954f920a to your computer and use it in GitHub Desktop.
Wilson's confidence interval for complex surveys
#' @title Wilson's confidence interval for complex survey designs
#' @description Calculate Wilson's confidence interval for a proportion,
#' with the effective sample size determined using a design-unbiased
#' estimate of the complex survey design effect.
#'
#' @param x A formula, vector, or matrix.
#' @param design A survey.design or svyrep.design object
#' @param na.rm Should cases with missing values be dropped?
#' @param level The confidence level required
#'
#' @return An object of class 'svyciprop'.
#' Use \code{as.numeric(coef(result))} to extract the point estimates,
#' and use \code{confint()} to extract the confidence intervals.
#' The variance-covariance matrix, standard errors, and design effects
#' can be extracted using the usual methods.
#' @export
#'
#' @examples
#' library(survey)
#'
#' data(api)
#' dclus1 <- svydesign(id=~dnum, fpc=~fpc, data=apiclus1)
#'
#' estimates <- svy_prop_with_wilson_ci(x = ~ stype,
#' na.rm = TRUE,
#' design = dclus1,
#' level = 0.95)
#' confint(estimates)
#' as.numeric(coef(estimates))
#' SE(estimates)
#' vcov(estimates)
#' deff(estimates)
#'
#' # Example with `svyby()`
#' svyby(~ awards, by = ~ stype, design = dclus1,
#' FUN = svy_prop_with_wilson_ci, vartype = c("ci", "se"))
#'
svy_prop_with_wilson_ci <- function(x, design, na.rm = FALSE, level = 0.95) {
# Extract the matrix of variables to use for the estimate
# (This code is taken directly from the 'svymean()' function)
# It's complicated because x can be a formula, a vector, a data frame, etc.
if (inherits(x, "formula")) {
mf <- model.frame(x, design$variables, na.action = na.pass)
xx <- lapply(
attr(terms(x), "variables")[-1],
function(tt) model.matrix(eval(bquote(~0 + .(tt))), mf)
)
cols <- sapply(xx, NCOL)
x <- matrix(nrow = NROW(xx[[1]]), ncol = sum(cols))
scols <- c(0, cumsum(cols))
for (i in 1:length(xx)) {
x[, scols[i] + 1:cols[i]] <- xx[[i]]
}
colnames(x) <- do.call("c", lapply(xx, colnames))
} else {
if (typeof(x) %in% c("expression", "symbol"))
x <- eval(x, design$variables)
else if (is.data.frame(x) && any(sapply(x, is.factor))) {
xx <- lapply(x, function(xi) {
if (is.factor(xi))
0 + (outer(xi, levels(xi), "=="))
else xi
})
cols <- sapply(xx, NCOL)
scols <- c(0, cumsum(cols))
cn <- character(sum(cols))
for (i in 1:length(xx)) cn[scols[i] + 1:cols[i]] <- paste(names(x)[i],
levels(x[[i]]), sep = "")
x <- matrix(nrow = NROW(xx[[1]]), ncol = sum(cols))
for (i in 1:length(xx)) {
x[, scols[i] + 1:cols[i]] <- xx[[i]]
}
colnames(x) <- cn
}
}
if (na.rm) {
nas <- rowSums(is.na(x))
design <- design[nas == 0, ]
if (length(nas) > length(design$prob))
x <- x[nas == 0, , drop = FALSE]
else x[nas > 0, ] <- 0
}
# Point estimates, standard errors, and design effects
estimate <- survey::svymean(x = x, na.rm = TRUE,
design = design,
deff = TRUE)
# Determine the sample size used for the estimate
n_for_estimate <- nrow(x)
# Obtain design-based estimate of the design effect
# and the effective sample size
design_effects <- survey::deff(estimate)
if (is.matrix(design_effects) && isSymmetric(design_effects)) {
design_effects <- diag(design_effects)
}
n_eff <- n_for_estimate / design_effects
# Calculate "effective number of successes"
p_hat <- coef(estimate)
q_hat <- 1 - p_hat
n_successes_eff <- p_hat * n_eff
# Determine z statistic to use
alpha <- (1 - level)
half_alpha <- alpha/2
z <- qnorm(p = half_alpha, lower.tail = FALSE)
# Calculate confidence interval
center <- (n_successes_eff + (z^2)/2)/(n_eff + (z^2))
width <- (z * sqrt(n_eff))/(n_eff + (z^2))
width <- width * sqrt((p_hat*q_hat) + (z^2)/(4*n_eff))
lower <- center - width
upper <- center + width
ci <- rbind(lower, upper)
rownames(ci) <- paste(round(c(half_alpha, level + half_alpha) *
100, 1), "%", sep = "")
colnames(ci) <- NULL
# Wrap up the result as an object of class 'svyciprop'
# so that S3 methods from the survey package can be used
# (e.g. print(), confint(), SE(), deff(), etc.)
result <- p_hat
attr(result, "var") <- vcov(estimate)
attr(result, 'deff') <- attr(estimate, 'deff')
attr(result, 'ci') <- ci
class(result) <- 'svyciprop'
return(result)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment