bschneidr/wilson-interval-for-complex-surveys.R

## wilson-interval-for-complex-surveys.R
#' @title Wilson's confidence interval for complex survey designs
#' @description Calculate Wilson's confidence interval for a proportion,
#' with the effective sample size determined using a design-unbiased
#' estimate of the complex survey design effect.
#'
#' @param x A formula, vector, or matrix.
#' @param design A survey.design or svyrep.design object
#' @param na.rm Should cases with missing values be dropped?
#' @param level The confidence level required
#'
#' @return An object of class 'svyciprop'.
#' Use \code{as.numeric(coef(result))} to extract the point estimates,
#' and use \code{confint()} to extract the confidence intervals.
#' The variance-covariance matrix, standard errors, and design effects
#' can be extracted using the usual methods.
#' @export
#'
#' @examples
#' library(survey)
#'
#' data(api)
#' dclus1 <- svydesign(id=~dnum, fpc=~fpc, data=apiclus1)
#'
#' estimates <- svy_prop_with_wilson_ci(x = ~ stype,
#'                                      na.rm = TRUE,
#'                                      design = dclus1,
#'                                      level = 0.95)
#' confint(estimates)
#' as.numeric(coef(estimates))
#' SE(estimates)
#' vcov(estimates)
#' deff(estimates)
#'
#' # Example with `svyby()`
#' svyby(~ awards, by = ~ stype, design = dclus1,
#'       FUN = svy_prop_with_wilson_ci, vartype = c("ci", "se"))
#'
svy_prop_with_wilson_ci <- function(x, design, na.rm = FALSE, level = 0.95) {
  # Extract the matrix of variables to use for the estimate
  # (This code is taken directly from the 'svymean()' function)
  # It's complicated because x can be a formula, a vector, a data frame, etc.
  if (inherits(x, "formula")) {
    mf <- model.frame(x, design$variables, na.action = na.pass)
    xx <- lapply(
      attr(terms(x), "variables")[-1],
      function(tt) model.matrix(eval(bquote(~0 + .(tt))), mf)
    )
    cols <- sapply(xx, NCOL)
    x <- matrix(nrow = NROW(xx[[1]]), ncol = sum(cols))
    scols <- c(0, cumsum(cols))
    for (i in 1:length(xx)) {
      x[, scols[i] + 1:cols[i]] <- xx[[i]]
    }
    colnames(x) <- do.call("c", lapply(xx, colnames))
  } else {
    if (typeof(x) %in% c("expression", "symbol"))
      x <- eval(x, design$variables)
    else if (is.data.frame(x) && any(sapply(x, is.factor))) {
      xx <- lapply(x, function(xi) {
        if (is.factor(xi))
          0 + (outer(xi, levels(xi), "=="))
        else xi
      })
      cols <- sapply(xx, NCOL)
      scols <- c(0, cumsum(cols))
      cn <- character(sum(cols))
      for (i in 1:length(xx)) cn[scols[i] + 1:cols[i]] <- paste(names(x)[i],
                                                                levels(x[[i]]), sep = "")
      x <- matrix(nrow = NROW(xx[[1]]), ncol = sum(cols))
      for (i in 1:length(xx)) {
        x[, scols[i] + 1:cols[i]] <- xx[[i]]
      }
      colnames(x) <- cn
    }
  }
  if (na.rm) {
    nas <- rowSums(is.na(x))
    design <- design[nas == 0, ]
    if (length(nas) > length(design$prob))
      x <- x[nas == 0, , drop = FALSE]
    else x[nas > 0, ] <- 0
  }
  # Point estimates, standard errors, and design effects
  estimate <- survey::svymean(x = x, na.rm = TRUE,
                              design = design,
                              deff = TRUE)

  # Determine the sample size used for the estimate
  n_for_estimate <- nrow(x)

  # Obtain design-based estimate of the design effect
  # and the effective sample size
  design_effects <- survey::deff(estimate)
  if (is.matrix(design_effects) && isSymmetric(design_effects)) {
    design_effects <- diag(design_effects)
  }
  n_eff <- n_for_estimate / design_effects

  # Calculate "effective number of successes"
  p_hat <- coef(estimate)
  q_hat <- 1 - p_hat
  n_successes_eff <- p_hat * n_eff

  # Determine z statistic to use
  alpha <- (1 - level)
  half_alpha <- alpha/2
  z <- qnorm(p = half_alpha, lower.tail = FALSE)

  # Calculate confidence interval
  center <- (n_successes_eff + (z^2)/2)/(n_eff + (z^2))
  width <- (z * sqrt(n_eff))/(n_eff + (z^2))
  width <- width * sqrt((p_hat*q_hat) + (z^2)/(4*n_eff))

  lower <- center - width
  upper <- center + width
  ci <- rbind(lower, upper)
  rownames(ci) <- paste(round(c(half_alpha, level + half_alpha) *
                                100, 1), "%", sep = "")
  colnames(ci) <- NULL

  # Wrap up the result as an object of class 'svyciprop'
  # so that S3 methods from the survey package can be used
  # (e.g. print(), confint(), SE(), deff(), etc.)
  result <- p_hat
  attr(result, "var") <- vcov(estimate)
  attr(result, 'deff') <- attr(estimate, 'deff')
  attr(result, 'ci') <- ci
  class(result) <- 'svyciprop'
  return(result)
}
	#' @title Wilson's confidence interval for complex survey designs
	#' @description Calculate Wilson's confidence interval for a proportion,
	#' with the effective sample size determined using a design-unbiased
	#' estimate of the complex survey design effect.
	#'
	#' @param x A formula, vector, or matrix.
	#' @param design A survey.design or svyrep.design object
	#' @param na.rm Should cases with missing values be dropped?
	#' @param level The confidence level required
	#'
	#' @return An object of class 'svyciprop'.
	#' Use \code{as.numeric(coef(result))} to extract the point estimates,
	#' and use \code{confint()} to extract the confidence intervals.
	#' The variance-covariance matrix, standard errors, and design effects
	#' can be extracted using the usual methods.
	#' @export
	#'
	#' @examples
	#' library(survey)
	#'
	#' data(api)
	#' dclus1 <- svydesign(id=~dnum, fpc=~fpc, data=apiclus1)
	#'
	#' estimates <- svy_prop_with_wilson_ci(x = ~ stype,
	#' na.rm = TRUE,
	#' design = dclus1,
	#' level = 0.95)
	#' confint(estimates)
	#' as.numeric(coef(estimates))
	#' SE(estimates)
	#' vcov(estimates)
	#' deff(estimates)
	#'
	#' # Example with `svyby()`
	#' svyby(~ awards, by = ~ stype, design = dclus1,
	#' FUN = svy_prop_with_wilson_ci, vartype = c("ci", "se"))
	#'
	svy_prop_with_wilson_ci <- function(x, design, na.rm = FALSE, level = 0.95) {
	# Extract the matrix of variables to use for the estimate
	# (This code is taken directly from the 'svymean()' function)
	# It's complicated because x can be a formula, a vector, a data frame, etc.
	if (inherits(x, "formula")) {
	mf <- model.frame(x, design$variables, na.action = na.pass)
	xx <- lapply(
	attr(terms(x), "variables")[-1],
	function(tt) model.matrix(eval(bquote(~0 + .(tt))), mf)
	)
	cols <- sapply(xx, NCOL)
	x <- matrix(nrow = NROW(xx[[1]]), ncol = sum(cols))
	scols <- c(0, cumsum(cols))
	for (i in 1:length(xx)) {
	x[, scols[i] + 1:cols[i]] <- xx[[i]]
	}
	colnames(x) <- do.call("c", lapply(xx, colnames))
	} else {
	if (typeof(x) %in% c("expression", "symbol"))
	x <- eval(x, design$variables)
	else if (is.data.frame(x) && any(sapply(x, is.factor))) {
	xx <- lapply(x, function(xi) {
	if (is.factor(xi))
	0 + (outer(xi, levels(xi), "=="))
	else xi
	})
	cols <- sapply(xx, NCOL)
	scols <- c(0, cumsum(cols))
	cn <- character(sum(cols))
	for (i in 1:length(xx)) cn[scols[i] + 1:cols[i]] <- paste(names(x)[i],
	levels(x[[i]]), sep = "")
	x <- matrix(nrow = NROW(xx[[1]]), ncol = sum(cols))
	for (i in 1:length(xx)) {
	x[, scols[i] + 1:cols[i]] <- xx[[i]]
	}
	colnames(x) <- cn
	}
	}
	if (na.rm) {
	nas <- rowSums(is.na(x))
	design <- design[nas == 0, ]
	if (length(nas) > length(design$prob))
	x <- x[nas == 0, , drop = FALSE]
	else x[nas > 0, ] <- 0
	}
	# Point estimates, standard errors, and design effects
	estimate <- survey::svymean(x = x, na.rm = TRUE,
	design = design,
	deff = TRUE)

	# Determine the sample size used for the estimate
	n_for_estimate <- nrow(x)

	# Obtain design-based estimate of the design effect
	# and the effective sample size
	design_effects <- survey::deff(estimate)
	if (is.matrix(design_effects) && isSymmetric(design_effects)) {
	design_effects <- diag(design_effects)
	}
	n_eff <- n_for_estimate / design_effects

	# Calculate "effective number of successes"
	p_hat <- coef(estimate)
	q_hat <- 1 - p_hat
	n_successes_eff <- p_hat * n_eff

	# Determine z statistic to use
	alpha <- (1 - level)
	half_alpha <- alpha/2
	z <- qnorm(p = half_alpha, lower.tail = FALSE)

	# Calculate confidence interval
	center <- (n_successes_eff + (z^2)/2)/(n_eff + (z^2))
	width <- (z * sqrt(n_eff))/(n_eff + (z^2))
	width <- width * sqrt((p_hatq_hat) + (z^2)/(4n_eff))

	lower <- center - width
	upper <- center + width
	ci <- rbind(lower, upper)
	rownames(ci) <- paste(round(c(half_alpha, level + half_alpha) *
	100, 1), "%", sep = "")
	colnames(ci) <- NULL

	# Wrap up the result as an object of class 'svyciprop'
	# so that S3 methods from the survey package can be used
	# (e.g. print(), confint(), SE(), deff(), etc.)
	result <- p_hat
	attr(result, "var") <- vcov(estimate)
	attr(result, 'deff') <- attr(estimate, 'deff')
	attr(result, 'ci') <- ci
	class(result) <- 'svyciprop'
	return(result)
	}