Skip to content

Instantly share code, notes, and snippets.

@danlewer
Last active January 29, 2023 09:10
Show Gist options
  • Save danlewer/0fddbae2bade5d9be6af3620591c84ca to your computer and use it in GitHub Desktop.
Save danlewer/0fddbae2bade5d9be6af3620591c84ca to your computer and use it in GitHub Desktop.
Estimation of confidence intervals for directly standardised proportions / prevalences
#-------------
# example data
#-------------
actual <- expand.grid(age = 1:3, sex = c('m', 'f')) # age groups 1:3
actual$n <- c(103, 313, 584, 606, 293, 101)
actual$true_prob <- c(0.1, 0.2, 0.4, 0.15, 0.25, 0.45)
dat <- actual
dat$sample_success <- rbinom(nrow(dat), dat$n, dat$true_prob) # study sample
dat$sample_prev <- dat$sample_success / dat$n
#------------------------------------------
# manual calculation of adjusted prevalence
#------------------------------------------
refpop <- aggregate(n ~ age, dat, sum)$n # reference population (whole sample)
x1 <- dat[dat$sex == 'm',]
x2 <- dat[dat$sex == 'f',]
sum(x1$sample_prev * refpop) / sum(refpop) # male
sum(x2$sample_prev * refpop) / sum(refpop) # female
#---------------------------------------------------------
# Method 1: Monte Carlo estimation of confidence intervals
#---------------------------------------------------------
mc <- function(x, refpop, N = 100000, point_estimate = mean, level = 0.95) {
a <- rbinom(nrow(x) * N, x$n, x$sample_prev)
a <- matrix(a, ncol = N) # randomly generated from binomial distribution
a <- a / x$n # prevalence
a <- a * refpop # number expected in refpop
a <- colSums(a) / sum(refpop) # prevalences in refpop
pe <- point_estimate(a)
q <- c((1-level)/2, 1-(1-level)/2)
c(pe, quantile(a, q)) # 95% CIs
}
mc(x1, refpop)
mc(x2, refpop)
#---------------------------------------------------------------------
# check that 95% confidence intervals include 'true' value 95% of time
#---------------------------------------------------------------------
# 'true' value
tv <- sum(actual$true_prob[actual$sex == 'm'] * refpop) / sum(refpop)
# many samples (using the male group only)
actual_m <- actual[actual$sex == 'm',]
NS <- 500
ms <- rbinom(nrow(actual_m) * NS, actual_m$n, actual_m$true_prob)
ms <- matrix(ms, ncol = NS)
f_trial <- function(ms_) {
d <- cbind(actual_m, sample_success = ms_)
d$sample_prev <- d$sample_success / d$n
mc(d, refpop)
}
cis <- apply(ms, 2, f_trial)
cis <- cis[,order(cis[1,])]
cr <- tv >= cis[2,] & tv <= cis[3,]
mean(cr)
plot(1, type = 'n', xlim = c(0, NS + 1), ylim = c(0.15, 0.35), ylab = 'prevalence', xlab = 'sample')
points(seq_len(NS), cis[1,], pch = 19)
arrows(seq_len(NS), cis[2,], seq_len(NS), cis[3,], code = 3, angle = 90, length = 0.03)
abline(h = tv)
#---------------------------
# Method 2: survey weighting
#---------------------------
# Based on the method described by CDC:
# https://www.cdc.gov/nchs/tutorials/NHANES/NHANESAnalyses/agestandardization/age_standardization_intro.htm
library(survey)
# Expand data
fs <- function(AGE, SEX, n, success) {
out <- data.frame(outcome = c(rep(1, success), rep(0, n - success)))
out$age <- AGE
out$sex <- SEX
out
}
dat2 <- mapply(fs, AGE = dat$age, SEX = dat$sex, n = dat$n, success = dat$sample_success, SIMPLIFY = F)
dat2 <- do.call('rbind', dat2)
# Calculate standardised proportion
design <- svydesign(ids = ~1, strata = ~age, data = dat2)
stdes <- svystandardize(design, by = ~age, over = ~sex, population = refpop)
y <- svyby(~outcome, ~sex, svyciprop, design = stdes)
names(y)[3] <- 'se'
q <- qnorm(0.975)
y$lower <- y$outcome - q * y$se
y$upper <- y$outcome + q * y$se
y
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment