danlewer/direct_standardise.R

## direct_standardise.R
#-------------
# example data
#-------------

actual <- expand.grid(age = 1:3, sex = c('m', 'f')) # age groups 1:3
actual$n <- c(103, 313, 584, 606, 293, 101)
actual$true_prob <- c(0.1, 0.2, 0.4, 0.15, 0.25, 0.45)

dat <- actual
dat$sample_success <- rbinom(nrow(dat), dat$n, dat$true_prob) # study sample
dat$sample_prev <- dat$sample_success / dat$n

#------------------------------------------
# manual calculation of adjusted prevalence
#------------------------------------------

refpop <- aggregate(n ~ age, dat, sum)$n # reference population (whole sample)

x1 <- dat[dat$sex == 'm',]
x2 <- dat[dat$sex == 'f',]

sum(x1$sample_prev * refpop) / sum(refpop) # male
sum(x2$sample_prev * refpop) / sum(refpop) # female

#---------------------------------------------------------
# Method 1: Monte Carlo estimation of confidence intervals
#---------------------------------------------------------

mc <- function(x, refpop, N = 100000, point_estimate = mean, level = 0.95) {
  a <- rbinom(nrow(x) * N, x$n, x$sample_prev)
  a <- matrix(a, ncol = N) # randomly generated from binomial distribution
  a <- a / x$n # prevalence
  a <- a * refpop # number expected in refpop
  a <- colSums(a) / sum(refpop) # prevalences in refpop
  pe <- point_estimate(a)
  q <- c((1-level)/2, 1-(1-level)/2)
  c(pe, quantile(a, q)) # 95% CIs
}

mc(x1, refpop)
mc(x2, refpop)

#---------------------------------------------------------------------
# check that 95% confidence intervals include 'true' value 95% of time
#---------------------------------------------------------------------

# 'true' value
tv <- sum(actual$true_prob[actual$sex == 'm'] * refpop) / sum(refpop)

# many samples (using the male group only)
actual_m <- actual[actual$sex == 'm',]

NS <- 500
ms <- rbinom(nrow(actual_m) * NS, actual_m$n, actual_m$true_prob)
ms <- matrix(ms, ncol = NS)

f_trial <- function(ms_) {
  d <- cbind(actual_m, sample_success = ms_)
  d$sample_prev <- d$sample_success / d$n
  mc(d, refpop)
}

cis <- apply(ms, 2, f_trial)
cis <- cis[,order(cis[1,])]
cr <- tv >= cis[2,] & tv <= cis[3,]
mean(cr)

plot(1, type = 'n', xlim = c(0, NS + 1), ylim = c(0.15, 0.35), ylab = 'prevalence', xlab = 'sample')
points(seq_len(NS), cis[1,], pch = 19)
arrows(seq_len(NS), cis[2,], seq_len(NS), cis[3,], code = 3, angle = 90, length = 0.03)
abline(h = tv)

#---------------------------
# Method 2: survey weighting
#---------------------------

# Based on the method described by CDC:
# https://www.cdc.gov/nchs/tutorials/NHANES/NHANESAnalyses/agestandardization/age_standardization_intro.htm
library(survey)

# Expand data
fs <- function(AGE, SEX, n, success) {
  out <- data.frame(outcome = c(rep(1, success), rep(0, n - success)))
  out$age <- AGE
  out$sex <- SEX
  out
}
dat2 <- mapply(fs, AGE = dat$age, SEX = dat$sex, n = dat$n, success = dat$sample_success, SIMPLIFY = F)
dat2 <- do.call('rbind', dat2)

# Calculate standardised proportion
design <- svydesign(ids = ~1, strata = ~age, data = dat2)
stdes <- svystandardize(design, by = ~age, over = ~sex, population = refpop)
y <- svyby(~outcome, ~sex, svyciprop, design = stdes)
names(y)[3] <- 'se'
q <- qnorm(0.975)
y$lower <- y$outcome - q * y$se
y$upper <- y$outcome + q * y$se
y
	#-------------
	# example data
	#-------------

	actual <- expand.grid(age = 1:3, sex = c('m', 'f')) # age groups 1:3
	actual$n <- c(103, 313, 584, 606, 293, 101)
	actual$true_prob <- c(0.1, 0.2, 0.4, 0.15, 0.25, 0.45)

	dat <- actual
	dat$sample_success <- rbinom(nrow(dat), dat$n, dat$true_prob) # study sample
	dat$sample_prev <- dat$sample_success / dat$n

	#------------------------------------------
	# manual calculation of adjusted prevalence
	#------------------------------------------

	refpop <- aggregate(n ~ age, dat, sum)$n # reference population (whole sample)

	x1 <- dat[dat$sex == 'm',]
	x2 <- dat[dat$sex == 'f',]

	sum(x1$sample_prev * refpop) / sum(refpop) # male
	sum(x2$sample_prev * refpop) / sum(refpop) # female

	#---------------------------------------------------------
	# Method 1: Monte Carlo estimation of confidence intervals
	#---------------------------------------------------------

	mc <- function(x, refpop, N = 100000, point_estimate = mean, level = 0.95) {
	a <- rbinom(nrow(x) * N, x$n, x$sample_prev)
	a <- matrix(a, ncol = N) # randomly generated from binomial distribution
	a <- a / x$n # prevalence
	a <- a * refpop # number expected in refpop
	a <- colSums(a) / sum(refpop) # prevalences in refpop
	pe <- point_estimate(a)
	q <- c((1-level)/2, 1-(1-level)/2)
	c(pe, quantile(a, q)) # 95% CIs
	}

	mc(x1, refpop)
	mc(x2, refpop)

	#---------------------------------------------------------------------
	# check that 95% confidence intervals include 'true' value 95% of time
	#---------------------------------------------------------------------

	# 'true' value
	tv <- sum(actual$true_prob[actual$sex == 'm'] * refpop) / sum(refpop)

	# many samples (using the male group only)
	actual_m <- actual[actual$sex == 'm',]

	NS <- 500
	ms <- rbinom(nrow(actual_m) * NS, actual_m$n, actual_m$true_prob)
	ms <- matrix(ms, ncol = NS)

	f_trial <- function(ms_) {
	d <- cbind(actual_m, sample_success = ms_)
	d$sample_prev <- d$sample_success / d$n
	mc(d, refpop)
	}

	cis <- apply(ms, 2, f_trial)
	cis <- cis[,order(cis[1,])]
	cr <- tv >= cis[2,] & tv <= cis[3,]
	mean(cr)

	plot(1, type = 'n', xlim = c(0, NS + 1), ylim = c(0.15, 0.35), ylab = 'prevalence', xlab = 'sample')
	points(seq_len(NS), cis[1,], pch = 19)
	arrows(seq_len(NS), cis[2,], seq_len(NS), cis[3,], code = 3, angle = 90, length = 0.03)
	abline(h = tv)

	#---------------------------
	# Method 2: survey weighting
	#---------------------------

	# Based on the method described by CDC:
	# https://www.cdc.gov/nchs/tutorials/NHANES/NHANESAnalyses/agestandardization/age_standardization_intro.htm
	library(survey)

	# Expand data
	fs <- function(AGE, SEX, n, success) {
	out <- data.frame(outcome = c(rep(1, success), rep(0, n - success)))
	out$age <- AGE
	out$sex <- SEX
	out
	}
	dat2 <- mapply(fs, AGE = dat$age, SEX = dat$sex, n = dat$n, success = dat$sample_success, SIMPLIFY = F)
	dat2 <- do.call('rbind', dat2)

	# Calculate standardised proportion
	design <- svydesign(ids = ~1, strata = ~age, data = dat2)
	stdes <- svystandardize(design, by = ~age, over = ~sex, population = refpop)
	y <- svyby(~outcome, ~sex, svyciprop, design = stdes)
	names(y)[3] <- 'se'
	q <- qnorm(0.975)
	y$lower <- y$outcome - q * y$se
	y$upper <- y$outcome + q * y$se
	y