Skip to content

Instantly share code, notes, and snippets.

@BioSciEconomist
Last active October 17, 2022 21:46
Show Gist options
  • Save BioSciEconomist/c494ff12b5589c1c9e712ef38ec53724 to your computer and use it in GitHub Desktop.
Save BioSciEconomist/c494ff12b5589c1c9e712ef38ec53724 to your computer and use it in GitHub Desktop.
Prospectively determine expected CI width (margin of error) based on power calc
# *-----------------------------------------------------------------
# | PROGRAM NAME: calc CI widths
# | DATE: 10/16/21
# | CREATED BY: MATT BOGARD
# | PROJECT FILE:
# *----------------------------------------------------------------
# | PURPOSE: prespectively determine expected CI width (margin of error) based on power calc
# *----------------------------------------------------------------
# based on: Goodman SN, Berlin JA. The use of predicted confidence intervals when planning experiments and
# the misuse of power when interpreting results. Ann Intern Med. 1994 Aug 1;121(3):200-6.
# doi: 10.7326/0003-4819-121-3-199408010-00008. Erratum in: Ann Intern Med 1995 Mar 15;122(6):478. PMID: 8017747.
# simualtions are used to provide additional intuition of the approach and extended for cases based on
# 70% confidence intervals and 80% power
# according to Goodman we can predict the width or ME as follows:
# +/- .70 *delta_80
# +/- .60 *delta_90
# where delta_80 is the true difference for which we have 80% power to detect with 95% confidence
# where delta_90 is the true difference for which we have 90% power to detect with 95% confidence
library(pwr)
#------------------------------
#
# example from paper
#
#-------------------------------
# calculate expected or hypothesized effect size of 25 PP from example
p1 <- .45 # basline
p2 <- .70 # improvement
(h = 2*asin(sqrt(p1)) - 2*asin(sqrt(p2)))
pwr.2p.test(h=.51, sig.level=0.05,power=0.90)
# n = 80
# with this sample size and expected effect size we can prospectively calclulate the expected width
# or margin of error for a 95% CI: ME = obs +/1 .6*delta_80 where delta_80 is the effect
# size expressed as an absolute percentage point difference
delta_90 <- .25
.6*delta_90
# ME = obs +/- .15
# additional examples (figure 1)
# what woudl CIs look like for observed difference of 15?
obs <- .15 # hypotehtical observed outcome (we never know this)
delta_90 <- .25 # what we are powered to detect based on business relevance
ME <- .6*delta_90
CI_lower <- obs - ME
CI_upper <- obs + ME
print(CI_lower)
print(CI_upper)
#------------------------------
#
# example- 95% CI to detect a 3PP increase given baseline of .50 w/80% power
#
#-------------------------------
# what is the requried n?
p1 <- .50 # basline
p2 <- .53 # improvement
(h = 2*asin(sqrt(p1)) - 2*asin(sqrt(p2)))
pwr.2p.test(h=.06, sig.level=0.05,power=0.80)
# n = 4361
# how wide do we expect the CI to be?
delta_80 <- .03 # what we are powered to detect based on business relevance
ME <- .7*delta_90 # here we use multiplicative factor of .7 for 80% power
print(ME)
# so if we actually found an impact of .025 what's our predicted CI?
obs <- .025 # hypotehtical observed outcome (we never know this)
CI_lower <- obs - ME
CI_upper <- obs + ME
print(CI_lower)
print(CI_upper)
#------------------------------
#
# example- 70% CI to detect a 3PP increase given baseline of .50 w/80% power
#
#-------------------------------
p1 <- .50 # basline
p2 <- .53 # improvement
(h = 2*asin(sqrt(p1)) - 2*asin(sqrt(p2)))
pwr.2p.test(h=.06, sig.level=0.30,power=0.80)
# n = 1945
# how wide do we expect the CI to be?
# we need to solve for the multiplicative factor related to 80% power & 70% confidence
# from the appendix we get the following derivation for 80% power & 95% confidence:
# delta_1_minus_B = (Z_alpha/2 + Z_beta)*SE
#
# where:
#
# 1 - B = power
# delta_1_minus_B = difference that can be detected with power 1- B
# Z_alpha/2 = Z score for 2 sided type II error of alpha
# Z_beta = Z score associated with one sided type II error of B
#
# For a one sided power of 80% and 2 sided alpha = 5%
# Z_alpha/2 + Z_beta = 1.96 + .84 = 2.8
#
# delta_80 is 2.8 SEs from the null hypothesis
#
# The predicted 95% CI equals the observed difference +/ 1.96 SE
# but we can replace that SE with delta_80/2.8
#
# Predicted 95% CI = obs difference +/- 1.96*(delta_80/2.8)
#
# = obs difference +/- .70 *delta_80
#
# We can similarly derive: obs difference +/- .60 *delta_90
#
#
# For 80% power and 70% confidence we make the following substitution:
#
# Z_alpha/2 + Z_beta = 1.036 + .84 = 1.876
#
# and it follows:
#
# Predicted 95% CI = obs difference +/- 1.036*(delta_80/1.876)
#
# = obs difference +/- .55 *delta_80
delta_80 <- .03 # what we are powered to detect based on business relevance
ME <- .55*delta_80 # here we use multiplicative factor of .55 solved for 80% power w/70% confidence
print(ME)
# so if we actually found an impact of .025 what's our predicted CI?
obs <- .025 # hypotehtical observed outcome (we never know this)
CI_lower <- obs - ME
CI_upper <- obs + ME
print(CI_lower)
print(CI_upper)
# ----------------------------------------------
#
#
# explore via simulation
#
#
# ----------------------------------------------
# simualte data based on linear probability function
# set.seed(123)
# x<-runif(1000,-1,1) # running varaible with cutoff
# pr <- .25 + .25*x +.15*(x>=0) + rnorm(1000,0,.05) # outcome with discontinuity at cutoff (x > 0)
# y = rbinom(1000,1,pr) # bernoulli response variable
# dat <- data.frame(pr,x,cov,y) # create data frame
# dat$D <- ifelse(dat$x > 0, 1,0) # flag 'treatment' group that exceeds threshold (0)
#
# treatent group
#
pr <- .53 # mean
# simulate costs based on parameter assumptions above
set.seed(1234567)
enroll <- rbinom(100000,1,pr)
# check descriptive results
summary(enroll)
# simulate ID and treatment indicator
ID <- seq(1,100000)
D <- rep(1,100000)
trt <- data.frame(ID,enroll,D) # combine metrics
#
# control group
#
pr <- .50 # mean
# simulate costs based on parameter assumptions above
set.seed(1234567)
enroll <- rbinom(100000,1,pr)
# check descriptive results
summary(enroll)
# simulate ID and treatment indicator
ID <- seq(1,100000)
D <- rep(0,100000)
ctrl <- data.frame(ID,enroll,D) # combine metrics
#
# create population data frame
#
df <- rbind(trt,ctrl)
# check descriptives
summary(df)
df%>%
group_by(D) %>%
summarize(y = mean(enroll),
n = n())
summary(lm(enroll ~ D, data = df)) # regression on full model
simdatA <- df # we will use this simulated data below
#
# what sample size do we need to measure this effect with 80% power and 95% confidence
#
library(pwr)
# what is the requried n?
p1 <- .50 # basline
p2 <- .53 # improvement
(h = 2*asin(sqrt(p1)) - 2*asin(sqrt(p2)))
pwr.2p.test(h=.06, sig.level=0.05,power=0.80)
# n = 4361
# -----------------------------------------
#
# CI projection 80% power 95% confidence
#
# -----------------------------------------
# how wide do we expect the CI to be?
delta_80 <- .03 # what we are powered to detect based on business relevance
ME <- .7*delta_80 # here we use multiplicative factor 80% power
print(ME) # +/- .02
# so if we actually found an impact as large as our definition of success (.03) what's our predicted CI?
obs <- .03 # hypotehtical observed outcome (we never know this)
CI_lower <- obs - ME
CI_upper <- obs + ME
print(CI_lower)
print(CI_upper)
#
# calclate observed CI from fully powered sample
#
Sn <- 4361 # fully powered sample
# create treatment and control groups
treat <- simdatA[simdatA$D == 1,]
ctrl <- simdatA[simdatA$D == 0,]
set.seed(9876) # make this repeatable 1234567
sample1 <- treat[sample(nrow(treat), Sn, replace = T), ] # sample from treatment group
set.seed(9876) # make this repeatable 1234567
sample2 <- ctrl[sample(nrow(ctrl), Sn, replace = T), ] # sample from control group
bs <- rbind(sample1,sample2) # sample data frame
(summ <- summary(lm(enroll ~ D, data = bs))) # our estimator
se <- coef(summ)[4] # SE
est <- coef(summ)[2] # est
(LB <- est - 1.96*se)
(UB <- est + 1.96*se)
(MEobs <- 1.96*se)
#
# does this hold in simulation
#
set.seed(9876) # make this repeatable 1234567
Sn = 4361
MEobs <- c()
repNum <- c()
bstrap <- c() # initialize vector for bootstrapped estimates
for (i in 1:100) {
sample1 <- treat[sample(nrow(treat), Sn, replace = T), ] # sample from treatment group
sample2 <- ctrl[sample(nrow(ctrl), Sn, replace = T), ] # sample from control group
bs <- rbind(sample1,sample2) # sample
(summ <- summary(lm(enroll ~ D, data = bs))) # our estimator
se <- coef(summ)[4] # SE
me <- 1.96*se
MEobs <- c(MEobs, me)
repNum <- c(repNum,i)
print(cbind("Iteration:", i)) # comment this out to stop printing to log
}
rpt <- data.frame(repNum,MEobs)
rm(se,i,sample1,sample2,bs,summ,bhat,MEobs) # cleanup
summary(rpt)
# -----------------------------------------
#
# CI projection 90% power 95% confidence
#
# -----------------------------------------
# how wide do we expect the CI to be?
delta_90 <- .03 # what we are powered to detect based on business relevance
ME <- .6*delta_90 # here we use multiplicative factor 80% power
print(ME) # +/- .018
# so if we actually found an impact as large as our definition of success (.03) what's our predicted CI?
obs <- .03 # hypotehtical observed outcome (we never know this)
CI_lower <- obs - ME
CI_upper <- obs + ME
print(CI_lower)
print(CI_upper)
#
# calclate observed CI from fully powered sample
#
# what is the requried n?
p1 <- .50 # basline
p2 <- .53 # improvement
(h = 2*asin(sqrt(p1)) - 2*asin(sqrt(p2)))
pwr.2p.test(h=.06, sig.level=0.05,power=0.90)
# n = 5838
Sn <- 5838 # fully powered sample
# create treatment and control groups
treat <- simdatA[simdatA$D == 1,]
ctrl <- simdatA[simdatA$D == 0,]
set.seed(9876) # make this repeatable 1234567
sample1 <- treat[sample(nrow(treat), Sn, replace = T), ] # sample from treatment group
set.seed(9876) # make this repeatable 1234567
sample2 <- ctrl[sample(nrow(ctrl), Sn, replace = T), ] # sample from control group
bs <- rbind(sample1,sample2) # sample data frame
(summ <- summary(lm(enroll ~ D, data = bs))) # our estimator
se <- coef(summ)[4] # SE
est <- coef(summ)[2] # est
(LB <- est - 1.96*se)
(UB <- est + 1.96*se)
(MEobs <- 1.96*se)
#
# does this hold in simulation
#
set.seed(9876) # make this repeatable 1234567
Sn = 5838
MEobs <- c()
repNum <- c()
bstrap <- c() # initialize vector for bootstrapped estimates
for (i in 1:100) {
sample1 <- treat[sample(nrow(treat), Sn, replace = T), ] # sample from treatment group
sample2 <- ctrl[sample(nrow(ctrl), Sn, replace = T), ] # sample from control group
bs <- rbind(sample1,sample2) # sample
(summ <- summary(lm(enroll ~ D, data = bs))) # our estimator
se <- coef(summ)[4] # SE
me <- 1.96*se
MEobs <- c(MEobs, me)
repNum <- c(repNum,i)
print(cbind("Iteration:", i)) # comment this out to stop printing to log
}
rpt <- data.frame(repNum,MEobs)
summary(rpt)
rm(se,i,sample1,sample2,bs,summ,bhat,MEobs) # cleanup
# -----------------------------------------
#
# CI projection 80% power 70% confidence
#
# -----------------------------------------
# how wide do we expect the CI to be?
delta_80 <- .03 # what we are powered to detect based on business relevance
ME <- .55*delta_80 # here we use multiplicative factor of .55 solved for 70% power
print(ME) # +/- .0165
#
# calclate observed CI from fully powered sample
#
# what is the requried n?
p1 <- .50 # basline
p2 <- .53 # improvement
(h = 2*asin(sqrt(p1)) - 2*asin(sqrt(p2)))
pwr.2p.test(h=.06, sig.level=0.30,power=0.80)
# n = 1945
Sn <- 1945 # fully powered sample
# create treatment and control groups
treat <- simdatA[simdatA$D == 1,]
ctrl <- simdatA[simdatA$D == 0,]
set.seed(9876) # make this repeatable 1234567
sample1 <- treat[sample(nrow(treat), Sn, replace = T), ] # sample from treatment group
set.seed(9876) # make this repeatable 1234567
sample2 <- ctrl[sample(nrow(ctrl), Sn, replace = T), ] # sample from control group
bs <- rbind(sample1,sample2) # sample data frame
(summ <- summary(lm(enroll ~ D, data = bs))) # our estimator
se <- coef(summ)[4] # SE
(MEobs <- 1.036*se) # observed ME for 70% CI
#
# does this hold in simulation
#
set.seed(9876) # make this repeatable 1234567
Sn = 1945
MEobs <- c()
repNum <- c()
bstrap <- c() # initialize vector for bootstrapped estimates
for (i in 1:100) {
sample1 <- treat[sample(nrow(treat), Sn, replace = T), ] # sample from treatment group
sample2 <- ctrl[sample(nrow(ctrl), Sn, replace = T), ] # sample from control group
bs <- rbind(sample1,sample2) # sample
(summ <- summary(lm(enroll ~ D, data = bs))) # our estimator
se <- coef(summ)[4] # SE
me <- 1.036*se
MEobs <- c(MEobs, me)
repNum <- c(repNum,i)
print(cbind("Iteration:", i)) # comment this out to stop printing to log
}
rpt <- data.frame(repNum,MEobs)
summary(rpt)
rm(se,i,sample1,sample2,bs,summ,MEobs) # cleanup
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment