Last active
July 19, 2022 15:18
-
-
Save BioSciEconomist/1bf2fcc834ffa88b4cf810c27be5af80 to your computer and use it in GitHub Desktop.
How much confidence can we place in the range of values in a CI vs the point estimate
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# *----------------------------------------------------------------- | |
# | PROGRAM NAME: CI Simulation | |
# | DATE: 7/16/22 | |
# | CREATED BY: MATT BOGARD | |
# | PROJECT FILE: | |
# *---------------------------------------------------------------- | |
# | PURPOSE: How much confidence can we place in the range of values in a CI vs the point estimate | |
# *---------------------------------------------------------------- | |
rm(list=ls()) # get rid of any existing data | |
cat("\f") # clear console | |
dev.off() # clear plots | |
options("scipen" =100, "digits" = 4) # override R's tendency to use scientific notation | |
library(MASS) | |
# ---------------------------------------------- | |
# | |
# | |
# create super population | |
# | |
# | |
# ---------------------------------------------- | |
# | |
# treatent group | |
# | |
mu <- 3500 # mean 4000 | |
sd <- 10500 # standard deviation 3 x mean | |
cv <- sd/mu # CV | |
kappa <- 1/(cv^2) # shape parameter | |
theta <- (sd*sd)/mu # this is theta or scale parameter | |
# simulate costs based on parameter assumptions above | |
set.seed(1234567) | |
cost <- rgamma(100000, shape = kappa, scale = theta) | |
# check descriptive results | |
summary(cost) | |
sd(cost) | |
hist(cost, breaks = 50) | |
# simulate ID and treatment indicator | |
ID <- seq(1,100000) | |
D <- rep(1,100000) | |
trt <- data.frame(ID,cost,D) # combine metrics | |
# | |
# control group | |
# | |
mu <- 4000 # mean 6000 | |
sd <- 12000 # standard deviation 3 x mean | |
cv <- sd/mu # CV | |
kappa <- 1/(cv^2) # shape parameter | |
theta <- (sd*sd)/mu # this is theta or scale parameter | |
# simulate costs based on parameter assumptions above | |
set.seed(1234567) | |
cost <- rgamma(100000, shape = kappa, scale = theta) | |
# check descriptive results | |
summary(cost) | |
sd(cost) | |
hist(cost, breaks = 50) | |
# simulate ID and treatment indicator | |
ID <- seq(100001,200000) | |
D <- rep(0,100000) | |
ctrl <- data.frame(ID,cost,D) # combine metrics | |
# | |
# create population data frame | |
# | |
df <- rbind(trt,ctrl) | |
# check descriptives | |
summary(df) | |
df%>% | |
group_by(D) %>% | |
summarize(y = mean(cost), | |
n = n()) | |
hist(df$cost, breaks = 50) | |
summary(lm(cost ~ D, data = df)) # regression on full model | |
simdatA <- df # we will use this simulated data below | |
# | |
# what sample size do we need to measure this effect with 80% power and 95% confidence | |
# | |
library(pwr) | |
# calculate effect size 'd' | |
sigmaA = sd(simdatA$cost[simdatA$D ==1]) # standard deviation for group A | |
sigmaB = sd(simdatA$cost[simdatA$D ==0]) # standard deviation for group B (for now assume common variance between groups) | |
Q0 = .5 # proportion in the control group (.5 implies equal sized groups) | |
sigma_pooled = sqrt(((sigmaA)^2*(1/Q0 -1 ) + (sigmaB)^2*(1/(1-Q0) -1 ) )/((1/Q0 -1 ) + (1/(1-Q0) -1 ))) # pooled standard deviation | |
# how large of a sample for $500 | |
d = -500/sigma_pooled | |
pwr.t.test(n = , d =d , sig.level = .05 , power = .8 ,type = c("two.sample")) | |
# how much power do we have with varying sample sizes | |
# pwr.t.test(n = 1000, d =d , sig.level = .05 , power = ,type = c("two.sample")) | |
# n = 8000 80% | |
# n = 7000 74% | |
# n = 6000 68% | |
# n = 5000 60% | |
# n = 4000 50% | |
# n = 3000 40% | |
# n = 2000 30% | |
# n = 1000 15% | |
# ----------------------------------------- | |
# CI simulation for scenario 1 | |
# ----------------------------------------- | |
Sn <- 8000 # 8000 fully powered sample | |
# create treatment and control groups | |
treat <- simdatA[simdatA$D == 1,] | |
ctrl <- simdatA[simdatA$D == 0,] | |
set.seed(9876) # make this repeatable 1234567 | |
pvals <-c() | |
bhat <- c() | |
CI_Lower <- c() | |
CI_Upper <- c() | |
repNum <- c() | |
bstrap <- c() # initialize vector for bootstrapped estimates | |
for (i in 1:100) { | |
sample1 <- treat[sample(nrow(treat), Sn, replace = T), ] # sample from treatment group | |
sample2 <- ctrl[sample(nrow(ctrl), Sn, replace = T), ] # sample from control group | |
bs <- rbind(sample1,sample2) # sample | |
(summ <- summary(lm(cost ~ D, data = bs))) # our estimator | |
se <- coef(summ)[4] # SE | |
est <- coef(summ)[2] # est | |
pval <- coef(summ)[8] # pvalue | |
LB <- est - 1.96*se | |
UB <- est + 1.96*se | |
bhat <- c(bhat,est) | |
pvals <- c(pvals,pval) | |
CI_Lower <- c(CI_Lower, LB) | |
CI_Upper <- c(CI_Upper, UB) | |
repNum <- c(repNum,i) # get p-value from this bs sample | |
print(cbind("Iteration:", i)) # comment this out to stop printing to log | |
} | |
rpt <- data.frame(repNum,bhat,pvals,CI_Lower,CI_Upper) | |
rm(LB,UB,se,est,i,sample1,sample2,bs,summ,bhat) # cleanup | |
k <- -500 # population parameter we are estimating | |
rpt$flag <- ifelse((rpt$CI_Lower <= k & rpt$CI_Upper >= k),1,0) # interval contains 'k' | |
rpt$sig <- ifelse(rpt$pvals < .05,1,0 ) # estimate is statistically different from 0 | |
rpt$diffLB <- abs(k - rpt$CI_Lower) # difference between k and CI lower bound | |
rpt$diffUB <- abs(k - rpt$CI_Upper) # difference between k and CI upper bound | |
rpt$diffEst <- abs(k - rpt$bhat) # difference between point estimate and k | |
rpt$est_closer <- ifelse(rpt$diffEst < rpt$diffLB & rpt$diffEst < rpt$diffUB,1,0) # flag when point estimate is closer to k | |
summary(rpt) # check | |
plot(rpt$bhat, pch=20, ylim=range(pretty(c(rpt$CI_Lower, rpt$CI_Upper))), | |
xlab='', ylab='', las=1, cex.axis=0.8, cex=.5, xaxt='n') | |
segments(seq_len(nrow(rpt)), rpt$CI_Lower, y1= rpt$CI_Upper, lend=1) | |
segments(seq_len(nrow(rpt)), rpt$CI_Lower, y1= rpt$CI_Upper, lwd=1, lend=1) | |
abline(h = -500) | |
# order by estimate | |
tmp <- rpt[order(rpt$bhat),] | |
plot(tmp$bhat, pch=20, ylim=range(pretty(c(tmp$CI_Lower, tmp$CI_Upper))), | |
xlab='', ylab='', las=1, cex.axis=0.8, cex=.5, xaxt='n') | |
segments(seq_len(nrow(tmp)), tmp$CI_Lower, y1= tmp$CI_Upper, lend=1) | |
segments(seq_len(nrow(tmp)), tmp$CI_Lower, y1= tmp$CI_Upper, lwd=1, lend=1) | |
abline(h = -500) | |
# separate intervals where estimate is closer to k than endpoints | |
tmp <- tmp[order(tmp$est_closer),] | |
plot(tmp$bhat, pch=20, ylim=range(pretty(c(tmp$CI_Lower, tmp$CI_Upper))), | |
xlab='', ylab='', las=1, cex.axis=0.8, cex=.5, xaxt='n') | |
segments(seq_len(nrow(tmp)), tmp$CI_Lower, y1= tmp$CI_Upper, lend=1) | |
segments(seq_len(nrow(tmp)), tmp$CI_Lower, y1= tmp$CI_Upper, lwd=1, lend=1) | |
abline(h = -500) | |
capture <- c() | |
tmp <- rpt | |
for (i in 1:100) { | |
chk <- rpt[rpt$repNum ==i,] # select CI | |
LB <-chk$CI_Lower | |
UB <- chk$CI_Upper | |
tmp$capture <- ifelse((tmp$bhat > LB & tmp$bhat < UB), 1,0) | |
r <- mean(tmp$capture) | |
capture <- c(capture,r) | |
print(cbind("Iteration:", i)) | |
} | |
summary(capture) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment