BioSciEconomist/CI Simulation.r

## CI Simulation.r
# *-----------------------------------------------------------------
# | PROGRAM NAME: CI Simulation
# | DATE: 7/16/22
# | CREATED BY: MATT BOGARD
# | PROJECT FILE:
# *----------------------------------------------------------------
# | PURPOSE: How much confidence can we place in the range of values in a CI vs the point estimate
# *----------------------------------------------------------------

rm(list=ls()) # get rid of any existing data
cat("\f") # clear console
dev.off() # clear plots
options("scipen" =100, "digits" = 4) # override R's tendency to use scientific notation


library(MASS)


# ----------------------------------------------
#
#
# create super population
#
#
# ----------------------------------------------


#
# treatent group
#

mu <- 3500  # mean 4000
sd <- 10500 # standard deviation 3 x mean
cv <- sd/mu # CV
kappa <- 1/(cv^2)  # shape parameter
theta <-  (sd*sd)/mu  # this is theta or scale parameter


# simulate costs based on parameter assumptions above
set.seed(1234567)
cost <- rgamma(100000, shape = kappa, scale = theta)

# check descriptive results
summary(cost)
sd(cost)
hist(cost, breaks = 50)

# simulate ID and treatment indicator
ID <- seq(1,100000)
D <- rep(1,100000)

trt <- data.frame(ID,cost,D) # combine metrics

#
# control group
#

mu <- 4000 # mean 6000
sd <- 12000 # standard deviation 3 x mean
cv <- sd/mu # CV
kappa <- 1/(cv^2)  # shape parameter
theta <-  (sd*sd)/mu  # this is theta or scale parameter

# simulate costs based on parameter assumptions above
set.seed(1234567)
cost <- rgamma(100000, shape = kappa, scale = theta)

# check descriptive results
summary(cost)
sd(cost)
hist(cost, breaks = 50)

# simulate ID and treatment indicator
ID <- seq(100001,200000)
D <- rep(0,100000)

ctrl <- data.frame(ID,cost,D) # combine metrics

#
# create population data frame
#

df <- rbind(trt,ctrl)

# check descriptives
summary(df)
df%>%
  group_by(D) %>%
  summarize(y = mean(cost),
            n = n())
hist(df$cost, breaks = 50)

summary(lm(cost ~ D, data = df)) # regression on full model

simdatA <- df # we will use this simulated data below

#
# what sample size do we need to measure this effect with 80% power and 95% confidence
#

library(pwr)

# calculate effect size 'd'

sigmaA = sd(simdatA$cost[simdatA$D ==1]) # standard deviation for group A
sigmaB = sd(simdatA$cost[simdatA$D ==0])  # standard deviation for group B (for now assume common variance between groups)
Q0 = .5 # proportion in the control group (.5 implies equal sized groups)
sigma_pooled = sqrt(((sigmaA)^2*(1/Q0 -1 ) + (sigmaB)^2*(1/(1-Q0) -1 ) )/((1/Q0 -1 ) + (1/(1-Q0) -1 ))) # pooled standard deviation


# how large of a sample for $500
d = -500/sigma_pooled

pwr.t.test(n = , d =d , sig.level = .05 , power = .8 ,type = c("two.sample"))

# how much power do we have with varying sample sizes

# pwr.t.test(n = 1000, d =d , sig.level = .05 , power =  ,type = c("two.sample"))

# n = 8000 80%
# n = 7000 74%
# n = 6000 68%
# n = 5000 60%
# n = 4000 50%
# n = 3000 40%
# n = 2000 30%
# n = 1000 15%

# -----------------------------------------
# CI simulation for scenario 1
# -----------------------------------------

Sn <- 8000  # 8000  fully powered sample


# create treatment and control groups
treat <- simdatA[simdatA$D == 1,]
ctrl <- simdatA[simdatA$D == 0,]


set.seed(9876) # make this repeatable 1234567
pvals <-c()
bhat <- c()
CI_Lower <- c()
CI_Upper <- c()
repNum <- c()
bstrap <- c() # initialize vector for bootstrapped estimates
for (i in 1:100) {
  sample1 <- treat[sample(nrow(treat), Sn, replace = T), ] # sample from treatment group
  sample2 <- ctrl[sample(nrow(ctrl), Sn, replace = T), ]  # sample from control group
  bs <- rbind(sample1,sample2) # sample
  (summ <- summary(lm(cost ~ D, data = bs))) # our estimator
  se <- coef(summ)[4] # SE
  est <- coef(summ)[2] # est
  pval <- coef(summ)[8] # pvalue
  LB <- est - 1.96*se
  UB <- est + 1.96*se
  bhat <- c(bhat,est)
  pvals <- c(pvals,pval)
  CI_Lower <- c(CI_Lower, LB)
  CI_Upper <- c(CI_Upper, UB)
  repNum <- c(repNum,i) # get p-value from this bs sample
  print(cbind("Iteration:", i)) # comment this out to stop printing to log
}

rpt <- data.frame(repNum,bhat,pvals,CI_Lower,CI_Upper)

rm(LB,UB,se,est,i,sample1,sample2,bs,summ,bhat) # cleanup


k <- -500 # population parameter we are estimating


rpt$flag <- ifelse((rpt$CI_Lower <= k & rpt$CI_Upper >= k),1,0) # interval contains 'k'
rpt$sig <- ifelse(rpt$pvals < .05,1,0 ) # estimate is statistically different from 0
rpt$diffLB <- abs(k - rpt$CI_Lower) # difference between k and CI lower bound
rpt$diffUB <- abs(k - rpt$CI_Upper) # difference between k and CI upper bound
rpt$diffEst <- abs(k - rpt$bhat) # difference between point estimate and k
rpt$est_closer <- ifelse(rpt$diffEst < rpt$diffLB  & rpt$diffEst < rpt$diffUB,1,0) # flag when point estimate is closer to k
summary(rpt) # check


plot(rpt$bhat, pch=20, ylim=range(pretty(c(rpt$CI_Lower, rpt$CI_Upper))),
     xlab='', ylab='', las=1, cex.axis=0.8, cex=.5, xaxt='n')
segments(seq_len(nrow(rpt)), rpt$CI_Lower, y1= rpt$CI_Upper, lend=1)
segments(seq_len(nrow(rpt)), rpt$CI_Lower, y1= rpt$CI_Upper, lwd=1, lend=1)
abline(h = -500)

# order by estimate
tmp <- rpt[order(rpt$bhat),]
plot(tmp$bhat, pch=20, ylim=range(pretty(c(tmp$CI_Lower, tmp$CI_Upper))),
     xlab='', ylab='', las=1, cex.axis=0.8, cex=.5, xaxt='n')
segments(seq_len(nrow(tmp)), tmp$CI_Lower, y1= tmp$CI_Upper, lend=1)
segments(seq_len(nrow(tmp)), tmp$CI_Lower, y1= tmp$CI_Upper, lwd=1, lend=1)
abline(h = -500)

# separate intervals where estimate is closer to k than endpoints
tmp <- tmp[order(tmp$est_closer),]
plot(tmp$bhat, pch=20, ylim=range(pretty(c(tmp$CI_Lower, tmp$CI_Upper))),
     xlab='', ylab='', las=1, cex.axis=0.8, cex=.5, xaxt='n')
segments(seq_len(nrow(tmp)), tmp$CI_Lower, y1= tmp$CI_Upper, lend=1)
segments(seq_len(nrow(tmp)), tmp$CI_Lower, y1= tmp$CI_Upper, lwd=1, lend=1)
abline(h = -500)


capture <- c()
tmp <- rpt
for (i in 1:100) {
  chk <- rpt[rpt$repNum ==i,] # select CI

  LB <-chk$CI_Lower
  UB <- chk$CI_Upper
  tmp$capture <- ifelse((tmp$bhat > LB & tmp$bhat < UB), 1,0)
  r <- mean(tmp$capture)
  capture <- c(capture,r)
  print(cbind("Iteration:", i))
}

summary(capture)
	# *-----------------------------------------------------------------
	# \| PROGRAM NAME: CI Simulation
	# \| DATE: 7/16/22
	# \| CREATED BY: MATT BOGARD
	# \| PROJECT FILE:
	# *----------------------------------------------------------------
	# \| PURPOSE: How much confidence can we place in the range of values in a CI vs the point estimate
	# *----------------------------------------------------------------

	rm(list=ls()) # get rid of any existing data
	cat("\f") # clear console
	dev.off() # clear plots
	options("scipen" =100, "digits" = 4) # override R's tendency to use scientific notation


	library(MASS)


	# ----------------------------------------------
	#
	#
	# create super population
	#
	#
	# ----------------------------------------------



	#
	# treatent group
	#

	mu <- 3500 # mean 4000
	sd <- 10500 # standard deviation 3 x mean
	cv <- sd/mu # CV
	kappa <- 1/(cv^2) # shape parameter
	theta <- (sd*sd)/mu # this is theta or scale parameter


	# simulate costs based on parameter assumptions above
	set.seed(1234567)
	cost <- rgamma(100000, shape = kappa, scale = theta)

	# check descriptive results
	summary(cost)
	sd(cost)
	hist(cost, breaks = 50)

	# simulate ID and treatment indicator
	ID <- seq(1,100000)
	D <- rep(1,100000)

	trt <- data.frame(ID,cost,D) # combine metrics

	#
	# control group
	#

	mu <- 4000 # mean 6000
	sd <- 12000 # standard deviation 3 x mean
	cv <- sd/mu # CV
	kappa <- 1/(cv^2) # shape parameter
	theta <- (sd*sd)/mu # this is theta or scale parameter

	# simulate costs based on parameter assumptions above
	set.seed(1234567)
	cost <- rgamma(100000, shape = kappa, scale = theta)

	# check descriptive results
	summary(cost)
	sd(cost)
	hist(cost, breaks = 50)

	# simulate ID and treatment indicator
	ID <- seq(100001,200000)
	D <- rep(0,100000)

	ctrl <- data.frame(ID,cost,D) # combine metrics

	#
	# create population data frame
	#

	df <- rbind(trt,ctrl)

	# check descriptives
	summary(df)
	df%>%
	group_by(D) %>%
	summarize(y = mean(cost),
	n = n())
	hist(df$cost, breaks = 50)

	summary(lm(cost ~ D, data = df)) # regression on full model

	simdatA <- df # we will use this simulated data below

	#
	# what sample size do we need to measure this effect with 80% power and 95% confidence
	#

	library(pwr)

	# calculate effect size 'd'

	sigmaA = sd(simdatA$cost[simdatA$D ==1]) # standard deviation for group A
	sigmaB = sd(simdatA$cost[simdatA$D ==0]) # standard deviation for group B (for now assume common variance between groups)
	Q0 = .5 # proportion in the control group (.5 implies equal sized groups)
	sigma_pooled = sqrt(((sigmaA)^2(1/Q0 -1 ) + (sigmaB)^2(1/(1-Q0) -1 ) )/((1/Q0 -1 ) + (1/(1-Q0) -1 ))) # pooled standard deviation


	# how large of a sample for $500
	d = -500/sigma_pooled

	pwr.t.test(n = , d =d , sig.level = .05 , power = .8 ,type = c("two.sample"))

	# how much power do we have with varying sample sizes

	# pwr.t.test(n = 1000, d =d , sig.level = .05 , power = ,type = c("two.sample"))

	# n = 8000 80%
	# n = 7000 74%
	# n = 6000 68%
	# n = 5000 60%
	# n = 4000 50%
	# n = 3000 40%
	# n = 2000 30%
	# n = 1000 15%

	# -----------------------------------------
	# CI simulation for scenario 1
	# -----------------------------------------

	Sn <- 8000 # 8000 fully powered sample


	# create treatment and control groups
	treat <- simdatA[simdatA$D == 1,]
	ctrl <- simdatA[simdatA$D == 0,]


	set.seed(9876) # make this repeatable 1234567
	pvals <-c()
	bhat <- c()
	CI_Lower <- c()
	CI_Upper <- c()
	repNum <- c()
	bstrap <- c() # initialize vector for bootstrapped estimates
	for (i in 1:100) {
	sample1 <- treat[sample(nrow(treat), Sn, replace = T), ] # sample from treatment group
	sample2 <- ctrl[sample(nrow(ctrl), Sn, replace = T), ] # sample from control group
	bs <- rbind(sample1,sample2) # sample
	(summ <- summary(lm(cost ~ D, data = bs))) # our estimator
	se <- coef(summ)[4] # SE
	est <- coef(summ)[2] # est
	pval <- coef(summ)[8] # pvalue
	LB <- est - 1.96*se
	UB <- est + 1.96*se
	bhat <- c(bhat,est)
	pvals <- c(pvals,pval)
	CI_Lower <- c(CI_Lower, LB)
	CI_Upper <- c(CI_Upper, UB)
	repNum <- c(repNum,i) # get p-value from this bs sample
	print(cbind("Iteration:", i)) # comment this out to stop printing to log
	}

	rpt <- data.frame(repNum,bhat,pvals,CI_Lower,CI_Upper)

	rm(LB,UB,se,est,i,sample1,sample2,bs,summ,bhat) # cleanup


	k <- -500 # population parameter we are estimating



	rpt$flag <- ifelse((rpt$CI_Lower <= k & rpt$CI_Upper >= k),1,0) # interval contains 'k'
	rpt$sig <- ifelse(rpt$pvals < .05,1,0 ) # estimate is statistically different from 0
	rpt$diffLB <- abs(k - rpt$CI_Lower) # difference between k and CI lower bound
	rpt$diffUB <- abs(k - rpt$CI_Upper) # difference between k and CI upper bound
	rpt$diffEst <- abs(k - rpt$bhat) # difference between point estimate and k
	rpt$est_closer <- ifelse(rpt$diffEst < rpt$diffLB & rpt$diffEst < rpt$diffUB,1,0) # flag when point estimate is closer to k
	summary(rpt) # check



	plot(rpt$bhat, pch=20, ylim=range(pretty(c(rpt$CI_Lower, rpt$CI_Upper))),
	xlab='', ylab='', las=1, cex.axis=0.8, cex=.5, xaxt='n')
	segments(seq_len(nrow(rpt)), rpt$CI_Lower, y1= rpt$CI_Upper, lend=1)
	segments(seq_len(nrow(rpt)), rpt$CI_Lower, y1= rpt$CI_Upper, lwd=1, lend=1)
	abline(h = -500)

	# order by estimate
	tmp <- rpt[order(rpt$bhat),]
	plot(tmp$bhat, pch=20, ylim=range(pretty(c(tmp$CI_Lower, tmp$CI_Upper))),
	xlab='', ylab='', las=1, cex.axis=0.8, cex=.5, xaxt='n')
	segments(seq_len(nrow(tmp)), tmp$CI_Lower, y1= tmp$CI_Upper, lend=1)
	segments(seq_len(nrow(tmp)), tmp$CI_Lower, y1= tmp$CI_Upper, lwd=1, lend=1)
	abline(h = -500)

	# separate intervals where estimate is closer to k than endpoints
	tmp <- tmp[order(tmp$est_closer),]
	plot(tmp$bhat, pch=20, ylim=range(pretty(c(tmp$CI_Lower, tmp$CI_Upper))),
	xlab='', ylab='', las=1, cex.axis=0.8, cex=.5, xaxt='n')
	segments(seq_len(nrow(tmp)), tmp$CI_Lower, y1= tmp$CI_Upper, lend=1)
	segments(seq_len(nrow(tmp)), tmp$CI_Lower, y1= tmp$CI_Upper, lwd=1, lend=1)
	abline(h = -500)


	capture <- c()
	tmp <- rpt
	for (i in 1:100) {
	chk <- rpt[rpt$repNum ==i,] # select CI

	LB <-chk$CI_Lower
	UB <- chk$CI_Upper
	tmp$capture <- ifelse((tmp$bhat > LB & tmp$bhat < UB), 1,0)
	r <- mean(tmp$capture)
	capture <- c(capture,r)
	print(cbind("Iteration:", i))
	}

	summary(capture)