BioSciEconomist/CI and TypeMS Errors.r

## CI and TypeMS Errors.r
  #*-----------------------------------------------------------------
  # | PROGRAM NAME: CI and TypeMS Errors.r
  # | DATE: 7/19/22
  # | CREATED BY: MATT BOGARD
  # | PROJECT FILE:
  # *----------------------------------------------------------------
  # | PURPOSE: Explore the magnitude and sign of errors when we are underpowered
  # *----------------------------------------------------------------

rm(list=ls()) # get rid of any existing data
cat("\f") # clear console
dev.off() # clear plots
options("scipen" =100, "digits" = 4) # override R's tendency to use scientific notation


library(MASS)


# ----------------------------------------------
#
#
# create super population
#
#
# ----------------------------------------------


#
# treatent group
#

mu <- 3500  # mean 4000
sd <- 10500 # standard deviation 3 x mean
cv <- sd/mu # CV
kappa <- 1/(cv^2)  # shape parameter
theta <-  (sd*sd)/mu  # this is theta or scale parameter


# simulate costs based on parameter assumptions above
set.seed(1234567)
cost <- rgamma(100000, shape = kappa, scale = theta)

# check descriptive results
summary(cost)
sd(cost)
hist(cost, breaks = 50)

# simulate ID and treatment indicator
ID <- seq(1,100000)
D <- rep(1,100000)

trt <- data.frame(ID,cost,D) # combine metrics

#
# control group
#

mu <- 4000 # mean 6000
sd <- 12000 # standard deviation 3 x mean
cv <- sd/mu # CV
kappa <- 1/(cv^2)  # shape parameter
theta <-  (sd*sd)/mu  # this is theta or scale parameter

# simulate costs based on parameter assumptions above
set.seed(1234567)
cost <- rgamma(100000, shape = kappa, scale = theta)

# check descriptive results
summary(cost)
sd(cost)
hist(cost, breaks = 50)

# simulate ID and treatment indicator
ID <- seq(100001,200000)
D <- rep(0,100000)

ctrl <- data.frame(ID,cost,D) # combine metrics

#
# create population data frame
#

df <- rbind(trt,ctrl)

# check descriptives
summary(df)
df%>%
  group_by(D) %>%
  summarize(y = mean(cost),
            n = n())
hist(df$cost, breaks = 50)

summary(lm(cost ~ D, data = df)) # regression on full model

simdatA <- df # we will use this simulated data below

#
# what sample size do we need to measure this effect with 80% power and 95% confidence
#

library(pwr)

# calculate effect size 'd'

sigmaA = sd(simdatA$cost[simdatA$D ==1]) # standard deviation for group A
sigmaB = sd(simdatA$cost[simdatA$D ==0])  # standard deviation for group B (for now assume common variance between groups)
Q0 = .5 # proportion in the control group (.5 implies equal sized groups)
sigma_pooled = sqrt(((sigmaA)^2*(1/Q0 -1 ) + (sigmaB)^2*(1/(1-Q0) -1 ) )/((1/Q0 -1 ) + (1/(1-Q0) -1 ))) # pooled standard deviation


# how large of a sample for $500
d = -500/sigma_pooled

pwr.t.test(n = , d =d , sig.level = .05 , power = .8 ,type = c("two.sample"))

# how much power do we have with varying sample sizes

# pwr.t.test(n = 1000, d =d , sig.level = .05 , power =  ,type = c("two.sample"))

# n = 8000 80%
# n = 7000 74%
# n = 6000 68%
# n = 5000 60%
# n = 4000 50%
# n = 3000 40%
# n = 2000 30%
# n = 1000 15%

# -----------------------------------------
# CI simulation for underpowered sample
# -----------------------------------------

Sn <- 1000  # 8000  fully powered sample


# create treatment and control groups
treat <- simdatA[simdatA$D == 1,]
ctrl <- simdatA[simdatA$D == 0,]


set.seed(9876) # make this repeatable 1234567
pvals <-c()
bhat <- c()
stderr <- c()
CI_Lower <- c()
CI_Upper <- c()
repNum <- c()
bstrap <- c() # initialize vector for bootstrapped estimates
for (i in 1:100) {
  sample1 <- treat[sample(nrow(treat), Sn, replace = T), ] # sample from treatment group
  sample2 <- ctrl[sample(nrow(ctrl), Sn, replace = T), ]  # sample from control group
  bs <- rbind(sample1,sample2) # sample
  (summ <- summary(lm(cost ~ D, data = bs))) # our estimator
  se <- coef(summ)[4] # SE
  est <- coef(summ)[2] # est
  pval <- coef(summ)[8] # pvalue
  LB <- est - 1.96*se
  UB <- est + 1.96*se
  bhat <- c(bhat,est)
  stderr <- c(stderr,se)
  pvals <- c(pvals,pval)
  CI_Lower <- c(CI_Lower, LB)
  CI_Upper <- c(CI_Upper, UB)
  repNum <- c(repNum,i) # get p-value from this bs sample
  print(cbind("Iteration:", i)) # comment this out to stop printing to log
}


rpt <- data.frame(repNum,bhat,stderr,pvals,CI_Lower,CI_Upper)

rm(LB,UB,se,est,i,sample1,sample2,bs,summ,bhat) # cleanup


k <- -500 # population parameter we are estimating


# report the simulation results
rpt$flag <- ifelse((rpt$CI_Lower <= k & rpt$CI_Upper >= k),1,0) # interval contains 'k'
rpt$sig <- ifelse(rpt$pvals < .05,1,0 ) # estimate is statistically different from 0
rpt$diffLB <- abs(k - rpt$CI_Lower) # difference between k and CI lower bound
rpt$diffUB <- abs(k - rpt$CI_Upper) # difference between k and CI upper bound
rpt$diffEst <- abs(k - rpt$bhat) # difference between point estimate and k
rpt$est_closer <- ifelse(rpt$diffEst < rpt$diffLB  & rpt$diffEst < rpt$diffUB,1,0) # flag when point estimate is closer to k
summary(rpt) # check


plot(rpt$bhat, pch=20, ylim=range(pretty(c(rpt$CI_Lower, rpt$CI_Upper))),
     xlab='', ylab='', las=1, cex.axis=0.8, cex=.5, xaxt='n')
segments(seq_len(nrow(rpt)), rpt$CI_Lower, y1= rpt$CI_Upper, lend=1)
segments(seq_len(nrow(rpt)), rpt$CI_Lower, y1= rpt$CI_Upper, lwd=1, lend=1)
abline(h = 0

# order by estimate
tmp <- rpt[order(rpt$bhat),]
plot(tmp$bhat, pch=20, ylim=range(pretty(c(tmp$CI_Lower, tmp$CI_Upper))),
     xlab='', ylab='', las=1, cex.axis=0.8, cex=.5, xaxt='n')
segments(seq_len(nrow(tmp)), tmp$CI_Lower, y1= tmp$CI_Upper, lend=1)
segments(seq_len(nrow(tmp)), tmp$CI_Lower, y1= tmp$CI_Upper, lwd=1, lend=1)
abline(h = 0)


# function from UVA
TypeMS <- function(A, s, alpha=.05, df=Inf, n.sims=10000){
  z <- qt(1-alpha/2, df)
  p.hi <- 1 - pt(z-A/s, df)
  p.lo <- pt(-z-A/s, df)
  power <- p.hi + p.lo
  typeS <- p.lo/power
  estimate <- A + s*rt(n.sims,df)
  significant <- abs(estimate) > s*z
  exaggeration <- mean(abs(estimate)[significant])/A
  return(list(power=power, typeS=typeS, exaggeration=exaggeration))
}

# A = true effect size measured in actual units of outcome (population)
# s = standard error of the estimate
# alpha = level of significance
# df = degrees of freedom, infinite if normal distribution


TypeMS(A = 500,500,.05)

# we can verify these results by looking at our simulated data

# flag sign and magnitude errors
rpt$sign <- ifelse(rpt$bhat > 0,1,0) # sign error
rpt$magnitude <- abs(rpt$bhat)/500

# look at significant results only
summary(rpt[rpt$sig ==1,])
	#*-----------------------------------------------------------------
	# \| PROGRAM NAME: CI and TypeMS Errors.r
	# \| DATE: 7/19/22
	# \| CREATED BY: MATT BOGARD
	# \| PROJECT FILE:
	# *----------------------------------------------------------------
	# \| PURPOSE: Explore the magnitude and sign of errors when we are underpowered
	# *----------------------------------------------------------------

	rm(list=ls()) # get rid of any existing data
	cat("\f") # clear console
	dev.off() # clear plots
	options("scipen" =100, "digits" = 4) # override R's tendency to use scientific notation


	library(MASS)


	# ----------------------------------------------
	#
	#
	# create super population
	#
	#
	# ----------------------------------------------



	#
	# treatent group
	#

	mu <- 3500 # mean 4000
	sd <- 10500 # standard deviation 3 x mean
	cv <- sd/mu # CV
	kappa <- 1/(cv^2) # shape parameter
	theta <- (sd*sd)/mu # this is theta or scale parameter


	# simulate costs based on parameter assumptions above
	set.seed(1234567)
	cost <- rgamma(100000, shape = kappa, scale = theta)

	# check descriptive results
	summary(cost)
	sd(cost)
	hist(cost, breaks = 50)

	# simulate ID and treatment indicator
	ID <- seq(1,100000)
	D <- rep(1,100000)

	trt <- data.frame(ID,cost,D) # combine metrics

	#
	# control group
	#

	mu <- 4000 # mean 6000
	sd <- 12000 # standard deviation 3 x mean
	cv <- sd/mu # CV
	kappa <- 1/(cv^2) # shape parameter
	theta <- (sd*sd)/mu # this is theta or scale parameter

	# simulate costs based on parameter assumptions above
	set.seed(1234567)
	cost <- rgamma(100000, shape = kappa, scale = theta)

	# check descriptive results
	summary(cost)
	sd(cost)
	hist(cost, breaks = 50)

	# simulate ID and treatment indicator
	ID <- seq(100001,200000)
	D <- rep(0,100000)

	ctrl <- data.frame(ID,cost,D) # combine metrics

	#
	# create population data frame
	#

	df <- rbind(trt,ctrl)

	# check descriptives
	summary(df)
	df%>%
	group_by(D) %>%
	summarize(y = mean(cost),
	n = n())
	hist(df$cost, breaks = 50)

	summary(lm(cost ~ D, data = df)) # regression on full model

	simdatA <- df # we will use this simulated data below

	#
	# what sample size do we need to measure this effect with 80% power and 95% confidence
	#

	library(pwr)

	# calculate effect size 'd'

	sigmaA = sd(simdatA$cost[simdatA$D ==1]) # standard deviation for group A
	sigmaB = sd(simdatA$cost[simdatA$D ==0]) # standard deviation for group B (for now assume common variance between groups)
	Q0 = .5 # proportion in the control group (.5 implies equal sized groups)
	sigma_pooled = sqrt(((sigmaA)^2(1/Q0 -1 ) + (sigmaB)^2(1/(1-Q0) -1 ) )/((1/Q0 -1 ) + (1/(1-Q0) -1 ))) # pooled standard deviation


	# how large of a sample for $500
	d = -500/sigma_pooled

	pwr.t.test(n = , d =d , sig.level = .05 , power = .8 ,type = c("two.sample"))

	# how much power do we have with varying sample sizes

	# pwr.t.test(n = 1000, d =d , sig.level = .05 , power = ,type = c("two.sample"))

	# n = 8000 80%
	# n = 7000 74%
	# n = 6000 68%
	# n = 5000 60%
	# n = 4000 50%
	# n = 3000 40%
	# n = 2000 30%
	# n = 1000 15%

	# -----------------------------------------
	# CI simulation for underpowered sample
	# -----------------------------------------

	Sn <- 1000 # 8000 fully powered sample


	# create treatment and control groups
	treat <- simdatA[simdatA$D == 1,]
	ctrl <- simdatA[simdatA$D == 0,]


	set.seed(9876) # make this repeatable 1234567
	pvals <-c()
	bhat <- c()
	stderr <- c()
	CI_Lower <- c()
	CI_Upper <- c()
	repNum <- c()
	bstrap <- c() # initialize vector for bootstrapped estimates
	for (i in 1:100) {
	sample1 <- treat[sample(nrow(treat), Sn, replace = T), ] # sample from treatment group
	sample2 <- ctrl[sample(nrow(ctrl), Sn, replace = T), ] # sample from control group
	bs <- rbind(sample1,sample2) # sample
	(summ <- summary(lm(cost ~ D, data = bs))) # our estimator
	se <- coef(summ)[4] # SE
	est <- coef(summ)[2] # est
	pval <- coef(summ)[8] # pvalue
	LB <- est - 1.96*se
	UB <- est + 1.96*se
	bhat <- c(bhat,est)
	stderr <- c(stderr,se)
	pvals <- c(pvals,pval)
	CI_Lower <- c(CI_Lower, LB)
	CI_Upper <- c(CI_Upper, UB)
	repNum <- c(repNum,i) # get p-value from this bs sample
	print(cbind("Iteration:", i)) # comment this out to stop printing to log
	}


	rpt <- data.frame(repNum,bhat,stderr,pvals,CI_Lower,CI_Upper)

	rm(LB,UB,se,est,i,sample1,sample2,bs,summ,bhat) # cleanup


	k <- -500 # population parameter we are estimating


	# report the simulation results
	rpt$flag <- ifelse((rpt$CI_Lower <= k & rpt$CI_Upper >= k),1,0) # interval contains 'k'
	rpt$sig <- ifelse(rpt$pvals < .05,1,0 ) # estimate is statistically different from 0
	rpt$diffLB <- abs(k - rpt$CI_Lower) # difference between k and CI lower bound
	rpt$diffUB <- abs(k - rpt$CI_Upper) # difference between k and CI upper bound
	rpt$diffEst <- abs(k - rpt$bhat) # difference between point estimate and k
	rpt$est_closer <- ifelse(rpt$diffEst < rpt$diffLB & rpt$diffEst < rpt$diffUB,1,0) # flag when point estimate is closer to k
	summary(rpt) # check



	plot(rpt$bhat, pch=20, ylim=range(pretty(c(rpt$CI_Lower, rpt$CI_Upper))),
	xlab='', ylab='', las=1, cex.axis=0.8, cex=.5, xaxt='n')
	segments(seq_len(nrow(rpt)), rpt$CI_Lower, y1= rpt$CI_Upper, lend=1)
	segments(seq_len(nrow(rpt)), rpt$CI_Lower, y1= rpt$CI_Upper, lwd=1, lend=1)
	abline(h = 0

	# order by estimate
	tmp <- rpt[order(rpt$bhat),]
	plot(tmp$bhat, pch=20, ylim=range(pretty(c(tmp$CI_Lower, tmp$CI_Upper))),
	xlab='', ylab='', las=1, cex.axis=0.8, cex=.5, xaxt='n')
	segments(seq_len(nrow(tmp)), tmp$CI_Lower, y1= tmp$CI_Upper, lend=1)
	segments(seq_len(nrow(tmp)), tmp$CI_Lower, y1= tmp$CI_Upper, lwd=1, lend=1)
	abline(h = 0)


	# function from UVA
	TypeMS <- function(A, s, alpha=.05, df=Inf, n.sims=10000){
	z <- qt(1-alpha/2, df)
	p.hi <- 1 - pt(z-A/s, df)
	p.lo <- pt(-z-A/s, df)
	power <- p.hi + p.lo
	typeS <- p.lo/power
	estimate <- A + s*rt(n.sims,df)
	significant <- abs(estimate) > s*z
	exaggeration <- mean(abs(estimate)[significant])/A
	return(list(power=power, typeS=typeS, exaggeration=exaggeration))
	}

	# A = true effect size measured in actual units of outcome (population)
	# s = standard error of the estimate
	# alpha = level of significance
	# df = degrees of freedom, infinite if normal distribution


	TypeMS(A = 500,500,.05)

	# we can verify these results by looking at our simulated data

	# flag sign and magnitude errors
	rpt$sign <- ifelse(rpt$bhat > 0,1,0) # sign error
	rpt$magnitude <- abs(rpt$bhat)/500

	# look at significant results only
	summary(rpt[rpt$sig ==1,])