Skip to content

Instantly share code, notes, and snippets.

@BioSciEconomist
Created July 19, 2022 20:36
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save BioSciEconomist/7ccf2f40cb5140dd6d2977a4fcc10e3b to your computer and use it in GitHub Desktop.
Save BioSciEconomist/7ccf2f40cb5140dd6d2977a4fcc10e3b to your computer and use it in GitHub Desktop.
Explore the magnitude and sign of errors when we are underpowered
#*-----------------------------------------------------------------
# | PROGRAM NAME: CI and TypeMS Errors.r
# | DATE: 7/19/22
# | CREATED BY: MATT BOGARD
# | PROJECT FILE:
# *----------------------------------------------------------------
# | PURPOSE: Explore the magnitude and sign of errors when we are underpowered
# *----------------------------------------------------------------
rm(list=ls()) # get rid of any existing data
cat("\f") # clear console
dev.off() # clear plots
options("scipen" =100, "digits" = 4) # override R's tendency to use scientific notation
library(MASS)
# ----------------------------------------------
#
#
# create super population
#
#
# ----------------------------------------------
#
# treatent group
#
mu <- 3500 # mean 4000
sd <- 10500 # standard deviation 3 x mean
cv <- sd/mu # CV
kappa <- 1/(cv^2) # shape parameter
theta <- (sd*sd)/mu # this is theta or scale parameter
# simulate costs based on parameter assumptions above
set.seed(1234567)
cost <- rgamma(100000, shape = kappa, scale = theta)
# check descriptive results
summary(cost)
sd(cost)
hist(cost, breaks = 50)
# simulate ID and treatment indicator
ID <- seq(1,100000)
D <- rep(1,100000)
trt <- data.frame(ID,cost,D) # combine metrics
#
# control group
#
mu <- 4000 # mean 6000
sd <- 12000 # standard deviation 3 x mean
cv <- sd/mu # CV
kappa <- 1/(cv^2) # shape parameter
theta <- (sd*sd)/mu # this is theta or scale parameter
# simulate costs based on parameter assumptions above
set.seed(1234567)
cost <- rgamma(100000, shape = kappa, scale = theta)
# check descriptive results
summary(cost)
sd(cost)
hist(cost, breaks = 50)
# simulate ID and treatment indicator
ID <- seq(100001,200000)
D <- rep(0,100000)
ctrl <- data.frame(ID,cost,D) # combine metrics
#
# create population data frame
#
df <- rbind(trt,ctrl)
# check descriptives
summary(df)
df%>%
group_by(D) %>%
summarize(y = mean(cost),
n = n())
hist(df$cost, breaks = 50)
summary(lm(cost ~ D, data = df)) # regression on full model
simdatA <- df # we will use this simulated data below
#
# what sample size do we need to measure this effect with 80% power and 95% confidence
#
library(pwr)
# calculate effect size 'd'
sigmaA = sd(simdatA$cost[simdatA$D ==1]) # standard deviation for group A
sigmaB = sd(simdatA$cost[simdatA$D ==0]) # standard deviation for group B (for now assume common variance between groups)
Q0 = .5 # proportion in the control group (.5 implies equal sized groups)
sigma_pooled = sqrt(((sigmaA)^2*(1/Q0 -1 ) + (sigmaB)^2*(1/(1-Q0) -1 ) )/((1/Q0 -1 ) + (1/(1-Q0) -1 ))) # pooled standard deviation
# how large of a sample for $500
d = -500/sigma_pooled
pwr.t.test(n = , d =d , sig.level = .05 , power = .8 ,type = c("two.sample"))
# how much power do we have with varying sample sizes
# pwr.t.test(n = 1000, d =d , sig.level = .05 , power = ,type = c("two.sample"))
# n = 8000 80%
# n = 7000 74%
# n = 6000 68%
# n = 5000 60%
# n = 4000 50%
# n = 3000 40%
# n = 2000 30%
# n = 1000 15%
# -----------------------------------------
# CI simulation for underpowered sample
# -----------------------------------------
Sn <- 1000 # 8000 fully powered sample
# create treatment and control groups
treat <- simdatA[simdatA$D == 1,]
ctrl <- simdatA[simdatA$D == 0,]
set.seed(9876) # make this repeatable 1234567
pvals <-c()
bhat <- c()
stderr <- c()
CI_Lower <- c()
CI_Upper <- c()
repNum <- c()
bstrap <- c() # initialize vector for bootstrapped estimates
for (i in 1:100) {
sample1 <- treat[sample(nrow(treat), Sn, replace = T), ] # sample from treatment group
sample2 <- ctrl[sample(nrow(ctrl), Sn, replace = T), ] # sample from control group
bs <- rbind(sample1,sample2) # sample
(summ <- summary(lm(cost ~ D, data = bs))) # our estimator
se <- coef(summ)[4] # SE
est <- coef(summ)[2] # est
pval <- coef(summ)[8] # pvalue
LB <- est - 1.96*se
UB <- est + 1.96*se
bhat <- c(bhat,est)
stderr <- c(stderr,se)
pvals <- c(pvals,pval)
CI_Lower <- c(CI_Lower, LB)
CI_Upper <- c(CI_Upper, UB)
repNum <- c(repNum,i) # get p-value from this bs sample
print(cbind("Iteration:", i)) # comment this out to stop printing to log
}
rpt <- data.frame(repNum,bhat,stderr,pvals,CI_Lower,CI_Upper)
rm(LB,UB,se,est,i,sample1,sample2,bs,summ,bhat) # cleanup
k <- -500 # population parameter we are estimating
# report the simulation results
rpt$flag <- ifelse((rpt$CI_Lower <= k & rpt$CI_Upper >= k),1,0) # interval contains 'k'
rpt$sig <- ifelse(rpt$pvals < .05,1,0 ) # estimate is statistically different from 0
rpt$diffLB <- abs(k - rpt$CI_Lower) # difference between k and CI lower bound
rpt$diffUB <- abs(k - rpt$CI_Upper) # difference between k and CI upper bound
rpt$diffEst <- abs(k - rpt$bhat) # difference between point estimate and k
rpt$est_closer <- ifelse(rpt$diffEst < rpt$diffLB & rpt$diffEst < rpt$diffUB,1,0) # flag when point estimate is closer to k
summary(rpt) # check
plot(rpt$bhat, pch=20, ylim=range(pretty(c(rpt$CI_Lower, rpt$CI_Upper))),
xlab='', ylab='', las=1, cex.axis=0.8, cex=.5, xaxt='n')
segments(seq_len(nrow(rpt)), rpt$CI_Lower, y1= rpt$CI_Upper, lend=1)
segments(seq_len(nrow(rpt)), rpt$CI_Lower, y1= rpt$CI_Upper, lwd=1, lend=1)
abline(h = 0
# order by estimate
tmp <- rpt[order(rpt$bhat),]
plot(tmp$bhat, pch=20, ylim=range(pretty(c(tmp$CI_Lower, tmp$CI_Upper))),
xlab='', ylab='', las=1, cex.axis=0.8, cex=.5, xaxt='n')
segments(seq_len(nrow(tmp)), tmp$CI_Lower, y1= tmp$CI_Upper, lend=1)
segments(seq_len(nrow(tmp)), tmp$CI_Lower, y1= tmp$CI_Upper, lwd=1, lend=1)
abline(h = 0)
# function from UVA
TypeMS <- function(A, s, alpha=.05, df=Inf, n.sims=10000){
z <- qt(1-alpha/2, df)
p.hi <- 1 - pt(z-A/s, df)
p.lo <- pt(-z-A/s, df)
power <- p.hi + p.lo
typeS <- p.lo/power
estimate <- A + s*rt(n.sims,df)
significant <- abs(estimate) > s*z
exaggeration <- mean(abs(estimate)[significant])/A
return(list(power=power, typeS=typeS, exaggeration=exaggeration))
}
# A = true effect size measured in actual units of outcome (population)
# s = standard error of the estimate
# alpha = level of significance
# df = degrees of freedom, infinite if normal distribution
TypeMS(A = 500,500,.05)
# we can verify these results by looking at our simulated data
# flag sign and magnitude errors
rpt$sign <- ifelse(rpt$bhat > 0,1,0) # sign error
rpt$magnitude <- abs(rpt$bhat)/500
# look at significant results only
summary(rpt[rpt$sig ==1,])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment