Created
July 19, 2022 20:36
-
-
Save BioSciEconomist/7ccf2f40cb5140dd6d2977a4fcc10e3b to your computer and use it in GitHub Desktop.
Explore the magnitude and sign of errors when we are underpowered
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#*----------------------------------------------------------------- | |
# | PROGRAM NAME: CI and TypeMS Errors.r | |
# | DATE: 7/19/22 | |
# | CREATED BY: MATT BOGARD | |
# | PROJECT FILE: | |
# *---------------------------------------------------------------- | |
# | PURPOSE: Explore the magnitude and sign of errors when we are underpowered | |
# *---------------------------------------------------------------- | |
rm(list=ls()) # get rid of any existing data | |
cat("\f") # clear console | |
dev.off() # clear plots | |
options("scipen" =100, "digits" = 4) # override R's tendency to use scientific notation | |
library(MASS) | |
# ---------------------------------------------- | |
# | |
# | |
# create super population | |
# | |
# | |
# ---------------------------------------------- | |
# | |
# treatent group | |
# | |
mu <- 3500 # mean 4000 | |
sd <- 10500 # standard deviation 3 x mean | |
cv <- sd/mu # CV | |
kappa <- 1/(cv^2) # shape parameter | |
theta <- (sd*sd)/mu # this is theta or scale parameter | |
# simulate costs based on parameter assumptions above | |
set.seed(1234567) | |
cost <- rgamma(100000, shape = kappa, scale = theta) | |
# check descriptive results | |
summary(cost) | |
sd(cost) | |
hist(cost, breaks = 50) | |
# simulate ID and treatment indicator | |
ID <- seq(1,100000) | |
D <- rep(1,100000) | |
trt <- data.frame(ID,cost,D) # combine metrics | |
# | |
# control group | |
# | |
mu <- 4000 # mean 6000 | |
sd <- 12000 # standard deviation 3 x mean | |
cv <- sd/mu # CV | |
kappa <- 1/(cv^2) # shape parameter | |
theta <- (sd*sd)/mu # this is theta or scale parameter | |
# simulate costs based on parameter assumptions above | |
set.seed(1234567) | |
cost <- rgamma(100000, shape = kappa, scale = theta) | |
# check descriptive results | |
summary(cost) | |
sd(cost) | |
hist(cost, breaks = 50) | |
# simulate ID and treatment indicator | |
ID <- seq(100001,200000) | |
D <- rep(0,100000) | |
ctrl <- data.frame(ID,cost,D) # combine metrics | |
# | |
# create population data frame | |
# | |
df <- rbind(trt,ctrl) | |
# check descriptives | |
summary(df) | |
df%>% | |
group_by(D) %>% | |
summarize(y = mean(cost), | |
n = n()) | |
hist(df$cost, breaks = 50) | |
summary(lm(cost ~ D, data = df)) # regression on full model | |
simdatA <- df # we will use this simulated data below | |
# | |
# what sample size do we need to measure this effect with 80% power and 95% confidence | |
# | |
library(pwr) | |
# calculate effect size 'd' | |
sigmaA = sd(simdatA$cost[simdatA$D ==1]) # standard deviation for group A | |
sigmaB = sd(simdatA$cost[simdatA$D ==0]) # standard deviation for group B (for now assume common variance between groups) | |
Q0 = .5 # proportion in the control group (.5 implies equal sized groups) | |
sigma_pooled = sqrt(((sigmaA)^2*(1/Q0 -1 ) + (sigmaB)^2*(1/(1-Q0) -1 ) )/((1/Q0 -1 ) + (1/(1-Q0) -1 ))) # pooled standard deviation | |
# how large of a sample for $500 | |
d = -500/sigma_pooled | |
pwr.t.test(n = , d =d , sig.level = .05 , power = .8 ,type = c("two.sample")) | |
# how much power do we have with varying sample sizes | |
# pwr.t.test(n = 1000, d =d , sig.level = .05 , power = ,type = c("two.sample")) | |
# n = 8000 80% | |
# n = 7000 74% | |
# n = 6000 68% | |
# n = 5000 60% | |
# n = 4000 50% | |
# n = 3000 40% | |
# n = 2000 30% | |
# n = 1000 15% | |
# ----------------------------------------- | |
# CI simulation for underpowered sample | |
# ----------------------------------------- | |
Sn <- 1000 # 8000 fully powered sample | |
# create treatment and control groups | |
treat <- simdatA[simdatA$D == 1,] | |
ctrl <- simdatA[simdatA$D == 0,] | |
set.seed(9876) # make this repeatable 1234567 | |
pvals <-c() | |
bhat <- c() | |
stderr <- c() | |
CI_Lower <- c() | |
CI_Upper <- c() | |
repNum <- c() | |
bstrap <- c() # initialize vector for bootstrapped estimates | |
for (i in 1:100) { | |
sample1 <- treat[sample(nrow(treat), Sn, replace = T), ] # sample from treatment group | |
sample2 <- ctrl[sample(nrow(ctrl), Sn, replace = T), ] # sample from control group | |
bs <- rbind(sample1,sample2) # sample | |
(summ <- summary(lm(cost ~ D, data = bs))) # our estimator | |
se <- coef(summ)[4] # SE | |
est <- coef(summ)[2] # est | |
pval <- coef(summ)[8] # pvalue | |
LB <- est - 1.96*se | |
UB <- est + 1.96*se | |
bhat <- c(bhat,est) | |
stderr <- c(stderr,se) | |
pvals <- c(pvals,pval) | |
CI_Lower <- c(CI_Lower, LB) | |
CI_Upper <- c(CI_Upper, UB) | |
repNum <- c(repNum,i) # get p-value from this bs sample | |
print(cbind("Iteration:", i)) # comment this out to stop printing to log | |
} | |
rpt <- data.frame(repNum,bhat,stderr,pvals,CI_Lower,CI_Upper) | |
rm(LB,UB,se,est,i,sample1,sample2,bs,summ,bhat) # cleanup | |
k <- -500 # population parameter we are estimating | |
# report the simulation results | |
rpt$flag <- ifelse((rpt$CI_Lower <= k & rpt$CI_Upper >= k),1,0) # interval contains 'k' | |
rpt$sig <- ifelse(rpt$pvals < .05,1,0 ) # estimate is statistically different from 0 | |
rpt$diffLB <- abs(k - rpt$CI_Lower) # difference between k and CI lower bound | |
rpt$diffUB <- abs(k - rpt$CI_Upper) # difference between k and CI upper bound | |
rpt$diffEst <- abs(k - rpt$bhat) # difference between point estimate and k | |
rpt$est_closer <- ifelse(rpt$diffEst < rpt$diffLB & rpt$diffEst < rpt$diffUB,1,0) # flag when point estimate is closer to k | |
summary(rpt) # check | |
plot(rpt$bhat, pch=20, ylim=range(pretty(c(rpt$CI_Lower, rpt$CI_Upper))), | |
xlab='', ylab='', las=1, cex.axis=0.8, cex=.5, xaxt='n') | |
segments(seq_len(nrow(rpt)), rpt$CI_Lower, y1= rpt$CI_Upper, lend=1) | |
segments(seq_len(nrow(rpt)), rpt$CI_Lower, y1= rpt$CI_Upper, lwd=1, lend=1) | |
abline(h = 0 | |
# order by estimate | |
tmp <- rpt[order(rpt$bhat),] | |
plot(tmp$bhat, pch=20, ylim=range(pretty(c(tmp$CI_Lower, tmp$CI_Upper))), | |
xlab='', ylab='', las=1, cex.axis=0.8, cex=.5, xaxt='n') | |
segments(seq_len(nrow(tmp)), tmp$CI_Lower, y1= tmp$CI_Upper, lend=1) | |
segments(seq_len(nrow(tmp)), tmp$CI_Lower, y1= tmp$CI_Upper, lwd=1, lend=1) | |
abline(h = 0) | |
# function from UVA | |
TypeMS <- function(A, s, alpha=.05, df=Inf, n.sims=10000){ | |
z <- qt(1-alpha/2, df) | |
p.hi <- 1 - pt(z-A/s, df) | |
p.lo <- pt(-z-A/s, df) | |
power <- p.hi + p.lo | |
typeS <- p.lo/power | |
estimate <- A + s*rt(n.sims,df) | |
significant <- abs(estimate) > s*z | |
exaggeration <- mean(abs(estimate)[significant])/A | |
return(list(power=power, typeS=typeS, exaggeration=exaggeration)) | |
} | |
# A = true effect size measured in actual units of outcome (population) | |
# s = standard error of the estimate | |
# alpha = level of significance | |
# df = degrees of freedom, infinite if normal distribution | |
TypeMS(A = 500,500,.05) | |
# we can verify these results by looking at our simulated data | |
# flag sign and magnitude errors | |
rpt$sign <- ifelse(rpt$bhat > 0,1,0) # sign error | |
rpt$magnitude <- abs(rpt$bhat)/500 | |
# look at significant results only | |
summary(rpt[rpt$sig ==1,]) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment