Skip to content

Instantly share code, notes, and snippets.

@BioSciEconomist
Created February 3, 2024 18:35
Show Gist options
  • Save BioSciEconomist/e60aacc6b61e5ef7bafabf5ab484d83b to your computer and use it in GitHub Desktop.
Save BioSciEconomist/e60aacc6b61e5ef7bafabf5ab484d83b to your computer and use it in GitHub Desktop.
power analysis for logistic regression
# *-----------------------------------------------------------------
# | PROGRAM NAME: power simulation logistic.R
# | DATE: 2/3/24
# | CREATED BY: MATT BOGARD
# | PROJECT FILE: R Training
# *----------------------------------------------------------------
# | PURPOSE: power calculations and simulations
# *----------------------------------------------------------------
rm(list=ls()) # get rid of any existing data
cat("\f") # clear console
dev.off() # clear plots
options("scipen" =100, "digits" = 4) # override R's tendency to use scientific notation
library(dplyr)
library(Hmisc)
#----------------------------------------------------
# simulate data for logistic regression
#---------------------------------------------------
# based on: https://library.virginia.edu/data/articles/simulating-a-logistic-regression-model
# simulate population
# use this to tweak simulation parameters to get effect sizes and baseline probabilities
b0 <- -9 # intercept
b1 <- .25 # effect for x1
b2 <- .2 # effect for x2
xval1 <- 1 # baseline value for binary predictor
xval2 <- 30 # average value for continuous predictor
xb <- (b0 + b1*(xval1) +.2*xval2)
(1/(1 + exp(-xb))) # baseline probability
# simulation
set.seed(1)
gender <- sample(c(0,1), size = 1000000, replace = TRUE)
age <- round(rnorm(1000000, 30, 3))
xb <- -9 + .25*gender + 0.2*age
p <- 1/(1 + exp(-xb))
y <- rbinom(n = 1000000, size = 1, prob = p)
dat <- data.frame(y,gender,age)
mod <- glm(y ~ gender + age, family = "binomial", data = dat)
summary(mod)
exp(.25) # OR for gender 1.28
exp(.20) # OR for age 1.22
summary(dat)
#-----------------------------------------
# power calculation using 'pwrss' package
#-----------------------------------------
# https://cran.r-project.org/web/packages/pwrss/vignettes/examples.html#4_Logistic_Regression_(Wald’s_z_Test)
# see also: https://pwrss.shinyapps.io/index/
# Bulus, M. (2023). pwrss: Statistical Power and Sample Size Calculation Tools. R package version 0.3.1. https://CRAN.R-project.org/package=pwrss
# install.packages("pwrss")
library(pwrss)
#
# power calc and simulation for gender
#
summary(dat)
# p0 = baseline probability
# odds.ratio = effect we are trying to detect. Cohen (1988) 1.44 (very small) 2.88 (small) 4.27 (large)
# Cohen, J. (1988). Statistical power analysis for the behavioral sciences (2nd Ed). Routledge.
# r2.0ther captures correlation structure between other variables in model
# Cohen (1988) .02 (very small), .13 (small), .26 (large)
# dist = distribution of the variable 'x' for the effect you are estimating
#
# conservative settings - detecting OR for age
pwrss.z.logreg(p0 = .06, odds.ratio = 1.28, r2.other.x = .25,
power = 0.80, alpha = 0.05,
dist = "binomial")$n
# indicates we need around 10,000 observations
set.seed(9876) # make this repeatable 1234567
Sn = 10000
pval <- c()
repNum <- c()
for (i in 1:500) {
sample <- dat[sample(nrow(dat), Sn, replace = T), ] # sample from treatment group
summ <- summary(glm(y ~ gender + age, family = "binomial", data = sample))
p <- coef(summ)[11] # get pval for effect you are interested in
pval<- c(pval, p)
repNum <- c(repNum,i)
print(cbind("Iteration:", i)) # comment this out to stop printing to log
}
rpt <- data.frame(repNum,pval)
rm(i,summ,p,pval,repNum) # cleanup
sum(rpt$pval<.05)/nrow(rpt)
#
# power calc and simulation for age
#
summary(dat)
# p0 = baseline probability
# odds.ratio = effect we are trying to detect. Cohen (1988) 1.44 (very small) 2.88 (small) 4.27 (large)
# Cohen, J. (1988). Statistical power analysis for the behavioral sciences (2nd Ed). Routledge.
# r2.0ther captures correlation structure between other variables in model
# Cohen (1988) .02 (very small), .13 (small), .26 (large)
# dist = distribution of the variable 'x' for the effect you are estimating
#
# conservative settings - detecting OR for age
pwrss.z.logreg(p0 = .06, odds.ratio = 1.22, r2.other.x = .25,
power = 0.80, alpha = 0.05,
dist = "normal")$n
# indicates we need around 5000 observations
set.seed(9876) # make this repeatable 1234567
Sn = 1000
pval <- c()
repNum <- c()
for (i in 1:500) {
sample <- dat[sample(nrow(dat), Sn, replace = T), ] # sample from treatment group
summ <- summary(glm(y ~ gender + age, family = "binomial", data = sample))
p <- coef(summ)[12] # get pval for effect you are interested in
pval<- c(pval, p)
repNum <- c(repNum,i)
print(cbind("Iteration:", i)) # comment this out to stop printing to log
}
rpt <- data.frame(repNum,pval)
rm(i,summ,p,pval,repNum) # cleanup
sum(rpt$pval<.05)/nrow(rpt)
# the power calculation seemed very conservative compared to the simulation
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment