Created
February 3, 2024 18:35
-
-
Save BioSciEconomist/e60aacc6b61e5ef7bafabf5ab484d83b to your computer and use it in GitHub Desktop.
power analysis for logistic regression
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# *----------------------------------------------------------------- | |
# | PROGRAM NAME: power simulation logistic.R | |
# | DATE: 2/3/24 | |
# | CREATED BY: MATT BOGARD | |
# | PROJECT FILE: R Training | |
# *---------------------------------------------------------------- | |
# | PURPOSE: power calculations and simulations | |
# *---------------------------------------------------------------- | |
rm(list=ls()) # get rid of any existing data | |
cat("\f") # clear console | |
dev.off() # clear plots | |
options("scipen" =100, "digits" = 4) # override R's tendency to use scientific notation | |
library(dplyr) | |
library(Hmisc) | |
#---------------------------------------------------- | |
# simulate data for logistic regression | |
#--------------------------------------------------- | |
# based on: https://library.virginia.edu/data/articles/simulating-a-logistic-regression-model | |
# simulate population | |
# use this to tweak simulation parameters to get effect sizes and baseline probabilities | |
b0 <- -9 # intercept | |
b1 <- .25 # effect for x1 | |
b2 <- .2 # effect for x2 | |
xval1 <- 1 # baseline value for binary predictor | |
xval2 <- 30 # average value for continuous predictor | |
xb <- (b0 + b1*(xval1) +.2*xval2) | |
(1/(1 + exp(-xb))) # baseline probability | |
# simulation | |
set.seed(1) | |
gender <- sample(c(0,1), size = 1000000, replace = TRUE) | |
age <- round(rnorm(1000000, 30, 3)) | |
xb <- -9 + .25*gender + 0.2*age | |
p <- 1/(1 + exp(-xb)) | |
y <- rbinom(n = 1000000, size = 1, prob = p) | |
dat <- data.frame(y,gender,age) | |
mod <- glm(y ~ gender + age, family = "binomial", data = dat) | |
summary(mod) | |
exp(.25) # OR for gender 1.28 | |
exp(.20) # OR for age 1.22 | |
summary(dat) | |
#----------------------------------------- | |
# power calculation using 'pwrss' package | |
#----------------------------------------- | |
# https://cran.r-project.org/web/packages/pwrss/vignettes/examples.html#4_Logistic_Regression_(Wald’s_z_Test) | |
# see also: https://pwrss.shinyapps.io/index/ | |
# Bulus, M. (2023). pwrss: Statistical Power and Sample Size Calculation Tools. R package version 0.3.1. https://CRAN.R-project.org/package=pwrss | |
# install.packages("pwrss") | |
library(pwrss) | |
# | |
# power calc and simulation for gender | |
# | |
summary(dat) | |
# p0 = baseline probability | |
# odds.ratio = effect we are trying to detect. Cohen (1988) 1.44 (very small) 2.88 (small) 4.27 (large) | |
# Cohen, J. (1988). Statistical power analysis for the behavioral sciences (2nd Ed). Routledge. | |
# r2.0ther captures correlation structure between other variables in model | |
# Cohen (1988) .02 (very small), .13 (small), .26 (large) | |
# dist = distribution of the variable 'x' for the effect you are estimating | |
# | |
# conservative settings - detecting OR for age | |
pwrss.z.logreg(p0 = .06, odds.ratio = 1.28, r2.other.x = .25, | |
power = 0.80, alpha = 0.05, | |
dist = "binomial")$n | |
# indicates we need around 10,000 observations | |
set.seed(9876) # make this repeatable 1234567 | |
Sn = 10000 | |
pval <- c() | |
repNum <- c() | |
for (i in 1:500) { | |
sample <- dat[sample(nrow(dat), Sn, replace = T), ] # sample from treatment group | |
summ <- summary(glm(y ~ gender + age, family = "binomial", data = sample)) | |
p <- coef(summ)[11] # get pval for effect you are interested in | |
pval<- c(pval, p) | |
repNum <- c(repNum,i) | |
print(cbind("Iteration:", i)) # comment this out to stop printing to log | |
} | |
rpt <- data.frame(repNum,pval) | |
rm(i,summ,p,pval,repNum) # cleanup | |
sum(rpt$pval<.05)/nrow(rpt) | |
# | |
# power calc and simulation for age | |
# | |
summary(dat) | |
# p0 = baseline probability | |
# odds.ratio = effect we are trying to detect. Cohen (1988) 1.44 (very small) 2.88 (small) 4.27 (large) | |
# Cohen, J. (1988). Statistical power analysis for the behavioral sciences (2nd Ed). Routledge. | |
# r2.0ther captures correlation structure between other variables in model | |
# Cohen (1988) .02 (very small), .13 (small), .26 (large) | |
# dist = distribution of the variable 'x' for the effect you are estimating | |
# | |
# conservative settings - detecting OR for age | |
pwrss.z.logreg(p0 = .06, odds.ratio = 1.22, r2.other.x = .25, | |
power = 0.80, alpha = 0.05, | |
dist = "normal")$n | |
# indicates we need around 5000 observations | |
set.seed(9876) # make this repeatable 1234567 | |
Sn = 1000 | |
pval <- c() | |
repNum <- c() | |
for (i in 1:500) { | |
sample <- dat[sample(nrow(dat), Sn, replace = T), ] # sample from treatment group | |
summ <- summary(glm(y ~ gender + age, family = "binomial", data = sample)) | |
p <- coef(summ)[12] # get pval for effect you are interested in | |
pval<- c(pval, p) | |
repNum <- c(repNum,i) | |
print(cbind("Iteration:", i)) # comment this out to stop printing to log | |
} | |
rpt <- data.frame(repNum,pval) | |
rm(i,summ,p,pval,repNum) # cleanup | |
sum(rpt$pval<.05)/nrow(rpt) | |
# the power calculation seemed very conservative compared to the simulation |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment