BioSciEconomist/power simulation logistic.R

## power simulation logistic.R
# *-----------------------------------------------------------------
# | PROGRAM NAME: power simulation logistic.R
# | DATE: 2/3/24
# | CREATED BY: MATT BOGARD
# | PROJECT FILE: R Training
# *----------------------------------------------------------------
# | PURPOSE: power calculations and simulations
# *----------------------------------------------------------------


rm(list=ls()) # get rid of any existing data
cat("\f") # clear console
dev.off() # clear plots
options("scipen" =100, "digits" = 4) # override R's tendency to use scientific notation

library(dplyr)
library(Hmisc)


#----------------------------------------------------
# simulate data for logistic regression
#---------------------------------------------------

# based on:  https://library.virginia.edu/data/articles/simulating-a-logistic-regression-model

# simulate population

# use this to tweak simulation parameters to get effect sizes and baseline probabilities
b0 <- -9  # intercept
b1 <- .25 # effect for x1
b2 <- .2  # effect for x2
xval1 <- 1 # baseline value for binary predictor
xval2 <- 30 # average value for continuous predictor
xb <- (b0 + b1*(xval1) +.2*xval2)
(1/(1 + exp(-xb))) # baseline probability

# simulation
set.seed(1)
gender <- sample(c(0,1), size = 1000000, replace = TRUE)
age <- round(rnorm(1000000, 30, 3))
xb <- -9 + .25*gender + 0.2*age
p <- 1/(1 + exp(-xb))
y <- rbinom(n = 1000000, size = 1, prob = p)
dat <- data.frame(y,gender,age)
mod <- glm(y ~ gender + age, family = "binomial", data = dat)
summary(mod)

exp(.25) # OR for gender 1.28
exp(.20) # OR for age 1.22


summary(dat)

#-----------------------------------------
# power calculation using 'pwrss' package
#-----------------------------------------

# https://cran.r-project.org/web/packages/pwrss/vignettes/examples.html#4_Logistic_Regression_(Wald’s_z_Test)

# see also: https://pwrss.shinyapps.io/index/

# Bulus, M. (2023). pwrss: Statistical Power and Sample Size Calculation Tools. R package version 0.3.1. https://CRAN.R-project.org/package=pwrss

# install.packages("pwrss")
library(pwrss)


#
# power calc and simulation for gender
#

summary(dat)

# p0 = baseline probability
# odds.ratio = effect we are trying to detect. Cohen (1988) 1.44 (very small) 2.88 (small) 4.27 (large)
# Cohen, J. (1988). Statistical power analysis for the behavioral sciences (2nd Ed). Routledge.
# r2.0ther captures correlation structure between other variables in model
# Cohen (1988) .02 (very small), .13 (small), .26 (large)
# dist = distribution of the variable 'x' for the effect you are estimating
#
# conservative settings - detecting OR for age
pwrss.z.logreg(p0 = .06, odds.ratio = 1.28, r2.other.x = .25,
               power = 0.80, alpha = 0.05,
               dist = "binomial")$n

# indicates we need around 10,000 observations


set.seed(9876) # make this repeatable 1234567
Sn = 10000
pval <- c()
repNum <- c()
for (i in 1:500) {
  sample <- dat[sample(nrow(dat), Sn, replace = T), ] # sample from treatment group
  summ <- summary(glm(y ~ gender + age, family = "binomial", data = sample))
  p <- coef(summ)[11] # get pval for effect you are interested in
  pval<- c(pval, p)
  repNum <- c(repNum,i)
  print(cbind("Iteration:", i)) # comment this out to stop printing to log
}

rpt <- data.frame(repNum,pval)

rm(i,summ,p,pval,repNum) # cleanup

sum(rpt$pval<.05)/nrow(rpt)


#
# power calc and simulation for age
#

summary(dat)

# p0 = baseline probability
# odds.ratio = effect we are trying to detect. Cohen (1988) 1.44 (very small) 2.88 (small) 4.27 (large)
# Cohen, J. (1988). Statistical power analysis for the behavioral sciences (2nd Ed). Routledge.
# r2.0ther captures correlation structure between other variables in model
# Cohen (1988) .02 (very small), .13 (small), .26 (large)
# dist = distribution of the variable 'x' for the effect you are estimating
#
# conservative settings - detecting OR for age
pwrss.z.logreg(p0 = .06, odds.ratio = 1.22, r2.other.x = .25,
               power = 0.80, alpha = 0.05,
               dist = "normal")$n

# indicates we need around 5000 observations


set.seed(9876) # make this repeatable 1234567
Sn = 1000
pval <- c()
repNum <- c()
for (i in 1:500) {
  sample <- dat[sample(nrow(dat), Sn, replace = T), ] # sample from treatment group
  summ <- summary(glm(y ~ gender + age, family = "binomial", data = sample))
  p <- coef(summ)[12] # get pval for effect you are interested in
  pval<- c(pval, p)
  repNum <- c(repNum,i)
  print(cbind("Iteration:", i)) # comment this out to stop printing to log
}

rpt <- data.frame(repNum,pval)

rm(i,summ,p,pval,repNum) # cleanup

sum(rpt$pval<.05)/nrow(rpt)


# the power calculation seemed very conservative compared to the simulation
	# *-----------------------------------------------------------------
	# \| PROGRAM NAME: power simulation logistic.R
	# \| DATE: 2/3/24
	# \| CREATED BY: MATT BOGARD
	# \| PROJECT FILE: R Training
	# *----------------------------------------------------------------
	# \| PURPOSE: power calculations and simulations
	# *----------------------------------------------------------------


	rm(list=ls()) # get rid of any existing data
	cat("\f") # clear console
	dev.off() # clear plots
	options("scipen" =100, "digits" = 4) # override R's tendency to use scientific notation

	library(dplyr)
	library(Hmisc)


	#----------------------------------------------------
	# simulate data for logistic regression
	#---------------------------------------------------

	# based on: https://library.virginia.edu/data/articles/simulating-a-logistic-regression-model

	# simulate population

	# use this to tweak simulation parameters to get effect sizes and baseline probabilities
	b0 <- -9 # intercept
	b1 <- .25 # effect for x1
	b2 <- .2 # effect for x2
	xval1 <- 1 # baseline value for binary predictor
	xval2 <- 30 # average value for continuous predictor
	xb <- (b0 + b1(xval1) +.2xval2)
	(1/(1 + exp(-xb))) # baseline probability

	# simulation
	set.seed(1)
	gender <- sample(c(0,1), size = 1000000, replace = TRUE)
	age <- round(rnorm(1000000, 30, 3))
	xb <- -9 + .25gender + 0.2age
	p <- 1/(1 + exp(-xb))
	y <- rbinom(n = 1000000, size = 1, prob = p)
	dat <- data.frame(y,gender,age)
	mod <- glm(y ~ gender + age, family = "binomial", data = dat)
	summary(mod)

	exp(.25) # OR for gender 1.28
	exp(.20) # OR for age 1.22


	summary(dat)

	#-----------------------------------------
	# power calculation using 'pwrss' package
	#-----------------------------------------

	# https://cran.r-project.org/web/packages/pwrss/vignettes/examples.html#4_Logistic_Regression_(Wald’s_z_Test)

	# see also: https://pwrss.shinyapps.io/index/

	# Bulus, M. (2023). pwrss: Statistical Power and Sample Size Calculation Tools. R package version 0.3.1. https://CRAN.R-project.org/package=pwrss

	# install.packages("pwrss")
	library(pwrss)


	#
	# power calc and simulation for gender
	#

	summary(dat)

	# p0 = baseline probability
	# odds.ratio = effect we are trying to detect. Cohen (1988) 1.44 (very small) 2.88 (small) 4.27 (large)
	# Cohen, J. (1988). Statistical power analysis for the behavioral sciences (2nd Ed). Routledge.
	# r2.0ther captures correlation structure between other variables in model
	# Cohen (1988) .02 (very small), .13 (small), .26 (large)
	# dist = distribution of the variable 'x' for the effect you are estimating
	#
	# conservative settings - detecting OR for age
	pwrss.z.logreg(p0 = .06, odds.ratio = 1.28, r2.other.x = .25,
	power = 0.80, alpha = 0.05,
	dist = "binomial")$n

	# indicates we need around 10,000 observations


	set.seed(9876) # make this repeatable 1234567
	Sn = 10000
	pval <- c()
	repNum <- c()
	for (i in 1:500) {
	sample <- dat[sample(nrow(dat), Sn, replace = T), ] # sample from treatment group
	summ <- summary(glm(y ~ gender + age, family = "binomial", data = sample))
	p <- coef(summ)[11] # get pval for effect you are interested in
	pval<- c(pval, p)
	repNum <- c(repNum,i)
	print(cbind("Iteration:", i)) # comment this out to stop printing to log
	}

	rpt <- data.frame(repNum,pval)

	rm(i,summ,p,pval,repNum) # cleanup

	sum(rpt$pval<.05)/nrow(rpt)



	#
	# power calc and simulation for age
	#

	summary(dat)

	# p0 = baseline probability
	# odds.ratio = effect we are trying to detect. Cohen (1988) 1.44 (very small) 2.88 (small) 4.27 (large)
	# Cohen, J. (1988). Statistical power analysis for the behavioral sciences (2nd Ed). Routledge.
	# r2.0ther captures correlation structure between other variables in model
	# Cohen (1988) .02 (very small), .13 (small), .26 (large)
	# dist = distribution of the variable 'x' for the effect you are estimating
	#
	# conservative settings - detecting OR for age
	pwrss.z.logreg(p0 = .06, odds.ratio = 1.22, r2.other.x = .25,
	power = 0.80, alpha = 0.05,
	dist = "normal")$n

	# indicates we need around 5000 observations


	set.seed(9876) # make this repeatable 1234567
	Sn = 1000
	pval <- c()
	repNum <- c()
	for (i in 1:500) {
	sample <- dat[sample(nrow(dat), Sn, replace = T), ] # sample from treatment group
	summ <- summary(glm(y ~ gender + age, family = "binomial", data = sample))
	p <- coef(summ)[12] # get pval for effect you are interested in
	pval<- c(pval, p)
	repNum <- c(repNum,i)
	print(cbind("Iteration:", i)) # comment this out to stop printing to log
	}

	rpt <- data.frame(repNum,pval)

	rm(i,summ,p,pval,repNum) # cleanup

	sum(rpt$pval<.05)/nrow(rpt)


	# the power calculation seemed very conservative compared to the simulation