mathijsdeen/Power analysis for multilevel models using simulation.R

## Power analysis for multilevel models using simulation.R
library(MASS)
library(tidyr)
library(dplyr)
library(nlme)
library(parallel)
library(tictoc)
library(ggplot2)
library(effsize)

# Function to generate an ARH(1) covariance matrix
gen_cov_ar1h <- function(rho, sds){
  s        <- matrix(sds)
  g        <- length(sds)
  rho_init <- rho ^ as.matrix(dist(1:g))      # autoregression
  cov_init <- s %*% t(s)                      # heterogeneity
  return(rho_init * cov_init)
}

# Function for data generation
gen_data <- function(n1, n2, sds1, sds2, ms1, ms2, rho1, rho2, timetype, empirical){
  library(MASS)                               # use of library() inside function for parallellization
  library(dplyr)                              # purposes (maybe there are ways to circumvent this)
  library(tidyr)
  covmat1       <- gen_cov_ar1h(rho1, sds1)
  covmat2       <- gen_cov_ar1h(rho2, sds2)
  ds1           <- data.frame(mvrnorm(n         = n1,
                                      mu        = ms1,
                                      Sigma     = covmat1,
                                      empirical = empirical))
  ds2           <- data.frame(mvrnorm(n         = n2,
                                      mu        = ms2,
                                      Sigma     = covmat2,
                                      empirical = empirical))
  names(ds1)    <- names(ds2) <- paste0(rep("time", length(ms1)),
                                        0:(length(ms1)-1))
  ds1$treatment <- "control"
  ds2$treatment <- "experimental"
  ds1$id        <- 1:n1
  ds2$id        <- (n1+1):(n1+n2)
  ds            <- rbind(ds1,ds2)
  ds_long       <- ds %>%
    pivot_longer(cols         = -c(id, treatment),
                 names_to     = "time",
                 names_prefix = "time",
                 values_to    = "y") %>%
    mutate(treatment = as.factor(treatment),
           treatment = relevel(treatment, ref = "control"),
           time      = timetype(time))
  return(ds_long)
}

# Function for applying method to generated data
inner_sim <- function(n1, n2, sds1, sds2, ms1, ms2, rho1, rho2, seeds, seednr, method = "gls", timetype = as.numeric){
  library(nlme)
  set.seed(10000 * seeds[seednr]) # setting seed inside inner loop to prevent differences between Mersenne-Twister and L'Ecuyer-CMRG
  ds_long <- gen_data(n1        = n1,
                      n2        = n2,
                      sds1      = sds1,
                      sds2      = sds2,
                      ms1       = ms1,
                      ms2       = ms2,
                      rho1      = rho1,
                      rho2      = rho2,
                      timetype  = timetype,
                      empirical = FALSE)
  if(method == "lme") out <- tryCatch(anova(lme(fixed  = y~time*treatment,
                                                data   = ds_long,
                                                random = ~1+time|id,
                                                method = "REML"))["time:treatment","p-value"],
                                      error = function(x) NA)
  if(method == "gls") out <- tryCatch(anova(gls(model       = y~time*treatment,
                                                data        = ds_long,
                                                correlation = corAR1(form=~1|id),
                                                weights     = varIdent(form=~1|time),
                                                method      = "REML"))["time:treatment","p-value"],
                                      error = function(x) NA)
  return(out)
}

# Function for performing data generation and analysis, sequential
outer_sim_s <- function(n1s, nratio = 1, sds1, sds2, ms1, ms2, rho1, rho2,
                        method = "gls", timetype = as.numeric, n.iter = 100){
  outmat <- matrix(NA, nrow = n.iter, ncol = length(n1s))
  n2s    <- round(n1s * nratio)
  seeds  <- runif(n.iter,0,1)
  pb     <- txtProgressBar(min   = 0,
                           max   = length(n1s),
                           style = 3)
  for(i in 1:length(n1s)){
    for(j in 1:n.iter){
      outmat[j, i] <- inner_sim(n1s[i], n2s[i], sds1, sds2, ms1, ms2, rho1, rho2, seeds, j, method, timetype)
    }
    setTxtProgressBar(pb,i)
  }
  outmat        <- data.frame(outmat)
  names(outmat) <- paste0("n_",n1s,"_",n2s)
  return(outmat)
}

# Function for performing data generation and analysis, parallellized
outer_sim_p <- function(n1s, nratio = 1, sds1, sds2, ms1, ms2, rho1, rho2,
                        method = "gls", timetype = as.numeric, n.iter = 100, n.cores = 2){
  outmat <- matrix(NA, nrow = n.iter, ncol=length(n1s))
  n2s    <- round(n1s * nratio)
  pb     <- txtProgressBar(min   = 0,
                           max   = length(n1s),
                           style = 3)
  for(i in 1:length(n1s)){
    n1            <- n1s[i]
    n2            <- n2s[i]
    cl            <- makeCluster(n.cores)
    seeds         <- runif(n.iter,0,1)
    clusterExport(cl      = cl,
                  varlist = c("n1", "n2", "sds1", "sds2", "ms1", "ms2", "rho1", "rho2", "seeds",
                              "method", "timetype", "inner_sim", "gen_cov_ar1h", "gen_data"),
                  envir   = environment())
    splitclusters <- 1:n.iter
    par_out       <- parSapplyLB(cl  = cl,
                                 X   = splitclusters,
                                 FUN = function(x) inner_sim(n1, n2, sds1, sds2, ms1, ms2, rho1,
                                                             rho2, seeds, x, method, timetype))
    outmat[,i]    <- t(par_out)
    stopCluster(cl)
    setTxtProgressBar(pb,i)
  }
  outmat          <- data.frame(outmat)
  names(outmat)   <- paste0("n_",n1s,"_",n2s)
  return(outmat)
}

# Wrapper function
sim.multilevel.RCT <- function(n1s, nratio = 1, sds1, sds2, ms1, ms2, rho1, rho2,
                               method = "gls", timetype = as.numeric, n.iter = 100, n.cores = 1){
  if(n.cores == 1) {
    out <- outer_sim_s(n1s, nratio, sds1, sds2, ms1, ms2,
                       rho1, rho2, method, timetype, n.iter)
  } else if(n.cores > 1) {
    out <- outer_sim_p(n1s, nratio, sds1, sds2, ms1, ms2,
                       rho1, rho2, method, timetype, n.iter, n.cores)
  } else {
    stop("n.cores should be >= 1")
  }
  return(out)
}

######################################################
# A testrun with made-up means, variances and rhos:  #
######################################################

tic()
set.seed(3)
d <- sim.multilevel.RCT(n1s      = c(30:40),
                        nratio   = 1,
                        ms1      = c(20.0, 20.0, 20.0, 20.0, 20.0, 20.0, 20.0, 20.0, 20.0),
                        sds1     = c(5.5, 5.2, 4.8, 4.2, 3.8, 4.2, 4.8, 5.2, 5.5),
                        rho1     = .7,
                        ms2      = c(20.0, 19.4, 18.8, 18.2, 17.6, 17.0, 16.4, 15.8, 15.2),
                        sds2     = c(5.5, 5.2, 4.8, 4.2, 3.8, 4.2, 4.8, 5.2, 5.5),
                        rho2     = .7,
                        method   = "lme",
                        timetype = as.numeric,
                        n.iter   = 1000,
                        n.cores  = 16)
toc()

# Power for given sample sizes
colMeans(d < .05, na.rm=TRUE)       # columns are named "n_<<sample size group 1>>_<<sample size group 2>>

# Generate data for desired sample size from above. Use empirical = TRUE for exact results
q <- gen_data(n1        = 37,
              n2        = 37,
              sds1      = c(5.5, 5.2, 4.8, 4.2, 3.8, 4.2, 4.8, 5.2, 5.5),
              sds2      = c(5.5, 5.2, 4.8, 4.2, 3.8, 4.2, 4.8, 5.2, 5.5),
              ms1       = c(20.0, 20.0, 20.0, 20.0, 20.0, 20.0, 20.0, 20.0, 20.0),
              ms2       = c(20.0, 19.4, 18.8, 18.2, 17.6, 17.0, 16.4, 15.8, 15.2),
              rho1      = .7,
              rho2      = .7,
              timetype  = as.numeric,
              empirical = TRUE)

# Run a mixed effects model and interpret coefficients and p-values
summary(lme(y~treatment*time,random=~1+time|id, data=q, method="REML"))
anova.lme(lme(y~treatment*time,random=~1+time|id, data=q, method="REML"))

# Plot the data
q %>%
  group_by(time, treatment) %>% #  filter(treatment=="experimental") %>%
  mutate(mn_y_time_treat = mean(y),
         y_ci_ul = mean(y) + 1.96 * sd(y),
         y_ci_ll = mean(y) - 1.96 * sd(y)) %>%
  ggplot(aes(x=time, y=y, col=treatment)) +
  geom_point(position=position_dodge(width=.2)) +
  geom_line(aes(x=time, y=mn_y_time_treat), size = 1.5) +
  geom_ribbon(aes(x=time*1.04-.15,ymin=y_ci_ll, ymax=y_ci_ul, fill = treatment), alpha=.1) +
  scale_x_continuous(breaks = 0:8, labels = 0:8) +
  theme_classic()

# Let's look at effect sizes for difference scores (wrt baseline) at each timepoint
q %>%
  pivot_wider(id_cols = c(id, treatment),
              names_from = time,
              names_prefix = "time",
              values_from = y) %>%
  mutate(diff1 = time0 - time1,
         diff2 = time0 - time2,
         diff3 = time0 - time3,
         diff4 = time0 - time4,
         diff5 = time0 - time5,
         diff6 = time0 - time6,
         diff7 = time0 - time7,
         diff8 = time0 - time8) -> q2

ds <- rep(NA,8)
ds[1] <- cohen.d(diff1~treatment, data=q2)$estimate
ds[2] <- cohen.d(diff2~treatment, data=q2)$estimate
ds[3] <- cohen.d(diff3~treatment, data=q2)$estimate
ds[4] <- cohen.d(diff4~treatment, data=q2)$estimate
ds[5] <- cohen.d(diff5~treatment, data=q2)$estimate
ds[6] <- cohen.d(diff6~treatment, data=q2)$estimate
ds[7] <- cohen.d(diff7~treatment, data=q2)$estimate
ds[8] <- cohen.d(diff8~treatment, data=q2)$estimate

cbind(ds)
	library(MASS)
	library(tidyr)
	library(dplyr)
	library(nlme)
	library(parallel)
	library(tictoc)
	library(ggplot2)
	library(effsize)

	# Function to generate an ARH(1) covariance matrix
	gen_cov_ar1h <- function(rho, sds){
	s <- matrix(sds)
	g <- length(sds)
	rho_init <- rho ^ as.matrix(dist(1:g)) # autoregression
	cov_init <- s %*% t(s) # heterogeneity
	return(rho_init * cov_init)
	}

	# Function for data generation
	gen_data <- function(n1, n2, sds1, sds2, ms1, ms2, rho1, rho2, timetype, empirical){
	library(MASS) # use of library() inside function for parallellization
	library(dplyr) # purposes (maybe there are ways to circumvent this)
	library(tidyr)
	covmat1 <- gen_cov_ar1h(rho1, sds1)
	covmat2 <- gen_cov_ar1h(rho2, sds2)
	ds1 <- data.frame(mvrnorm(n = n1,
	mu = ms1,
	Sigma = covmat1,
	empirical = empirical))
	ds2 <- data.frame(mvrnorm(n = n2,
	mu = ms2,
	Sigma = covmat2,
	empirical = empirical))
	names(ds1) <- names(ds2) <- paste0(rep("time", length(ms1)),
	0:(length(ms1)-1))
	ds1$treatment <- "control"
	ds2$treatment <- "experimental"
	ds1$id <- 1:n1
	ds2$id <- (n1+1):(n1+n2)
	ds <- rbind(ds1,ds2)
	ds_long <- ds %>%
	pivot_longer(cols = -c(id, treatment),
	names_to = "time",
	names_prefix = "time",
	values_to = "y") %>%
	mutate(treatment = as.factor(treatment),
	treatment = relevel(treatment, ref = "control"),
	time = timetype(time))
	return(ds_long)
	}

	# Function for applying method to generated data
	inner_sim <- function(n1, n2, sds1, sds2, ms1, ms2, rho1, rho2, seeds, seednr, method = "gls", timetype = as.numeric){
	library(nlme)
	set.seed(10000 * seeds[seednr]) # setting seed inside inner loop to prevent differences between Mersenne-Twister and L'Ecuyer-CMRG
	ds_long <- gen_data(n1 = n1,
	n2 = n2,
	sds1 = sds1,
	sds2 = sds2,
	ms1 = ms1,
	ms2 = ms2,
	rho1 = rho1,
	rho2 = rho2,
	timetype = timetype,
	empirical = FALSE)
	if(method == "lme") out <- tryCatch(anova(lme(fixed = y~time*treatment,
	data = ds_long,
	random = ~1+time\|id,
	method = "REML"))["time:treatment","p-value"],
	error = function(x) NA)
	if(method == "gls") out <- tryCatch(anova(gls(model = y~time*treatment,
	data = ds_long,
	correlation = corAR1(form=~1\|id),
	weights = varIdent(form=~1\|time),
	method = "REML"))["time:treatment","p-value"],
	error = function(x) NA)
	return(out)
	}

	# Function for performing data generation and analysis, sequential
	outer_sim_s <- function(n1s, nratio = 1, sds1, sds2, ms1, ms2, rho1, rho2,
	method = "gls", timetype = as.numeric, n.iter = 100){
	outmat <- matrix(NA, nrow = n.iter, ncol = length(n1s))
	n2s <- round(n1s * nratio)
	seeds <- runif(n.iter,0,1)
	pb <- txtProgressBar(min = 0,
	max = length(n1s),
	style = 3)
	for(i in 1:length(n1s)){
	for(j in 1:n.iter){
	outmat[j, i] <- inner_sim(n1s[i], n2s[i], sds1, sds2, ms1, ms2, rho1, rho2, seeds, j, method, timetype)
	}
	setTxtProgressBar(pb,i)
	}
	outmat <- data.frame(outmat)
	names(outmat) <- paste0("n_",n1s,"_",n2s)
	return(outmat)
	}

	# Function for performing data generation and analysis, parallellized
	outer_sim_p <- function(n1s, nratio = 1, sds1, sds2, ms1, ms2, rho1, rho2,
	method = "gls", timetype = as.numeric, n.iter = 100, n.cores = 2){
	outmat <- matrix(NA, nrow = n.iter, ncol=length(n1s))
	n2s <- round(n1s * nratio)
	pb <- txtProgressBar(min = 0,
	max = length(n1s),
	style = 3)
	for(i in 1:length(n1s)){
	n1 <- n1s[i]
	n2 <- n2s[i]
	cl <- makeCluster(n.cores)
	seeds <- runif(n.iter,0,1)
	clusterExport(cl = cl,
	varlist = c("n1", "n2", "sds1", "sds2", "ms1", "ms2", "rho1", "rho2", "seeds",
	"method", "timetype", "inner_sim", "gen_cov_ar1h", "gen_data"),
	envir = environment())
	splitclusters <- 1:n.iter
	par_out <- parSapplyLB(cl = cl,
	X = splitclusters,
	FUN = function(x) inner_sim(n1, n2, sds1, sds2, ms1, ms2, rho1,
	rho2, seeds, x, method, timetype))
	outmat[,i] <- t(par_out)
	stopCluster(cl)
	setTxtProgressBar(pb,i)
	}
	outmat <- data.frame(outmat)
	names(outmat) <- paste0("n_",n1s,"_",n2s)
	return(outmat)
	}

	# Wrapper function
	sim.multilevel.RCT <- function(n1s, nratio = 1, sds1, sds2, ms1, ms2, rho1, rho2,
	method = "gls", timetype = as.numeric, n.iter = 100, n.cores = 1){
	if(n.cores == 1) {
	out <- outer_sim_s(n1s, nratio, sds1, sds2, ms1, ms2,
	rho1, rho2, method, timetype, n.iter)
	} else if(n.cores > 1) {
	out <- outer_sim_p(n1s, nratio, sds1, sds2, ms1, ms2,
	rho1, rho2, method, timetype, n.iter, n.cores)
	} else {
	stop("n.cores should be >= 1")
	}
	return(out)
	}

	######################################################
	# A testrun with made-up means, variances and rhos: #
	######################################################

	tic()
	set.seed(3)
	d <- sim.multilevel.RCT(n1s = c(30:40),
	nratio = 1,
	ms1 = c(20.0, 20.0, 20.0, 20.0, 20.0, 20.0, 20.0, 20.0, 20.0),
	sds1 = c(5.5, 5.2, 4.8, 4.2, 3.8, 4.2, 4.8, 5.2, 5.5),
	rho1 = .7,
	ms2 = c(20.0, 19.4, 18.8, 18.2, 17.6, 17.0, 16.4, 15.8, 15.2),
	sds2 = c(5.5, 5.2, 4.8, 4.2, 3.8, 4.2, 4.8, 5.2, 5.5),
	rho2 = .7,
	method = "lme",
	timetype = as.numeric,
	n.iter = 1000,
	n.cores = 16)
	toc()

	# Power for given sample sizes
	colMeans(d < .05, na.rm=TRUE) # columns are named "n_<<sample size group 1>>_<<sample size group 2>>

	# Generate data for desired sample size from above. Use empirical = TRUE for exact results
	q <- gen_data(n1 = 37,
	n2 = 37,
	sds1 = c(5.5, 5.2, 4.8, 4.2, 3.8, 4.2, 4.8, 5.2, 5.5),
	sds2 = c(5.5, 5.2, 4.8, 4.2, 3.8, 4.2, 4.8, 5.2, 5.5),
	ms1 = c(20.0, 20.0, 20.0, 20.0, 20.0, 20.0, 20.0, 20.0, 20.0),
	ms2 = c(20.0, 19.4, 18.8, 18.2, 17.6, 17.0, 16.4, 15.8, 15.2),
	rho1 = .7,
	rho2 = .7,
	timetype = as.numeric,
	empirical = TRUE)

	# Run a mixed effects model and interpret coefficients and p-values
	summary(lme(y~treatment*time,random=~1+time\|id, data=q, method="REML"))
	anova.lme(lme(y~treatment*time,random=~1+time\|id, data=q, method="REML"))

	# Plot the data
	q %>%
	group_by(time, treatment) %>% # filter(treatment=="experimental") %>%
	mutate(mn_y_time_treat = mean(y),
	y_ci_ul = mean(y) + 1.96 * sd(y),
	y_ci_ll = mean(y) - 1.96 * sd(y)) %>%
	ggplot(aes(x=time, y=y, col=treatment)) +
	geom_point(position=position_dodge(width=.2)) +
	geom_line(aes(x=time, y=mn_y_time_treat), size = 1.5) +
	geom_ribbon(aes(x=time*1.04-.15,ymin=y_ci_ll, ymax=y_ci_ul, fill = treatment), alpha=.1) +
	scale_x_continuous(breaks = 0:8, labels = 0:8) +
	theme_classic()

	# Let's look at effect sizes for difference scores (wrt baseline) at each timepoint
	q %>%
	pivot_wider(id_cols = c(id, treatment),
	names_from = time,
	names_prefix = "time",
	values_from = y) %>%
	mutate(diff1 = time0 - time1,
	diff2 = time0 - time2,
	diff3 = time0 - time3,
	diff4 = time0 - time4,
	diff5 = time0 - time5,
	diff6 = time0 - time6,
	diff7 = time0 - time7,
	diff8 = time0 - time8) -> q2

	ds <- rep(NA,8)
	ds[1] <- cohen.d(diff1~treatment, data=q2)$estimate
	ds[2] <- cohen.d(diff2~treatment, data=q2)$estimate
	ds[3] <- cohen.d(diff3~treatment, data=q2)$estimate
	ds[4] <- cohen.d(diff4~treatment, data=q2)$estimate
	ds[5] <- cohen.d(diff5~treatment, data=q2)$estimate
	ds[6] <- cohen.d(diff6~treatment, data=q2)$estimate
	ds[7] <- cohen.d(diff7~treatment, data=q2)$estimate
	ds[8] <- cohen.d(diff8~treatment, data=q2)$estimate

	cbind(ds)