Last active
August 30, 2021 07:22
-
-
Save mathijsdeen/736a7f7e219b21bfddec63f4852b5805 to your computer and use it in GitHub Desktop.
Power analysis for multilevel models (lme/gls) on RCT data using simulation.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(MASS) | |
library(tidyr) | |
library(dplyr) | |
library(nlme) | |
library(parallel) | |
library(tictoc) | |
library(ggplot2) | |
library(effsize) | |
# Function to generate an ARH(1) covariance matrix | |
gen_cov_ar1h <- function(rho, sds){ | |
s <- matrix(sds) | |
g <- length(sds) | |
rho_init <- rho ^ as.matrix(dist(1:g)) # autoregression | |
cov_init <- s %*% t(s) # heterogeneity | |
return(rho_init * cov_init) | |
} | |
# Function for data generation | |
gen_data <- function(n1, n2, sds1, sds2, ms1, ms2, rho1, rho2, timetype, empirical){ | |
library(MASS) # use of library() inside function for parallellization | |
library(dplyr) # purposes (maybe there are ways to circumvent this) | |
library(tidyr) | |
covmat1 <- gen_cov_ar1h(rho1, sds1) | |
covmat2 <- gen_cov_ar1h(rho2, sds2) | |
ds1 <- data.frame(mvrnorm(n = n1, | |
mu = ms1, | |
Sigma = covmat1, | |
empirical = empirical)) | |
ds2 <- data.frame(mvrnorm(n = n2, | |
mu = ms2, | |
Sigma = covmat2, | |
empirical = empirical)) | |
names(ds1) <- names(ds2) <- paste0(rep("time", length(ms1)), | |
0:(length(ms1)-1)) | |
ds1$treatment <- "control" | |
ds2$treatment <- "experimental" | |
ds1$id <- 1:n1 | |
ds2$id <- (n1+1):(n1+n2) | |
ds <- rbind(ds1,ds2) | |
ds_long <- ds %>% | |
pivot_longer(cols = -c(id, treatment), | |
names_to = "time", | |
names_prefix = "time", | |
values_to = "y") %>% | |
mutate(treatment = as.factor(treatment), | |
treatment = relevel(treatment, ref = "control"), | |
time = timetype(time)) | |
return(ds_long) | |
} | |
# Function for applying method to generated data | |
inner_sim <- function(n1, n2, sds1, sds2, ms1, ms2, rho1, rho2, seeds, seednr, method = "gls", timetype = as.numeric){ | |
library(nlme) | |
set.seed(10000 * seeds[seednr]) # setting seed inside inner loop to prevent differences between Mersenne-Twister and L'Ecuyer-CMRG | |
ds_long <- gen_data(n1 = n1, | |
n2 = n2, | |
sds1 = sds1, | |
sds2 = sds2, | |
ms1 = ms1, | |
ms2 = ms2, | |
rho1 = rho1, | |
rho2 = rho2, | |
timetype = timetype, | |
empirical = FALSE) | |
if(method == "lme") out <- tryCatch(anova(lme(fixed = y~time*treatment, | |
data = ds_long, | |
random = ~1+time|id, | |
method = "REML"))["time:treatment","p-value"], | |
error = function(x) NA) | |
if(method == "gls") out <- tryCatch(anova(gls(model = y~time*treatment, | |
data = ds_long, | |
correlation = corAR1(form=~1|id), | |
weights = varIdent(form=~1|time), | |
method = "REML"))["time:treatment","p-value"], | |
error = function(x) NA) | |
return(out) | |
} | |
# Function for performing data generation and analysis, sequential | |
outer_sim_s <- function(n1s, nratio = 1, sds1, sds2, ms1, ms2, rho1, rho2, | |
method = "gls", timetype = as.numeric, n.iter = 100){ | |
outmat <- matrix(NA, nrow = n.iter, ncol = length(n1s)) | |
n2s <- round(n1s * nratio) | |
seeds <- runif(n.iter,0,1) | |
pb <- txtProgressBar(min = 0, | |
max = length(n1s), | |
style = 3) | |
for(i in 1:length(n1s)){ | |
for(j in 1:n.iter){ | |
outmat[j, i] <- inner_sim(n1s[i], n2s[i], sds1, sds2, ms1, ms2, rho1, rho2, seeds, j, method, timetype) | |
} | |
setTxtProgressBar(pb,i) | |
} | |
outmat <- data.frame(outmat) | |
names(outmat) <- paste0("n_",n1s,"_",n2s) | |
return(outmat) | |
} | |
# Function for performing data generation and analysis, parallellized | |
outer_sim_p <- function(n1s, nratio = 1, sds1, sds2, ms1, ms2, rho1, rho2, | |
method = "gls", timetype = as.numeric, n.iter = 100, n.cores = 2){ | |
outmat <- matrix(NA, nrow = n.iter, ncol=length(n1s)) | |
n2s <- round(n1s * nratio) | |
pb <- txtProgressBar(min = 0, | |
max = length(n1s), | |
style = 3) | |
for(i in 1:length(n1s)){ | |
n1 <- n1s[i] | |
n2 <- n2s[i] | |
cl <- makeCluster(n.cores) | |
seeds <- runif(n.iter,0,1) | |
clusterExport(cl = cl, | |
varlist = c("n1", "n2", "sds1", "sds2", "ms1", "ms2", "rho1", "rho2", "seeds", | |
"method", "timetype", "inner_sim", "gen_cov_ar1h", "gen_data"), | |
envir = environment()) | |
splitclusters <- 1:n.iter | |
par_out <- parSapplyLB(cl = cl, | |
X = splitclusters, | |
FUN = function(x) inner_sim(n1, n2, sds1, sds2, ms1, ms2, rho1, | |
rho2, seeds, x, method, timetype)) | |
outmat[,i] <- t(par_out) | |
stopCluster(cl) | |
setTxtProgressBar(pb,i) | |
} | |
outmat <- data.frame(outmat) | |
names(outmat) <- paste0("n_",n1s,"_",n2s) | |
return(outmat) | |
} | |
# Wrapper function | |
sim.multilevel.RCT <- function(n1s, nratio = 1, sds1, sds2, ms1, ms2, rho1, rho2, | |
method = "gls", timetype = as.numeric, n.iter = 100, n.cores = 1){ | |
if(n.cores == 1) { | |
out <- outer_sim_s(n1s, nratio, sds1, sds2, ms1, ms2, | |
rho1, rho2, method, timetype, n.iter) | |
} else if(n.cores > 1) { | |
out <- outer_sim_p(n1s, nratio, sds1, sds2, ms1, ms2, | |
rho1, rho2, method, timetype, n.iter, n.cores) | |
} else { | |
stop("n.cores should be >= 1") | |
} | |
return(out) | |
} | |
###################################################### | |
# A testrun with made-up means, variances and rhos: # | |
###################################################### | |
tic() | |
set.seed(3) | |
d <- sim.multilevel.RCT(n1s = c(30:40), | |
nratio = 1, | |
ms1 = c(20.0, 20.0, 20.0, 20.0, 20.0, 20.0, 20.0, 20.0, 20.0), | |
sds1 = c(5.5, 5.2, 4.8, 4.2, 3.8, 4.2, 4.8, 5.2, 5.5), | |
rho1 = .7, | |
ms2 = c(20.0, 19.4, 18.8, 18.2, 17.6, 17.0, 16.4, 15.8, 15.2), | |
sds2 = c(5.5, 5.2, 4.8, 4.2, 3.8, 4.2, 4.8, 5.2, 5.5), | |
rho2 = .7, | |
method = "lme", | |
timetype = as.numeric, | |
n.iter = 1000, | |
n.cores = 16) | |
toc() | |
# Power for given sample sizes | |
colMeans(d < .05, na.rm=TRUE) # columns are named "n_<<sample size group 1>>_<<sample size group 2>> | |
# Generate data for desired sample size from above. Use empirical = TRUE for exact results | |
q <- gen_data(n1 = 37, | |
n2 = 37, | |
sds1 = c(5.5, 5.2, 4.8, 4.2, 3.8, 4.2, 4.8, 5.2, 5.5), | |
sds2 = c(5.5, 5.2, 4.8, 4.2, 3.8, 4.2, 4.8, 5.2, 5.5), | |
ms1 = c(20.0, 20.0, 20.0, 20.0, 20.0, 20.0, 20.0, 20.0, 20.0), | |
ms2 = c(20.0, 19.4, 18.8, 18.2, 17.6, 17.0, 16.4, 15.8, 15.2), | |
rho1 = .7, | |
rho2 = .7, | |
timetype = as.numeric, | |
empirical = TRUE) | |
# Run a mixed effects model and interpret coefficients and p-values | |
summary(lme(y~treatment*time,random=~1+time|id, data=q, method="REML")) | |
anova.lme(lme(y~treatment*time,random=~1+time|id, data=q, method="REML")) | |
# Plot the data | |
q %>% | |
group_by(time, treatment) %>% # filter(treatment=="experimental") %>% | |
mutate(mn_y_time_treat = mean(y), | |
y_ci_ul = mean(y) + 1.96 * sd(y), | |
y_ci_ll = mean(y) - 1.96 * sd(y)) %>% | |
ggplot(aes(x=time, y=y, col=treatment)) + | |
geom_point(position=position_dodge(width=.2)) + | |
geom_line(aes(x=time, y=mn_y_time_treat), size = 1.5) + | |
geom_ribbon(aes(x=time*1.04-.15,ymin=y_ci_ll, ymax=y_ci_ul, fill = treatment), alpha=.1) + | |
scale_x_continuous(breaks = 0:8, labels = 0:8) + | |
theme_classic() | |
# Let's look at effect sizes for difference scores (wrt baseline) at each timepoint | |
q %>% | |
pivot_wider(id_cols = c(id, treatment), | |
names_from = time, | |
names_prefix = "time", | |
values_from = y) %>% | |
mutate(diff1 = time0 - time1, | |
diff2 = time0 - time2, | |
diff3 = time0 - time3, | |
diff4 = time0 - time4, | |
diff5 = time0 - time5, | |
diff6 = time0 - time6, | |
diff7 = time0 - time7, | |
diff8 = time0 - time8) -> q2 | |
ds <- rep(NA,8) | |
ds[1] <- cohen.d(diff1~treatment, data=q2)$estimate | |
ds[2] <- cohen.d(diff2~treatment, data=q2)$estimate | |
ds[3] <- cohen.d(diff3~treatment, data=q2)$estimate | |
ds[4] <- cohen.d(diff4~treatment, data=q2)$estimate | |
ds[5] <- cohen.d(diff5~treatment, data=q2)$estimate | |
ds[6] <- cohen.d(diff6~treatment, data=q2)$estimate | |
ds[7] <- cohen.d(diff7~treatment, data=q2)$estimate | |
ds[8] <- cohen.d(diff8~treatment, data=q2)$estimate | |
cbind(ds) |
Please note that functionality is mainly tested with the default timetype = as.numeric
.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
When applying the code to new data, some things might need to be altered to suit your situation. For example:
inner_sim
, the code above retrieves thep-value
oftime:treatment
from theanova
. Power is calculated from this interaction effect. If your main interest is in some other parameter (e.g., power for an interaction at a specific timepoint with time as a factor), changes to this part of the code are needed.timetype
is set toas.numeric
). Change the model inlme(..)
and/orgls(..)
if you want to introduce curvature.gen_cov_arh1
function and/or consider the specification ofsds1
andsds2
according to your assumptions. For example:sds1
andsds2
. After all, you are assuming that the standard deviations are not changing over time.rho_init <- rho ^ as.matrix(1-1*diag(g))
withingen_cov_arh1
.rho_init <- rho ^ as.matrix(1-1*diag(g))
with invariantsds1
andsds2
.width
withinposition_dodge
was optimized for the example above, viewed within a regularly sized RStudio Plots pane on a full HD monitor.x=time*1.04-.15
withingeom_ribbon
was optimized for the given example as well. This alteration of thetime
value ingeom_ribbon(aes(x=..))
was chosen to include the dodged dots at the first and last timepoints within the ribbon. Merely esthetics...breaks
andlabels
arguments inscale_x_continuous
, the 8 should be replaced with the maximum timepoint number in your situation.