Last active
August 30, 2021 07:22
-
-
Save mathijsdeen/736a7f7e219b21bfddec63f4852b5805 to your computer and use it in GitHub Desktop.
Power analysis for multilevel models (lme/gls) on RCT data using simulation.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(MASS) | |
library(tidyr) | |
library(dplyr) | |
library(nlme) | |
library(parallel) | |
library(tictoc) | |
library(ggplot2) | |
library(effsize) | |
# Function to generate an ARH(1) covariance matrix | |
gen_cov_ar1h <- function(rho, sds){ | |
s <- matrix(sds) | |
g <- length(sds) | |
rho_init <- rho ^ as.matrix(dist(1:g)) # autoregression | |
cov_init <- s %*% t(s) # heterogeneity | |
return(rho_init * cov_init) | |
} | |
# Function for data generation | |
gen_data <- function(n1, n2, sds1, sds2, ms1, ms2, rho1, rho2, timetype, empirical){ | |
library(MASS) # use of library() inside function for parallellization | |
library(dplyr) # purposes (maybe there are ways to circumvent this) | |
library(tidyr) | |
covmat1 <- gen_cov_ar1h(rho1, sds1) | |
covmat2 <- gen_cov_ar1h(rho2, sds2) | |
ds1 <- data.frame(mvrnorm(n = n1, | |
mu = ms1, | |
Sigma = covmat1, | |
empirical = empirical)) | |
ds2 <- data.frame(mvrnorm(n = n2, | |
mu = ms2, | |
Sigma = covmat2, | |
empirical = empirical)) | |
names(ds1) <- names(ds2) <- paste0(rep("time", length(ms1)), | |
0:(length(ms1)-1)) | |
ds1$treatment <- "control" | |
ds2$treatment <- "experimental" | |
ds1$id <- 1:n1 | |
ds2$id <- (n1+1):(n1+n2) | |
ds <- rbind(ds1,ds2) | |
ds_long <- ds %>% | |
pivot_longer(cols = -c(id, treatment), | |
names_to = "time", | |
names_prefix = "time", | |
values_to = "y") %>% | |
mutate(treatment = as.factor(treatment), | |
treatment = relevel(treatment, ref = "control"), | |
time = timetype(time)) | |
return(ds_long) | |
} | |
# Function for applying method to generated data | |
inner_sim <- function(n1, n2, sds1, sds2, ms1, ms2, rho1, rho2, seeds, seednr, method = "gls", timetype = as.numeric){ | |
library(nlme) | |
set.seed(10000 * seeds[seednr]) # setting seed inside inner loop to prevent differences between Mersenne-Twister and L'Ecuyer-CMRG | |
ds_long <- gen_data(n1 = n1, | |
n2 = n2, | |
sds1 = sds1, | |
sds2 = sds2, | |
ms1 = ms1, | |
ms2 = ms2, | |
rho1 = rho1, | |
rho2 = rho2, | |
timetype = timetype, | |
empirical = FALSE) | |
if(method == "lme") out <- tryCatch(anova(lme(fixed = y~time*treatment, | |
data = ds_long, | |
random = ~1+time|id, | |
method = "REML"))["time:treatment","p-value"], | |
error = function(x) NA) | |
if(method == "gls") out <- tryCatch(anova(gls(model = y~time*treatment, | |
data = ds_long, | |
correlation = corAR1(form=~1|id), | |
weights = varIdent(form=~1|time), | |
method = "REML"))["time:treatment","p-value"], | |
error = function(x) NA) | |
return(out) | |
} | |
# Function for performing data generation and analysis, sequential | |
outer_sim_s <- function(n1s, nratio = 1, sds1, sds2, ms1, ms2, rho1, rho2, | |
method = "gls", timetype = as.numeric, n.iter = 100){ | |
outmat <- matrix(NA, nrow = n.iter, ncol = length(n1s)) | |
n2s <- round(n1s * nratio) | |
seeds <- runif(n.iter,0,1) | |
pb <- txtProgressBar(min = 0, | |
max = length(n1s), | |
style = 3) | |
for(i in 1:length(n1s)){ | |
for(j in 1:n.iter){ | |
outmat[j, i] <- inner_sim(n1s[i], n2s[i], sds1, sds2, ms1, ms2, rho1, rho2, seeds, j, method, timetype) | |
} | |
setTxtProgressBar(pb,i) | |
} | |
outmat <- data.frame(outmat) | |
names(outmat) <- paste0("n_",n1s,"_",n2s) | |
return(outmat) | |
} | |
# Function for performing data generation and analysis, parallellized | |
outer_sim_p <- function(n1s, nratio = 1, sds1, sds2, ms1, ms2, rho1, rho2, | |
method = "gls", timetype = as.numeric, n.iter = 100, n.cores = 2){ | |
outmat <- matrix(NA, nrow = n.iter, ncol=length(n1s)) | |
n2s <- round(n1s * nratio) | |
pb <- txtProgressBar(min = 0, | |
max = length(n1s), | |
style = 3) | |
for(i in 1:length(n1s)){ | |
n1 <- n1s[i] | |
n2 <- n2s[i] | |
cl <- makeCluster(n.cores) | |
seeds <- runif(n.iter,0,1) | |
clusterExport(cl = cl, | |
varlist = c("n1", "n2", "sds1", "sds2", "ms1", "ms2", "rho1", "rho2", "seeds", | |
"method", "timetype", "inner_sim", "gen_cov_ar1h", "gen_data"), | |
envir = environment()) | |
splitclusters <- 1:n.iter | |
par_out <- parSapplyLB(cl = cl, | |
X = splitclusters, | |
FUN = function(x) inner_sim(n1, n2, sds1, sds2, ms1, ms2, rho1, | |
rho2, seeds, x, method, timetype)) | |
outmat[,i] <- t(par_out) | |
stopCluster(cl) | |
setTxtProgressBar(pb,i) | |
} | |
outmat <- data.frame(outmat) | |
names(outmat) <- paste0("n_",n1s,"_",n2s) | |
return(outmat) | |
} | |
# Wrapper function | |
sim.multilevel.RCT <- function(n1s, nratio = 1, sds1, sds2, ms1, ms2, rho1, rho2, | |
method = "gls", timetype = as.numeric, n.iter = 100, n.cores = 1){ | |
if(n.cores == 1) { | |
out <- outer_sim_s(n1s, nratio, sds1, sds2, ms1, ms2, | |
rho1, rho2, method, timetype, n.iter) | |
} else if(n.cores > 1) { | |
out <- outer_sim_p(n1s, nratio, sds1, sds2, ms1, ms2, | |
rho1, rho2, method, timetype, n.iter, n.cores) | |
} else { | |
stop("n.cores should be >= 1") | |
} | |
return(out) | |
} | |
###################################################### | |
# A testrun with made-up means, variances and rhos: # | |
###################################################### | |
tic() | |
set.seed(3) | |
d <- sim.multilevel.RCT(n1s = c(30:40), | |
nratio = 1, | |
ms1 = c(20.0, 20.0, 20.0, 20.0, 20.0, 20.0, 20.0, 20.0, 20.0), | |
sds1 = c(5.5, 5.2, 4.8, 4.2, 3.8, 4.2, 4.8, 5.2, 5.5), | |
rho1 = .7, | |
ms2 = c(20.0, 19.4, 18.8, 18.2, 17.6, 17.0, 16.4, 15.8, 15.2), | |
sds2 = c(5.5, 5.2, 4.8, 4.2, 3.8, 4.2, 4.8, 5.2, 5.5), | |
rho2 = .7, | |
method = "lme", | |
timetype = as.numeric, | |
n.iter = 1000, | |
n.cores = 16) | |
toc() | |
# Power for given sample sizes | |
colMeans(d < .05, na.rm=TRUE) # columns are named "n_<<sample size group 1>>_<<sample size group 2>> | |
# Generate data for desired sample size from above. Use empirical = TRUE for exact results | |
q <- gen_data(n1 = 37, | |
n2 = 37, | |
sds1 = c(5.5, 5.2, 4.8, 4.2, 3.8, 4.2, 4.8, 5.2, 5.5), | |
sds2 = c(5.5, 5.2, 4.8, 4.2, 3.8, 4.2, 4.8, 5.2, 5.5), | |
ms1 = c(20.0, 20.0, 20.0, 20.0, 20.0, 20.0, 20.0, 20.0, 20.0), | |
ms2 = c(20.0, 19.4, 18.8, 18.2, 17.6, 17.0, 16.4, 15.8, 15.2), | |
rho1 = .7, | |
rho2 = .7, | |
timetype = as.numeric, | |
empirical = TRUE) | |
# Run a mixed effects model and interpret coefficients and p-values | |
summary(lme(y~treatment*time,random=~1+time|id, data=q, method="REML")) | |
anova.lme(lme(y~treatment*time,random=~1+time|id, data=q, method="REML")) | |
# Plot the data | |
q %>% | |
group_by(time, treatment) %>% # filter(treatment=="experimental") %>% | |
mutate(mn_y_time_treat = mean(y), | |
y_ci_ul = mean(y) + 1.96 * sd(y), | |
y_ci_ll = mean(y) - 1.96 * sd(y)) %>% | |
ggplot(aes(x=time, y=y, col=treatment)) + | |
geom_point(position=position_dodge(width=.2)) + | |
geom_line(aes(x=time, y=mn_y_time_treat), size = 1.5) + | |
geom_ribbon(aes(x=time*1.04-.15,ymin=y_ci_ll, ymax=y_ci_ul, fill = treatment), alpha=.1) + | |
scale_x_continuous(breaks = 0:8, labels = 0:8) + | |
theme_classic() | |
# Let's look at effect sizes for difference scores (wrt baseline) at each timepoint | |
q %>% | |
pivot_wider(id_cols = c(id, treatment), | |
names_from = time, | |
names_prefix = "time", | |
values_from = y) %>% | |
mutate(diff1 = time0 - time1, | |
diff2 = time0 - time2, | |
diff3 = time0 - time3, | |
diff4 = time0 - time4, | |
diff5 = time0 - time5, | |
diff6 = time0 - time6, | |
diff7 = time0 - time7, | |
diff8 = time0 - time8) -> q2 | |
ds <- rep(NA,8) | |
ds[1] <- cohen.d(diff1~treatment, data=q2)$estimate | |
ds[2] <- cohen.d(diff2~treatment, data=q2)$estimate | |
ds[3] <- cohen.d(diff3~treatment, data=q2)$estimate | |
ds[4] <- cohen.d(diff4~treatment, data=q2)$estimate | |
ds[5] <- cohen.d(diff5~treatment, data=q2)$estimate | |
ds[6] <- cohen.d(diff6~treatment, data=q2)$estimate | |
ds[7] <- cohen.d(diff7~treatment, data=q2)$estimate | |
ds[8] <- cohen.d(diff8~treatment, data=q2)$estimate | |
cbind(ds) |
When applying the code to new data, some things might need to be altered to suit your situation. For example:
- When setting up the simulation study:
- Within
inner_sim
, the code above retrieves thep-value
oftime:treatment
from theanova
. Power is calculated from this interaction effect. If your main interest is in some other parameter (e.g., power for an interaction at a specific timepoint with time as a factor), changes to this part of the code are needed. - The analyses in the simulation assume linearity (when
timetype
is set toas.numeric
). Change the model inlme(..)
and/orgls(..)
if you want to introduce curvature. - If you don't want to assume a heterogeneous autoregressive covariance structure, change or replace the
gen_cov_arh1
function and/or consider the specification ofsds1
andsds2
according to your assumptions. For example:- For a homogeneous first-order autoregressive covariance structure, use invariant
sds1
andsds2
. After all, you are assuming that the standard deviations are not changing over time. - For a heterogeneous compound symmetric covariance structure, use
rho_init <- rho ^ as.matrix(1-1*diag(g))
withingen_cov_arh1
. - For a homogeneous compound symmetric covariance structure, combine
rho_init <- rho ^ as.matrix(1-1*diag(g))
with invariantsds1
andsds2
.
- For a homogeneous first-order autoregressive covariance structure, use invariant
- Within
- When plotting the data:
- The
width
withinposition_dodge
was optimized for the example above, viewed within a regularly sized RStudio Plots pane on a full HD monitor. - The argument
x=time*1.04-.15
withingeom_ribbon
was optimized for the given example as well. This alteration of thetime
value ingeom_ribbon(aes(x=..))
was chosen to include the dodged dots at the first and last timepoints within the ribbon. Merely esthetics... - For the
breaks
andlabels
arguments inscale_x_continuous
, the 8 should be replaced with the maximum timepoint number in your situation.
- The
- When investigating the effect sizes:
- The number of difference scores might be different in your situation.
- This goes for the number of effect sizes as well, obviously.
- Not done in the code above, but you might want to do these computations within a loop. Duh.
Please note that functionality is mainly tested with the default timetype = as.numeric
.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Arguments for
sim.multilevel.RCT
:n1s
: a vector of sample sizes for the control group.nratio
: how much bigger should the experimental be, compared to the control group?sds1
,sds2
: a vector standard deviations at each timepoint for groups 1 (control) and 2 (experimental).ms1
,ms2
: a vector of means at each timepoint for groups 1 (control) and 2 (experimental).rho1
,rho2
: autocorrelation for groups 1 (control) and 2 (experimental).method
: which method will be used to analyse the data. Options are:"lme"
for linear mixed effects"gls"
for covariance pattern models using generalized least squares)timetype
: function, eitheras.numeric
oras.factor
, indicating how the time variable should be included in the analysis.n.iter
: how many simulation iterations within each sample size configuration?n.cores
: how many CPU cores are to be used in the simulation?