Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mathijsdeen/736a7f7e219b21bfddec63f4852b5805 to your computer and use it in GitHub Desktop.
Save mathijsdeen/736a7f7e219b21bfddec63f4852b5805 to your computer and use it in GitHub Desktop.
Power analysis for multilevel models (lme/gls) on RCT data using simulation.
library(MASS)
library(tidyr)
library(dplyr)
library(nlme)
library(parallel)
library(tictoc)
library(ggplot2)
library(effsize)
# Function to generate an ARH(1) covariance matrix
gen_cov_ar1h <- function(rho, sds){
s <- matrix(sds)
g <- length(sds)
rho_init <- rho ^ as.matrix(dist(1:g)) # autoregression
cov_init <- s %*% t(s) # heterogeneity
return(rho_init * cov_init)
}
# Function for data generation
gen_data <- function(n1, n2, sds1, sds2, ms1, ms2, rho1, rho2, timetype, empirical){
library(MASS) # use of library() inside function for parallellization
library(dplyr) # purposes (maybe there are ways to circumvent this)
library(tidyr)
covmat1 <- gen_cov_ar1h(rho1, sds1)
covmat2 <- gen_cov_ar1h(rho2, sds2)
ds1 <- data.frame(mvrnorm(n = n1,
mu = ms1,
Sigma = covmat1,
empirical = empirical))
ds2 <- data.frame(mvrnorm(n = n2,
mu = ms2,
Sigma = covmat2,
empirical = empirical))
names(ds1) <- names(ds2) <- paste0(rep("time", length(ms1)),
0:(length(ms1)-1))
ds1$treatment <- "control"
ds2$treatment <- "experimental"
ds1$id <- 1:n1
ds2$id <- (n1+1):(n1+n2)
ds <- rbind(ds1,ds2)
ds_long <- ds %>%
pivot_longer(cols = -c(id, treatment),
names_to = "time",
names_prefix = "time",
values_to = "y") %>%
mutate(treatment = as.factor(treatment),
treatment = relevel(treatment, ref = "control"),
time = timetype(time))
return(ds_long)
}
# Function for applying method to generated data
inner_sim <- function(n1, n2, sds1, sds2, ms1, ms2, rho1, rho2, seeds, seednr, method = "gls", timetype = as.numeric){
library(nlme)
set.seed(10000 * seeds[seednr]) # setting seed inside inner loop to prevent differences between Mersenne-Twister and L'Ecuyer-CMRG
ds_long <- gen_data(n1 = n1,
n2 = n2,
sds1 = sds1,
sds2 = sds2,
ms1 = ms1,
ms2 = ms2,
rho1 = rho1,
rho2 = rho2,
timetype = timetype,
empirical = FALSE)
if(method == "lme") out <- tryCatch(anova(lme(fixed = y~time*treatment,
data = ds_long,
random = ~1+time|id,
method = "REML"))["time:treatment","p-value"],
error = function(x) NA)
if(method == "gls") out <- tryCatch(anova(gls(model = y~time*treatment,
data = ds_long,
correlation = corAR1(form=~1|id),
weights = varIdent(form=~1|time),
method = "REML"))["time:treatment","p-value"],
error = function(x) NA)
return(out)
}
# Function for performing data generation and analysis, sequential
outer_sim_s <- function(n1s, nratio = 1, sds1, sds2, ms1, ms2, rho1, rho2,
method = "gls", timetype = as.numeric, n.iter = 100){
outmat <- matrix(NA, nrow = n.iter, ncol = length(n1s))
n2s <- round(n1s * nratio)
seeds <- runif(n.iter,0,1)
pb <- txtProgressBar(min = 0,
max = length(n1s),
style = 3)
for(i in 1:length(n1s)){
for(j in 1:n.iter){
outmat[j, i] <- inner_sim(n1s[i], n2s[i], sds1, sds2, ms1, ms2, rho1, rho2, seeds, j, method, timetype)
}
setTxtProgressBar(pb,i)
}
outmat <- data.frame(outmat)
names(outmat) <- paste0("n_",n1s,"_",n2s)
return(outmat)
}
# Function for performing data generation and analysis, parallellized
outer_sim_p <- function(n1s, nratio = 1, sds1, sds2, ms1, ms2, rho1, rho2,
method = "gls", timetype = as.numeric, n.iter = 100, n.cores = 2){
outmat <- matrix(NA, nrow = n.iter, ncol=length(n1s))
n2s <- round(n1s * nratio)
pb <- txtProgressBar(min = 0,
max = length(n1s),
style = 3)
for(i in 1:length(n1s)){
n1 <- n1s[i]
n2 <- n2s[i]
cl <- makeCluster(n.cores)
seeds <- runif(n.iter,0,1)
clusterExport(cl = cl,
varlist = c("n1", "n2", "sds1", "sds2", "ms1", "ms2", "rho1", "rho2", "seeds",
"method", "timetype", "inner_sim", "gen_cov_ar1h", "gen_data"),
envir = environment())
splitclusters <- 1:n.iter
par_out <- parSapplyLB(cl = cl,
X = splitclusters,
FUN = function(x) inner_sim(n1, n2, sds1, sds2, ms1, ms2, rho1,
rho2, seeds, x, method, timetype))
outmat[,i] <- t(par_out)
stopCluster(cl)
setTxtProgressBar(pb,i)
}
outmat <- data.frame(outmat)
names(outmat) <- paste0("n_",n1s,"_",n2s)
return(outmat)
}
# Wrapper function
sim.multilevel.RCT <- function(n1s, nratio = 1, sds1, sds2, ms1, ms2, rho1, rho2,
method = "gls", timetype = as.numeric, n.iter = 100, n.cores = 1){
if(n.cores == 1) {
out <- outer_sim_s(n1s, nratio, sds1, sds2, ms1, ms2,
rho1, rho2, method, timetype, n.iter)
} else if(n.cores > 1) {
out <- outer_sim_p(n1s, nratio, sds1, sds2, ms1, ms2,
rho1, rho2, method, timetype, n.iter, n.cores)
} else {
stop("n.cores should be >= 1")
}
return(out)
}
######################################################
# A testrun with made-up means, variances and rhos: #
######################################################
tic()
set.seed(3)
d <- sim.multilevel.RCT(n1s = c(30:40),
nratio = 1,
ms1 = c(20.0, 20.0, 20.0, 20.0, 20.0, 20.0, 20.0, 20.0, 20.0),
sds1 = c(5.5, 5.2, 4.8, 4.2, 3.8, 4.2, 4.8, 5.2, 5.5),
rho1 = .7,
ms2 = c(20.0, 19.4, 18.8, 18.2, 17.6, 17.0, 16.4, 15.8, 15.2),
sds2 = c(5.5, 5.2, 4.8, 4.2, 3.8, 4.2, 4.8, 5.2, 5.5),
rho2 = .7,
method = "lme",
timetype = as.numeric,
n.iter = 1000,
n.cores = 16)
toc()
# Power for given sample sizes
colMeans(d < .05, na.rm=TRUE) # columns are named "n_<<sample size group 1>>_<<sample size group 2>>
# Generate data for desired sample size from above. Use empirical = TRUE for exact results
q <- gen_data(n1 = 37,
n2 = 37,
sds1 = c(5.5, 5.2, 4.8, 4.2, 3.8, 4.2, 4.8, 5.2, 5.5),
sds2 = c(5.5, 5.2, 4.8, 4.2, 3.8, 4.2, 4.8, 5.2, 5.5),
ms1 = c(20.0, 20.0, 20.0, 20.0, 20.0, 20.0, 20.0, 20.0, 20.0),
ms2 = c(20.0, 19.4, 18.8, 18.2, 17.6, 17.0, 16.4, 15.8, 15.2),
rho1 = .7,
rho2 = .7,
timetype = as.numeric,
empirical = TRUE)
# Run a mixed effects model and interpret coefficients and p-values
summary(lme(y~treatment*time,random=~1+time|id, data=q, method="REML"))
anova.lme(lme(y~treatment*time,random=~1+time|id, data=q, method="REML"))
# Plot the data
q %>%
group_by(time, treatment) %>% # filter(treatment=="experimental") %>%
mutate(mn_y_time_treat = mean(y),
y_ci_ul = mean(y) + 1.96 * sd(y),
y_ci_ll = mean(y) - 1.96 * sd(y)) %>%
ggplot(aes(x=time, y=y, col=treatment)) +
geom_point(position=position_dodge(width=.2)) +
geom_line(aes(x=time, y=mn_y_time_treat), size = 1.5) +
geom_ribbon(aes(x=time*1.04-.15,ymin=y_ci_ll, ymax=y_ci_ul, fill = treatment), alpha=.1) +
scale_x_continuous(breaks = 0:8, labels = 0:8) +
theme_classic()
# Let's look at effect sizes for difference scores (wrt baseline) at each timepoint
q %>%
pivot_wider(id_cols = c(id, treatment),
names_from = time,
names_prefix = "time",
values_from = y) %>%
mutate(diff1 = time0 - time1,
diff2 = time0 - time2,
diff3 = time0 - time3,
diff4 = time0 - time4,
diff5 = time0 - time5,
diff6 = time0 - time6,
diff7 = time0 - time7,
diff8 = time0 - time8) -> q2
ds <- rep(NA,8)
ds[1] <- cohen.d(diff1~treatment, data=q2)$estimate
ds[2] <- cohen.d(diff2~treatment, data=q2)$estimate
ds[3] <- cohen.d(diff3~treatment, data=q2)$estimate
ds[4] <- cohen.d(diff4~treatment, data=q2)$estimate
ds[5] <- cohen.d(diff5~treatment, data=q2)$estimate
ds[6] <- cohen.d(diff6~treatment, data=q2)$estimate
ds[7] <- cohen.d(diff7~treatment, data=q2)$estimate
ds[8] <- cohen.d(diff8~treatment, data=q2)$estimate
cbind(ds)
@mathijsdeen
Copy link
Author

mathijsdeen commented Aug 27, 2021

Arguments for sim.multilevel.RCT:

  • n1s: a vector of sample sizes for the control group.
  • nratio: how much bigger should the experimental be, compared to the control group?
  • sds1, sds2: a vector standard deviations at each timepoint for groups 1 (control) and 2 (experimental).
  • ms1, ms2: a vector of means at each timepoint for groups 1 (control) and 2 (experimental).
  • rho1, rho2: autocorrelation for groups 1 (control) and 2 (experimental).
  • method: which method will be used to analyse the data. Options are:
    • "lme" for linear mixed effects
    • "gls" for covariance pattern models using generalized least squares)
  • timetype: function, either as.numeric or as.factor, indicating how the time variable should be included in the analysis.
  • n.iter: how many simulation iterations within each sample size configuration?
  • n.cores: how many CPU cores are to be used in the simulation?

@mathijsdeen
Copy link
Author

mathijsdeen commented Aug 27, 2021

When applying the code to new data, some things might need to be altered to suit your situation. For example:

  • When setting up the simulation study:
    • Within inner_sim, the code above retrieves the p-value of time:treatment from the anova. Power is calculated from this interaction effect. If your main interest is in some other parameter (e.g., power for an interaction at a specific timepoint with time as a factor), changes to this part of the code are needed.
    • The analyses in the simulation assume linearity (when timetype is set to as.numeric). Change the model in lme(..) and/or gls(..) if you want to introduce curvature.
    • If you don't want to assume a heterogeneous autoregressive covariance structure, change or replace the gen_cov_arh1 function and/or consider the specification of sds1 and sds2 according to your assumptions. For example:
      • For a homogeneous first-order autoregressive covariance structure, use invariant sds1 and sds2. After all, you are assuming that the standard deviations are not changing over time.
      • For a heterogeneous compound symmetric covariance structure, use rho_init <- rho ^ as.matrix(1-1*diag(g)) within gen_cov_arh1.
      • For a homogeneous compound symmetric covariance structure, combine rho_init <- rho ^ as.matrix(1-1*diag(g)) with invariant sds1 and sds2.
  • When plotting the data:
    • The width within position_dodge was optimized for the example above, viewed within a regularly sized RStudio Plots pane on a full HD monitor.
    • The argument x=time*1.04-.15 within geom_ribbon was optimized for the given example as well. This alteration of the time value in geom_ribbon(aes(x=..)) was chosen to include the dodged dots at the first and last timepoints within the ribbon. Merely esthetics...
    • For the breaks and labels arguments in scale_x_continuous, the 8 should be replaced with the maximum timepoint number in your situation.
  • When investigating the effect sizes:
    • The number of difference scores might be different in your situation.
    • This goes for the number of effect sizes as well, obviously.
    • Not done in the code above, but you might want to do these computations within a loop. Duh.

@mathijsdeen
Copy link
Author

Please note that functionality is mainly tested with the default timetype = as.numeric.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment