Skip to content

Instantly share code, notes, and snippets.

@metamaden
Last active July 9, 2024 02:53
Show Gist options
  • Save metamaden/953b128c230e32ef1beecbc2695e93c1 to your computer and use it in GitHub Desktop.
Save metamaden/953b128c230e32ef1beecbc2695e93c1 to your computer and use it in GitHub Desktop.
Modified DNA methylation power analysis function for user-specified data inputs
#!/usr/bin/env R
# Author: Sean Maden
#
# *** Description ***
# This is a sligthly modified form of the pwrEWAS power analysis function from the pwrEWAS R package.
#
# Source this function from an active R session using:
# > devtools::source_gist("https://gist.github.com/metamaden/953b128c230e32ef1beecbc2695e93c1")
#
# *** References ***
#
# 1. This code was used in the manuscript:
#
# Maden SK* , Walsh B, Ellrott K, Hansen KD, Thompson RF, Nellore A. “Recountmethylation enables flexible
# analysis of public blood DNA methylation array data.” Bioinformatics Advances 3, 1. 2023.
#
# 2. Original source code for pwrEWAS() function:
# https://github.com/stefangraw/pwrEWAS/blob/master/R/pwrEWAS_v1.8.R
#
#
#
#' @title pwrEWAS_itable - A computationally efficient tool for comprehensive power estimation in EWAS
#'
#' @description pwrEWAS is a computationally efficient tool to estimate power in EWAS as a function of sample and effect size for two-group comparisons of DNAm (e.g., case vs control, exposed vs non-exposed, etc.). Detailed description of in-/outputs, instructions and an example, as well as interpretations of the example results are provided in the vignette: vignette("pwrEWAS")
#'
#' @param tissueType Tissue type DNA methylation data, containing columns "mu" for Beta-value means, and "var" for Beta-value variances.
#' @param minTotSampleSize Minimum total sample size.
#' @param maxTotSampleSize Maximum total sample size.
#' @param SampleSizeSteps Sample size increments.
#' @param NcntPer Percentage sample group 1 (control group) (NcntPer = 0.5 indicates a balanced design).
#' @param targetDelta Target maximum difference in mean DNAm. (Either 'targetDelta' or 'deltaSD' should be specified)
#' @param deltaSD Standard deviation of simulated differences. (Either 'targetDelta' or 'deltaSD' should be specified)
#' @param J Number of CpGs tested/simulated (default: 100000).
#' @param targetDmCpGs Target number of DM CpGs.
#' @param detectionLimit Smallest detectable difference in DNAm (default: 0.01).
#' @param DMmethod Method of Differential Methylation analysis: "limma" (default), "t-test (unequal var)", "t-test (equal var)", "Wilcox rank sum", "CPGassoc".
#' @param FDRcritVal FDRcritVal (default: 0.05).
#' @param core Number of threads for multi-threading (default: 1).
#' @param sims Number of simulated data sets (default: 50).
#'
#' @return pwrEWAS will return an object with the following four attributes: meanPower, powerArray, deltaArray, and metric, where metric contains marTypeI, classicalPower, FDR, and FDC
#'
#' @keywords DNAm microarray power
#'
#' @export
#'
#' @examples
#' library(minfiData)
#' data("MsetEx") # load MethylSet
#' ms <- MsetEx
#' bval <- getBeta(ms) # get DNAm fractions
#' # get the summary data frame
#' dfpwr <- data.frame(mu = rowMeans(bval, na.rm = T),
#' var = rowVars(bval, na.rm = T))
#'
#' # get power analysis results
#' outDelta <- pwrEWAS_itable(
#' tissueType = dfpwr,
#' minTotSampleSize = 10,
#' maxTotSampleSize = 20,
#' SampleSizeSteps = 10,
#' NcntPer = 0.5,
#' targetDelta = c(0.2, 0.5),
#' J = 1000,
#' targetDmCpGs = 10,
#' detectionLimit = 0.01,
#' DMmethod = "limma",
#' FDRcritVal = 0.05,
#' core = 2,
#' sims = 30)
#'
#' outSD <- pwrEWAS(
#' tissueType = dfpwr,
#' minTotSampleSize = 10,
#' maxTotSampleSize = 20,
#' SampleSizeSteps = 10,
#' NcntPer = 0.5,
#' deltaSD = c(0.02, 0.03),
#' J = 1000,
#' targetDmCpGs = 10,
#' detectionLimit = 0.01,
#' DMmethod = "limma",
#' FDRcritVal = 0.05,
#' core = 2,
#' sims = 30)
pwrEWAS_itable <- function(
tissueType, # dna methylation data means and variances
minTotSampleSize, # min total sample size
maxTotSampleSize, # max total sample size
SampleSizeSteps, # steps for increasing total sample size
NcntPer, # percentage of control sample
targetDelta = NULL, # vector of 99 percentile of the target max DM
deltaSD = NULL, # vector of sd(delta)
J = 100000, # number of simulated CpGs
targetDmCpGs, # target number for truely differentially methylated CpG
detectionLimit = 0.01, # lower bound for differential methylated CpG's to be considered TRULY differential methylated
DMmethod = c("limma", "t-test (unequal var)", "t-test (equal var)", "Wilcox rank sum", "CPGassoc"), # method to detect differential methylation
FDRcritVal = 0.05,
core = 1, # number of cores to multi thread
sims = 50
){
methPara <- tissueType
DMmethod <- match.arg(DMmethod)
if(!is.null(targetDelta) & !is.null(deltaSD)) stop("Please specify only one: 'targetDelta' or 'deltaSD'")
#load data
#methPara <- pwrEWAS.data:::loadDataset(tissueType)
# methPara <- ExperimentHub::ExperimentHub()[["EH3078"]]
CpGonArray <- length(methPara$mu)
output <- NULL
totSampleSizes <- seq(minTotSampleSize, maxTotSampleSize, SampleSizeSteps)
# initalize multi thread clusters
cl <- parallel::makeCluster(core) # multi core
doSNOW::registerDoSNOW(cl)
# doParallel::registerDoParallel(cl) # multi core
# combining function for foreach loops
combine_tau <- function(listA, listB){
if(is.null(listA)) return(listB)
if(!methods::is(listA[["power"]], "array") & !methods::is(listA[["power"]], "matrix")) listA[["power"]] <- matrix(listA[["power"]])
if(!methods::is(listB[["power"]], "array") & !methods::is(listB[["power"]], "matrix")) listB[["power"]] <- matrix(listB[["power"]])
if(!methods::is(listA[["metric"]]$marTypeI, "array") & !methods::is(listA[["metric"]]$marTypeI, "matrix")) listA[["metric"]]$marTypeI <- matrix(listA[["metric"]]$marTypeI)
if(!methods::is(listB[["metric"]]$marTypeI, "array") & !methods::is(listB[["metric"]]$marTypeI, "matrix")) listB[["metric"]]$marTypeI <- matrix(listB[["metric"]]$marTypeI)
if(!methods::is(listA[["metric"]]$classicalPower, "array") & !methods::is(listA[["metric"]]$classicalPower, "matrix")) listA[["metric"]]$classicalPower <- matrix(listA[["metric"]]$classicalPower)
if(!methods::is(listB[["metric"]]$classicalPower, "array") & !methods::is(listB[["metric"]]$classicalPower, "matrix")) listB[["metric"]]$classicalPower <- matrix(listB[["metric"]]$classicalPower)
if(!methods::is(listA[["metric"]]$FDR, "array") & !methods::is(listA[["metric"]]$FDR, "matrix")) listA[["metric"]]$FDR <- matrix(listA[["metric"]]$FDR)
if(!methods::is(listB[["metric"]]$FDR, "array") & !methods::is(listB[["metric"]]$FDR, "matrix")) listB[["metric"]]$FDR <- matrix(listB[["metric"]]$FDR)
if(!methods::is(listA[["metric"]]$FDC, "array") & !methods::is(listA[["metric"]]$FDC, "matrix")) listA[["metric"]]$FDC <- matrix(listA[["metric"]]$FDC)
if(!methods::is(listB[["metric"]]$FDC, "array") & !methods::is(listB[["metric"]]$FDC, "matrix")) listB[["metric"]]$FDC <- matrix(listB[["metric"]]$FDC)
if(!methods::is(listA[["metric"]]$probTP, "array") & !methods::is(listA[["metric"]]$probTP, "matrix")) listA[["metric"]]$probTP <- matrix(listA[["metric"]]$probTP)
if(!methods::is(listB[["metric"]]$probTP, "array") & !methods::is(listB[["metric"]]$probTP, "matrix")) listB[["metric"]]$probTP <- matrix(listB[["metric"]]$probTP)
if(!methods::is(listA[["delta"]], "list")) listA[["delta"]] <- list(listA[["delta"]])
returnList <- list()
returnList[["power"]] <- abind::abind(listA[["power"]], listB[["power"]], along = 3)
returnList[["delta"]] <- listA[["delta"]]
returnList[["delta"]][[length(listA[["delta"]])+1]] <- listB[["delta"]]
returnList[["metric"]]$marTypeI <- abind::abind(listA[["metric"]]$marTypeI, listB[["metric"]]$marTypeI, along = 3)
returnList[["metric"]]$classicalPower <- abind::abind(listA[["metric"]]$classicalPower, listB[["metric"]]$classicalPower, along = 3)
returnList[["metric"]]$FDR <- abind::abind(listA[["metric"]]$FDR, listB[["metric"]]$FDR, along = 3)
returnList[["metric"]]$FDC <- abind::abind(listA[["metric"]]$FDC, listB[["metric"]]$FDC, along = 3)
returnList[["metric"]]$probTP <- abind::abind(listA[["metric"]]$probTP, listB[["metric"]]$probTP, along = 3)
return(returnList)
}
combine_totSampleSizes <- function(listA, listB){
if(is.null(listA)) return(listB)
returnList <- list()
returnList[["power"]] <- cbind(listA[["power"]], listB[["power"]])
returnList[["delta"]] <- cbind(listA[["delta"]], listB[["delta"]])
returnList[["metric"]]$marTypeI <- cbind(listA[["metric"]]$marTypeI, listB[["metric"]]$marTypeI)
returnList[["metric"]]$classicalPower <- cbind(listA[["metric"]]$classicalPower, listB[["metric"]]$classicalPower)
returnList[["metric"]]$FDR <- cbind(listA[["metric"]]$FDR, listB[["metric"]]$FDR)
returnList[["metric"]]$FDC <- cbind(listA[["metric"]]$FDC, listB[["metric"]]$FDC)
returnList[["metric"]]$probTP <- cbind(listA[["metric"]]$probTP, listB[["metric"]]$probTP)
return(returnList)
}
if(is.null(deltaSD)){
# finding tau and K for target DM CpG's
cat(paste("[",Sys.time(),"] ", "Finding tau...", sep = ""))
K <- NULL
tau <- NULL
for(d in seq_along(targetDelta)){
myTau <- getTau(targetDmCpGs, targetDelta[d], methPara, detectionLimit, J, CpGonArray)
tau[d] <- myTau$tau
K[d] <- myTau$K # number of changed CpGs
}
cat(paste("done", " [",Sys.time(),"]\n", sep = ""))
print(paste("The following taus were chosen: ", paste(tau, collapse = ", "), sep = ""))
} else {
tau <- deltaSD
K <- NULL
for(d in seq_along(tau)){
K[d] <- getK(targetDmCpGs, methPara, detectionLimit, J, CpGonArray, tau)
}
}
# main function
startTime <- Sys.time()
cat(paste("[",startTime,"] ", "Running simulation\n", sep = ""))
iterations <- length(totSampleSizes) * length(tau)
pb <- utils::txtProgressBar(max = iterations, style = 3)
progress <- function(n) utils::setTxtProgressBar(pb, n)
opts <- list(progress = progress)
Ntot <- NULL
multiThreadOut <- foreach(d = seq_along(tau),
.combine = combine_tau,
.packages = c("truncnorm", "limma", "CpGassoc", "genefilter"),
.export = c("getAlphBet", "getMeanVar", "beta2Mvalue", "limma", "ttestSlow", "ttestFast", "Wilcox", "CPGassoc")) %:%
foreach(Ntot = totSampleSizes, .combine = combine_totSampleSizes, .options.snow = opts) %dopar% {
utils::setTxtProgressBar(pb, (d-1)*length(totSampleSizes) + which(Ntot==totSampleSizes))
Ncnt <- round(Ntot * NcntPer)
Ntx <- Ntot - Ncnt
marPower <- NULL
deltaSim <- NULL
marTypeI <- NULL
FDR <- NULL
classicalPower <- NULL
FDC <- NULL
probTP <- NULL
for(sim in seq_len(sims)){
## sample CpGs
cpgIdx <- sample(x = seq_len(CpGonArray), size = J, replace = TRUE) # pick J random CpG's to be simulated
cpgIdxName <- paste(seq_len(J), "_", rownames(methPara)[cpgIdx], sep = "") # ensuring unique CpG name (allowing unique sampling with replacement)
changedCpgsIdx <- sample(x = cpgIdx, size = K[d]) # pick K random CpG's to be changed in mean meth
changedCpgsIdxName <- cpgIdxName[match(changedCpgsIdx, cpgIdx)]
## Change Mu for "changedCpgsIdx"'s CpG's
# drawing delta from truncated normal
delta <- truncnorm::rtruncnorm(1, mean = 0, sd = as.numeric(tau[d]),
a=0.5 - methPara$mu[changedCpgsIdx] - sqrt(0.25-methPara$var[changedCpgsIdx]),
b=0.5 - methPara$mu[changedCpgsIdx] + sqrt(0.25-methPara$var[changedCpgsIdx]))
deltaSim <- c(deltaSim, delta) # multi core
meaningfulDM <- (abs(delta) >= detectionLimit)
meaningfulDMName <- changedCpgsIdxName[meaningfulDM]
# changing mean methylation for specific CpG's (changedCpgsIdx)
muToBeSimuUNchanged <- methPara$mu[cpgIdx]
muToBeSimuChanged <- methPara$mu[cpgIdx]
muToBeSimuChanged[match(changedCpgsIdx, cpgIdx)] <- muToBeSimuChanged[match(changedCpgsIdx, cpgIdx)] + delta
## get alpha and beta
params_unchanged <- getAlphBet(myMean = muToBeSimuUNchanged, myVar = methPara$var[cpgIdx])
alpha_unchanged <- params_unchanged$alpha
beta_unchanged <- params_unchanged$beta
params_changed <- getAlphBet(myMean = muToBeSimuChanged, myVar = methPara$var[cpgIdx])
alpha_changed <- params_changed$alpha
beta_changed <- params_changed$beta
## simulate baseline beta values
g1Beta <- NULL
g2Beta <- NULL
g1Beta <- matrix(stats::rbeta(J*Ncnt, rep(alpha_unchanged, each = Ncnt), rep(beta_unchanged, each = Ncnt)), ncol = Ncnt, byrow = TRUE)
g2Beta <- matrix(stats::rbeta(J*Ntx, rep(alpha_changed, each = Ntx), rep(beta_changed, each = Ntx)), ncol = Ntx, byrow = TRUE)
g1Beta[g1Beta == 1] <- max(g1Beta[g1Beta != 1]) # replacing 0/1 by min/max
g2Beta[g2Beta == 1] <- max(g2Beta[g2Beta != 1])
g1Beta[g1Beta == 0] <- min(g1Beta[g1Beta != 0])
g2Beta[g2Beta == 0] <- min(g2Beta[g2Beta != 0])
rownames(g1Beta) <- rownames(g2Beta) <- paste(seq_len(J),"_",names(alpha_unchanged),sep = "")
# Tests
DMtest <- switch(DMmethod,
"t-test (unequal var)" = ttestSlow(g1Beta,g2Beta,Ncnt,Ntx,paired=FALSE), # t-test slow (unequal var)
"t-test (equal var)" = ttestFast(g1Beta,g2Beta,Ncnt,Ntx), # t-test (equal var)
"CPGassoc" = CPGassoc(g1Beta, g2Beta, Ncnt,Ntx), # CPG assoc
"Wilcox rank sum" = Wilcox(g1Beta, g2Beta, Ncnt,Ntx), # Wilcox rank sum test
"limma" = limma(g1Beta, g2Beta, Ncnt,Ntx), # limma
stop("Test not found")
)
# table from paper
notDM <- cpgIdxName[!(cpgIdxName %in% changedCpgsIdxName)]
DM_negligible <- changedCpgsIdxName[!(changedCpgsIdxName %in% meaningfulDMName)]
DM_meaningful <- changedCpgsIdxName[changedCpgsIdxName %in% meaningfulDMName]
FP <- intersect(cpgIdxName[DMtest$fdr<FDRcritVal], notDM) # FP
NP <- intersect(cpgIdxName[DMtest$fdr<FDRcritVal], DM_negligible) # NP
TP <- intersect(cpgIdxName[DMtest$fdr<FDRcritVal], DM_meaningful) # TP
detectedCpGs <- cpgIdxName[DMtest$fdr<FDRcritVal]
TN <- intersect(cpgIdxName[!(DMtest$fdr<FDRcritVal)], notDM) # TN
NN <- intersect(cpgIdxName[!(DMtest$fdr<FDRcritVal)], DM_negligible) # NN
FN <- intersect(cpgIdxName[!(DMtest$fdr<FDRcritVal)], DM_meaningful) # FN
## metrics
marPower[sim] <- ifelse(length(DM_meaningful) > 0, length(TP)/length(DM_meaningful), NA)
marTypeI[sim] <- ifelse(length(notDM) > 0, length(FP)/length(notDM), NA)
FDR[sim] <- ifelse(length(detectedCpGs) > 0, length(FP)/length(detectedCpGs), NA)
FDC[sim] <- ifelse(length(TP) > 0, (length(FP))/length(TP), NA) # expected false discovery for each true discovery
classicalPower[sim] <- (length(NP)+length(TP))/(length(DM_negligible)+length(DM_meaningful))
probTP[sim] <- ifelse(length(TP)>0, 1, 0)
} # end sim
outSim <- list()
outSim[["power"]] <- marPower # multi core
outSim[["delta"]] <- deltaSim # multi core
outSim[["metric"]]$marTypeI <- marTypeI
outSim[["metric"]]$FDR <- FDR
outSim[["metric"]]$classicalPower <- classicalPower
outSim[["metric"]]$FDC <- FDC
outSim[["metric"]]$probTP <- probTP
outSim
} # end tau and totSampleSizes
close(pb)
parallel::stopCluster(cl)
cat(paste("[",startTime,"] Running simulation ... done [",Sys.time(),"]\n", sep = ""))
# dirty fix
if(is.null(targetDelta)) targetDelta <- tau
# calculate mean marginal power and adding names
if(length(targetDelta) == 1 & length(totSampleSizes) == 1) output$meanPower <- matrix(mean(multiThreadOut[["power"]], na.rm=TRUE))
if(length(targetDelta) == 1 & length(totSampleSizes) > 1) output$meanPower <- matrix(apply(multiThreadOut[["power"]], 2, mean, na.rm=TRUE), ncol = 1)
if(length(targetDelta) > 1) output$meanPower <- apply(multiThreadOut[["power"]], c(2,3), mean, na.rm=TRUE)
rownames(output$meanPower) <- totSampleSizes
colnames(output$meanPower) <- targetDelta
# powerArray
if(length(targetDelta) == 1) output$powerArray <- array(data = multiThreadOut[["power"]], dim = c(sims, length(totSampleSizes), length(targetDelta)))
if(length(targetDelta) > 1 & length(totSampleSizes) == 1) output$powerArray <- multiThreadOut[["power"]]
if(length(targetDelta) > 1) output$powerArray <- multiThreadOut[["power"]]
dimnames(output$powerArray) <- list(seq_len(sims), totSampleSizes, targetDelta)
# deltaArray
if(length(targetDelta) == 1 & length(totSampleSizes) == 1) output$deltaArray <- list(matrix(multiThreadOut[["delta"]]))
if(length(targetDelta) == 1 & length(totSampleSizes) > 1) output$deltaArray <- list(multiThreadOut[["delta"]])
if(length(targetDelta) > 1 & length(totSampleSizes) == 1) output$deltaArray <- lapply(multiThreadOut[["delta"]],as.matrix)
if(length(targetDelta) > 1 & length(totSampleSizes) > 1) output$deltaArray <- multiThreadOut[["delta"]]
names(output$deltaArray) <- targetDelta
for(d in seq_along(targetDelta)){
colnames(output$deltaArray[[d]]) <- totSampleSizes
}
# calculate mean marTypeI and adding names
if(length(targetDelta) == 1 & length(totSampleSizes) == 1) output$metric$marTypeI <- matrix(mean(multiThreadOut[["metric"]]$marTypeI, na.rm=TRUE))
if(length(targetDelta) == 1 & length(totSampleSizes) > 1) output$metric$marTypeI <- matrix(apply(multiThreadOut[["metric"]]$marTypeI, 2, mean, na.rm=TRUE), ncol = 1)
if(length(targetDelta) > 1) output$metric$marTypeI <- apply(multiThreadOut[["metric"]]$marTypeI, c(2,3), mean, na.rm=TRUE)
rownames(output$metric$marTypeI) <- totSampleSizes
colnames(output$metric$marTypeI) <- targetDelta
# calculate mean classicalPower and adding names
if(length(targetDelta) == 1 & length(totSampleSizes) == 1) output$metric$classicalPower <- matrix(mean(multiThreadOut[["metric"]]$classicalPower, na.rm=TRUE))
if(length(targetDelta) == 1 & length(totSampleSizes) > 1) output$metric$classicalPower <- matrix(apply(multiThreadOut[["metric"]]$classicalPower, 2, mean, na.rm=TRUE), ncol = 1)
if(length(targetDelta) > 1) output$metric$classicalPower <- apply(multiThreadOut[["metric"]]$classicalPower, c(2,3), mean, na.rm=TRUE)
rownames(output$metric$classicalPower) <- totSampleSizes
colnames(output$metric$classicalPower) <- targetDelta
# calculate mean FDR and adding names
if(length(targetDelta) == 1 & length(totSampleSizes) == 1) output$metric$FDR <- matrix(mean(multiThreadOut[["metric"]]$FDR, na.rm=TRUE))
if(length(targetDelta) == 1 & length(totSampleSizes) > 1) output$metric$FDR <- matrix(apply(multiThreadOut[["metric"]]$FDR, 2, mean, na.rm=TRUE), ncol = 1)
if(length(targetDelta) > 1) output$metric$FDR <- apply(multiThreadOut[["metric"]]$FDR, c(2,3), mean, na.rm=TRUE)
rownames(output$metric$FDR) <- totSampleSizes
colnames(output$metric$FDR) <- targetDelta
# calculate mean FDC and adding names
if(length(targetDelta) == 1 & length(totSampleSizes) == 1) output$metric$FDC <- matrix(mean(multiThreadOut[["metric"]]$FDC, na.rm=TRUE))
if(length(targetDelta) == 1 & length(totSampleSizes) > 1) output$metric$FDC <- matrix(apply(multiThreadOut[["metric"]]$FDC, 2, mean, na.rm=TRUE), ncol = 1)
if(length(targetDelta) > 1) output$metric$FDC <- apply(multiThreadOut[["metric"]]$FDC, c(2,3), mean, na.rm=TRUE)
rownames(output$metric$FDC) <- totSampleSizes
colnames(output$metric$FDC) <- targetDelta
# calculate probability of detecting at least one TP and adding names
if(length(targetDelta) == 1 & length(totSampleSizes) == 1) output$metric$probTP <- matrix(mean(multiThreadOut[["metric"]]$probTP, na.rm=TRUE))
if(length(targetDelta) == 1 & length(totSampleSizes) > 1) output$metric$probTP <- matrix(apply(multiThreadOut[["metric"]]$probTP, 2, mean, na.rm=TRUE), ncol = 1)
if(length(targetDelta) > 1) output$metric$probTP <- apply(multiThreadOut[["metric"]]$probTP, c(2,3), mean, na.rm=TRUE)
rownames(output$metric$probTP) <- totSampleSizes
colnames(output$metric$probTP) <- targetDelta
return(output)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment