@eschen42 · Last active August 9, 2023
Get unadjusted and adjusted p-values from LCT-B and LCT-N (see [comment](https://gist.github.com/eschen42/ce226d4f924dae9e0e4bbf87ca8f6b0b?permalink_comment_id=4653172#gistcomment-4653172))
# LCTboot_inspect - TestCor::LCTboot modified:
# - to return a "reportables" data.frame as an attribute of the result
# - (for estimating the adjusted and unadjusted p-values)
# - to return the function environment as an attribute of the result
# - (for debugging reportables)
# - to use the normal approximation when Nboot == 0
# - (so that the same code base may be used for LCT-N and LCT-B)
# ref: https://github.com/cran/TestCor/blob/9d5d2a9a1ac284e3346bd144776559cbd82a8b52/R/FdrMethods.R
#' Bootstrap procedure LCT-B proposed by Cai & Liu (2016) for correlation testing.
#'
#' @param input_data matrix of observations
#' @param alpha level of multiple testing
#' @param stat_test
#' \describe{
#' \item{'empirical'}{\eqn{\sqrt{n}*abs(corr)}}
#' \item{'fisher'}{\eqn{\sqrt{n-3}*1/2*\log( (1+corr)/(1-corr) )}}
#' \item{'student'}{\eqn{\sqrt{n-2}*abs(corr)/\sqrt{1-corr^2}}}
#' \item{'2nd.order'}{\eqn{\sqrt{n}*mean(Y)/sd(Y)} with \eqn{Y=(X_i-mean(X_i))(X_j-mean(X_j))}}
#' }
#' @param Nboot number of iterations for bootstrap quantile evaluation; if 0, the normal approximation is used instead (LCT-N)
#' @param vect if TRUE returns a vector of TRUE/FALSE values, corresponding to \code{vectorize(cor(data))};
#' if FALSE, returns an array containing TRUE/FALSE values for each entry of the correlation matrix
#' @param arr.ind if TRUE, returns the indexes of the significant correlations, with respect to level alpha
#'
#' @return Returns \itemize{\item{logicals, equal to TRUE if the corresponding element of the statistic vector is rejected, as a vector or a matrix depending of the value of \code{vect},} \item{an array containing indexes \eqn{\lbrace(i,j),\,i<j\rbrace} for which correlation between variables \eqn{i} and \eqn{j} is significant, if \code{arr.ind=TRUE}.}}
#'
#' @importFrom stats cor quantile var
#'
#' @export
#' @seealso TestCor::ApplyFdrCor, TestCor::LCTNorm
#' @references Cai, T. T., & Liu, W. (2016). Large-scale multiple testing of correlations. Journal of the American Statistical Association, 111(513), 229-240.
#'
#' @examples
#' n <- 100
#' p <- 10
#' corr_theo <- diag(1,p)
#' corr_theo[1,3] <- 0.5
#' corr_theo[3,1] <- 0.5
#' data <- MASS::mvrnorm(n,rep(0,p),corr_theo)
#' alpha <- 0.05
#' # significant correlations:
#' LCTboot_inspect(data, alpha, stat_test = 'empirical', Nboot = 100, arr.ind = TRUE)
LCTboot_inspect <-
function( # requires `TestCor`
input_data,
alpha = 0.05,
stat_test = '2nd.order',
Nboot = 100,
vect = FALSE,
arr.ind = FALSE
) {
if (!(stat_test %in% c('empirical', 'fisher', 'student', '2nd.order'))) {
stop('Wrong value for stat_test.')
}
require(TestCor)
n <- nrow(input_data) # n = number of samples
p <- ncol(input_data) # p = number of proteins (variables)
p_names <- colnames(input_data)
# test statistic
stat_sign <- TestCor::eval_stat(input_data, stat_test)
stat <- abs(stat_sign)
stat_sign <- sign(stat_sign)
m <- length(stat)
# --------------------------------------------------------------------------
# The following lines differ from the original code (apart from formatting)
# and do not change the result; they only prepare an attribute.
# This is a usability hack for the context attribute.
stat_unvectorized <- TestCor::unvectorize(stat)
rownames(stat_unvectorized) <-
colnames(stat_unvectorized) <- colnames(input_data)
# The following hack creates a map from index of unsorted stat to protein
stat_names_casted <- TestCor::unvectorize(seq_along(stat))
rownames(stat_names_casted) <-
colnames(stat_names_casted) <- colnames(input_data)
stat_names <- reshape2::melt(stat_names_casted)
trt_cor <- cor(input_data)
trt_cor_unvectorized <- trt_cor * (stat_unvectorized > 0)
rownames(trt_cor_unvectorized) <-
colnames(trt_cor_unvectorized) <- colnames(input_data)
stat_names$trt_cor <- reshape2::melt(trt_cor_unvectorized)$value
# eliminate the degenerate rows (i.e., rows with zero values)
stat_names <- stat_names[stat_names$value > 0, ]
stat_names <- stat_names[order(stat_names$value), , drop = FALSE]
# --------------------------------------------------------------------------
# sort the results in decreasing order of magnitude of test statistic
# this was: stat_sort <- sort(stat, index.return = TRUE, decreasing = TRUE)
stat_sort <-
data.frame(
T = stat,
ix = seq_len(m),
name = stat_names$value,
trt_cor = stat_names$trt_cor,
pval_alpha = rep.int(0, m),
sqrt_etc = rep.int(0, m),
res = rep.int(0, m),
stat_sign = stat_sign
)
stat_sort <- stat_sort[order(-stat), ]
if (Nboot > 0) {
# evaluation of pval by bootstrap
prop <- matrix(0, nrow = Nboot, ncol = m)
for (nboot in 1:Nboot) {
indb <- sample(seq_len(n), replace = TRUE) # resample the n observations
datab <- input_data[indb, ]
statb <- abs(TestCor::eval_stat(datab, stat_test))
# comparison with stat test
for (i in 1:m) {
prop[nboot, i] <- mean(statb > stat_sort$T[i])
}
}
cd_of_T <- colMeans(prop)
} else {
# evaluation of p-value by normal approximation
cd_of_T <- 2 * pnorm(stat_sort$T, lower.tail = FALSE) # same as 2 * (1 - pnorm(T)), more stable in the tail
}
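# Sketch (not executed): inside the bootstrap loop above, the inner loop
# over i can be collapsed into a single outer() call per replicate; the
# column means of outer(statb, stat_sort$T, ">") equal
# mean(statb > stat_sort$T[i]) for each i, trading memory for speed.
if (FALSE) {
prop[nboot, ] <- colMeans(outer(statb, stat_sort$T, ">"))
}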
# --------------------------------------------------------------------------
# The following lines differ from the original code (apart from formatting)
# and do not change the result; they only prepare an attribute.
stat_rank <- seq(1, m, 1)
stat_quantile <- stat_rank / m
stat_sort$cd_of_T <- cd_of_T
stat_sort$pval_alpha <- alpha - cd_of_T * stat_rank
stat_sort$sqrt_etc <- sqrt(4 * log(p) - 2 * log(log(p))) - stat_sort$T
# The next line is a paraphrase of:
# https://github.com/cran/TestCor/blob/9d5d2a9a1ac284e3346bd144776559cbd82a8b52/R/FdrMethods.R#L52C5-L52C98
stat_sort$ind <-
(cd_of_T <= alpha * stat_quantile) &
(stat_sort$T <= sqrt(4 * log(p) - 2 * log(log(p))))
# The unadjusted p-value is derived by reverse-engineering the previous line:
if (FALSE) { # not executed but not flagged by the code-in-comment linter
# Simplification for compound inequality:
# (1) Original expression, where:
# - n = number of samples
# - p = number of proteins (variables)
# - m = number of nondegenerate test statistics in upper triangle of
# the correlation matrix, i.e., (p ^ 2 - p) / 2
# - T = ranked test-statistic computed by TestCor::eval_stat
# - cd_T = cumulative distribution for (ranked) test-statistic T
# - stat_rank = rank of T
(cd_T < alpha * stat_quantile) & (T <= sqrt(4 * log(p) - 2 * log(log(p))))
# (2) Adjust for real cd_T and assumption that this is logical conjunction
(cd_T <= alpha * stat_quantile) && (T <= sqrt(4 * log(p) - 2 * log(log(p))))
# (3) Isolate alpha
(cd_T * m / stat_rank <= alpha) && (T <= sqrt(4 * log(p)- 2 * log(log(p))))
# (4) Product of lesser terms < product of greater terms,
# because T > 0 and sqrt(yadayada) > 0
T * cd_T * m / stat_rank <= alpha * sqrt(4 * log(p) - 2 * log(log(p)))
# (5) Solve for alpha
alpha >= T * cd_T * m / (stat_rank * sqrt(4 * log(p) - 2 * log(log(p))))
}
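# Hypothetical spot check of the derivation above (not executed): whenever
# the conjunction in (3) holds together with the bound on T, the rearranged
# inequality in (5) must also hold; the converse is not claimed, since
# step (4) follows in one direction only. All *_chk names are illustrative.
if (FALSE) {
set.seed(1)
p_chk <- 10
m_chk <- (p_chk^2 - p_chk) / 2
T_chk <- abs(rnorm(m_chk))
cd_chk <- runif(m_chk, 0, 0.2)
rank_chk <- seq_len(m_chk)
alpha_chk <- 0.05
bound_chk <- sqrt(4 * log(p_chk) - 2 * log(log(p_chk)))
step3 <- (cd_chk * m_chk / rank_chk <= alpha_chk) & (T_chk <= bound_chk)
step5 <- alpha_chk >= T_chk * cd_chk * m_chk / (rank_chk * bound_chk)
stopifnot(all(step5[step3])) # (3) implies (5), elementwise
}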
stat_sort$p_unadj <-
with(
stat_sort,
T * cd_of_T * m / (stat_rank * sqrt(4 * log(p) - 2 * log(log(p))))
)
stat_sort$stat_median <- with(stat_sort, median(T))
stat_sort$stat_mad <- with(stat_sort, mad(T))
z_mean <- mean(stat_sort$T)
z_sd <- sd(stat_sort$T)
stat_sort$z_score <- (stat_sort$T - z_mean) / z_sd
stat_sort$p_adj <- 2 * pnorm(stat_sort$T, mean = z_mean, sd = z_sd, lower.tail = FALSE)
# --------------------------------------------------------------------------
# truncated BH threshold
ind <- which(stat_sort$ind)
if (length(ind) == 0) {
stat_cutoff <- 2 * sqrt(log(p)) # use normal approx. when forced by low alpha
} else {
stat_cutoff <- stat_sort$T[max(ind)] # use T from distrib. when alpha large enough
}
res <- (stat >= stat_cutoff)
stat_sort$res <- (stat_sort$T >= stat_cutoff)
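# Worked example of the fallback (a sketch, not executed): with p = 10
# variables, the normal-approximation cutoff is 2 * sqrt(log(10)), about
# 3.03, so only statistics at least that large are retained when no index
# satisfies the truncated BH condition.
if (FALSE) {
2 * sqrt(log(10)) # ~ 3.035
}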
if (arr.ind) {
vect <- FALSE
}
if (!vect) {
res <- (TestCor::unvectorize(res) > 0)
}
if (arr.ind) {
res <- TestCor::whichCor(res)
}
# --------------------------------------------------------------------------
# The following lines differ from the original code (apart from formatting)
# and do not change the result; they only prepare an attribute.
stat_resort <- stat_sort[order(stat_sort$ix), ]
p_unadj <- stat_resort$p_unadj
p_adj <- stat_resort$p_adj
if (is.null(p_unadj)) {
cat("stat_resort$p_unadj unexpectedly produced null\n", file = stderr())
}
# The "adjustment factor" relates the unadjusted p-value to the cutoff
reportables <-
data.frame(
var1 = as.character(stat_names[, "Var1"]),
var2 = as.character(stat_names[, "Var2"]),
ix = stat_resort$ix,
trt_cor = stat_resort$trt_cor,
test_statistic = stat_resort$T,
test_sign = stat_resort$stat_sign,
z_score = stat_resort$z_score,
z_relative = stat_resort$z_score / stat_resort$stat_median,
stat_median = stat_resort$stat_median,
stat_mad = stat_resort$stat_mad,
stat_cutoff = stat_cutoff,
p_unadj = p_unadj,
p_adj = p_adj,
significant = stat_resort$res,
ind = stat_resort$ind
)
# The significance determination has already been made; therefore the
# following handling of the adjusted p-value seems as valid as adjusting
# a p-value in the first place ...
reportables <-
reportables[order(-reportables$significant, -reportables$test_statistic), ]
if (FALSE) {
sig_pval <- with(d <- reportables, d[d$significant, ]$p_unadj)
if (sum(is.na(sig_pval)) > 0)
sig_pval[is.na(sig_pval)] <- 1.0
nonsig_pval <- with(d <- reportables, d[!d$significant, ]$p_unadj)
if (sum(is.na(nonsig_pval)) > 0)
nonsig_pval[is.na(nonsig_pval)] <- 1.0
# scale p-value to fit within decision threshold
if (length(sig_pval) > 0 && max(sig_pval) > alpha) {
sig_pval <- 1.0001 * alpha * sig_pval / max(sig_pval)
}
# scale p-value to fall outside decision threshold
plen <- min(5, length(nonsig_pval))
pseudo_min <- min(nonsig_pval[seq_len(plen)])
if (pseudo_min < alpha) {
cat("initial min(nonsig_pval[1:n]) is ", pseudo_min, "\n", file = stderr())
# raise min to alpha, i.e., lower 1 - p to 1 - alpha
nonsig_pval <- 1 - nonsig_pval
nonsig_pval <- nonsig_pval * 0.9999 * pseudo_min / alpha
nonsig_pval <- 1 - nonsig_pval
cat("final min(nonsig_pval[1:n]) is ", nonsig_pval[1:plen], "\n", file = stderr())
}
reportables$p_adj <- c(sig_pval, nonsig_pval)
}
# restore order of reportables
reportables <- reportables[order(reportables$ix), ]
attr(res, "reportables") <- reportables
# no need to duplicate the reportables in the context
#rm(reportables)
# return the environment in the event that future inspection is needed.
attr(res, "context") <- (list(new_env, environment)[[1]])()
# End of changes to the original code apart from formatting
# ---------------------------------------------------------------------------
# return result `res`
res
}
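
# A minimal usage sketch (illustrative, not part of the original gist;
# assumes the MASS package is installed and reuses the simulation from
# the roxygen example above):
if (FALSE) {
set.seed(42)
corr_theo <- diag(1, 10)
corr_theo[1, 3] <- corr_theo[3, 1] <- 0.5
dat <- MASS::mvrnorm(100, rep(0, 10), corr_theo)
colnames(dat) <- paste0("v", 1:10) # named columns label the reportables
res <- LCTboot_inspect(dat, alpha = 0.05, stat_test = 'fisher', Nboot = 0)
head(attr(res, "reportables")) # per-pair unadjusted and adjusted p-values
ls(attr(res, "context")) # bindings captured from the function environment
}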
eschen42 commented Aug 6, 2023

I needed a version that would report unadjusted and adjusted p-values.

  • I think that my computation of the unadjusted p-value is correct.
  • I think that my computation of the adjusted p-value is defensible in light of the fact that the p-value is only part of the significance determination.

Demonstration code:

my_test <-
  function(n = 9, half_p = 3, alpha = 0.05, x = NULL) {
    p <- half_p + half_p
    trt <- c(rep.int(-0.5, half_p), rep.int(0.5, half_p))
    if (is.null(x)) {
      x <- cbind(trt, t(sapply(trt, function(s) rnorm(n, mean = s))))
      }
    rownames(x) <- c(paste0("lo", 1:half_p), paste0("hi", 1:half_p))
    colnames(x) <- c("trt", paste0("v", 1:n))
    input_data <<- x # stash a copy in the global environment for inspection
    rslt <-
      LCTboot_inspect(
        input_data = x,
        alpha,
        stat_test = c('2nd.order', 'fisher')[2], # i.e., 'fisher'
        Nboot = 0, # Nboot == 0 selects the normal approximation (LCT-N)
        vect = FALSE,
        arr.ind = FALSE
        )
    context <- attr(rslt, "context")
    reportables <- attr(rslt, "reportables")
    trt_reportables <- reportables[reportables$var1 == "trt",]
    if (FALSE) {
      trt_reportables$trt_cor <-
        sapply(
          X = unique(trt_reportables$var2),
          FUN = function(i) cor(x[,1], x[,i])
          )
    }
    # order by pseudo adjusted p, set context, and return
    trt_reportables <- trt_reportables[order(trt_reportables$p_adj), ]
    attr(trt_reportables, "context") <- environment()
    trt_reportables
  }

# order by test statistic, which shows the non-monotonic relationship between the test statistic and the p-values
head({
  my_rslt <- my_test(n = 60, half_p = 15, alpha = 0.05)
  my_rslt[order(-my_rslt$test_statistic),]
}, n = 30)
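
The environment captured by my_test can be inspected afterward, for example (assuming my_rslt from the snippet above):

ctx <- attr(my_rslt, "context")
ls(ctx)               # names bound inside my_test, e.g., reportables, rslt, x
head(ctx$reportables) # the full reportables table, before filtering to "trt"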
