Skip to content

Instantly share code, notes, and snippets.

@arraytools
Last active August 22, 2022 13:54
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save arraytools/d2e07a5df3377465c39ecc98551f0c0f to your computer and use it in GitHub Desktop.
Save arraytools/d2e07a5df3377465c39ecc98551f0c0f to your computer and use it in GitHub Desktop.
t-test for genomic data
rowVars <- function(x, na.rm=FALSE, dims=1, unbiased=TRUE, SumSquares=FALSE,
twopass=FALSE) {
if (SumSquares) return(rowSums(x^2, na.rm, dims))
N <- rowSums(!is.na(x), FALSE, dims)
Nm1 <- if (unbiased) N-1 else N
if (twopass) {x <- if (dims==0) x - mean(x, na.rm=na.rm) else
sweep(x, 1:dims, rowMeans(x,na.rm,dims))}
(rowSums(x^2, na.rm, dims) - rowSums(x, na.rm, dims)^2/N) / Nm1
}
Vat2 <- function(x,label,grp=c(0,1), rvm, a,b, warn = TRUE) {
# Two sample t test and v test
# category is defined by 0,1,2,... with 0 the 2nd set of samples
# Output:
# mn: grp[2] - grp[1] OR x_{label==1} - x_{label==0}
# mn1 or mntest: grp[2] OR x_{label==1} OR mean of the test samples
# mn2 or mnref: grp[1] OR x_{label==0} OR mean of the reference samples
# weight: weight=(n-k)/[(n-k)+2a], k is the number of classes
# k=2 in this case.
#
# tmp <- Vat2(matrix(c(1:3, 11:13),1,6), c(0,0,0,1,1,1), rvm = FALSE)
# tmp$tp # 0.0002552167
#
# Method 2.
# t.test(1:3, 11:13, var.equal = T)$p.value # 0.0002552167
#
# Method 3.
# require(sva)
# pheno <- data.frame(group = c(0,0,0,1,1,1))
# mod = model.matrix(~as.factor(group), data = pheno)
# mod0 = model.matrix(~1, data = pheno)
# f.pvalue(matrix(c(1:3, 11:13),1,6), mod, mod0) # 0.0002552167
#
# Method 4.
# library(limma)
# design <- model.matrix(~ factor(c(1,1,1,2,2,2))) # number of samples x number of groups
# colnames(design) <- c("group1", "group2")
# fit <- lmFit(matrix(c(1:3, 11:13),1,6), design)
# unmod.t <- fit$coefficients/fit$stdev.unscaled/fit$sigma
# pval <- 2*pt(-abs(unmod.t), fit$df.residual) # (intercept, group)
# pval[2] # [1] 0.0002552167
#
# Method 5. seems to be the easiest
# library(genefilter)
# r1 = rowttests(matrix(c(1:3, 11:13),1,6), factor(c(0,0,0,1,1,1)))
# r1$p.value # [1] 0.0002552167
label <- as.integer(label)
if (any(table(label) == 1)) {
if (warn) warning("At least two observations are needed in each class for the T test")
return(list(er=1))
}
dat1 <- x[,label==grp[2],drop = FALSE]
dat2 <- x[,label==grp[1],drop = FALSE]
vr1 <- rowVars(dat1,na.rm=TRUE)
vr2 <- rowVars(dat2,na.rm=TRUE)
n1 <- rowSums(!is.na(dat1))
n2 <- rowSums(!is.na(dat2))
mn1 <- rowMeans(dat1,na.rm=TRUE)
mn2 <- rowMeans(dat2,na.rm=TRUE)
mn <- mn1-mn2
n <- n1+n2
vr <- (n1-1)*vr1+(n2-1)*vr2
vr <- vr/(n-2)
newn <- 1/(1/n1+1/n2)
t <- mn/sqrt(vr/newn)
n[n<3] <- 3
tp <- 2*(1-pt(abs(t),df=n-2))
out <- list(sequid=1:length(n1), n1=n1, n2=n2, mn1=mn1, mn2=mn2,
vr1=vr1, vr2=vr2, mn=mn, vr=vr, n=n, t=t, tp=tp, mntest=mn1, mnref=mn2, er=0)
if (rvm) {
if (missing(a) | missing(b)) {
a <- getab(vr,n-2)
b <- a[2]
a <- a[1]
}
v <- newn*(1+2*a/(n-2))/(vr+2/((n-2)*b))
v <- mn*sqrt(v)
vp <- 2*(1-pt(abs(v),df=n-2+2*a))
wt <- (n-2)/((n-2)+2*a)
out$v <- v
out$vp <- vp
out$weight <- wt
out$a <- a
out$b <- b
}
invisible(out)
}
set.seed(1)
ex <- matrix(rnorm(1000*10), nr=1000)
grp <- c(rep(0,5), rep(1, 5))
# Method 1. Vat2()
system.time(tmp <- Vat2(ex, grp, rvm = FALSE)) # .002
p1 <- tmp$tp
# Method 2. t.test
p2 <- rep(NA, nrow(ex))
system.time(for(i in 1:nrow(ex)) p2[i] <- t.test(ex[i, 1:5], ex[i, 6:10], var.equal = TRUE)$p.value) # 0.218
# Method 3. sva
pheno <- data.frame(group = grp)
mod = model.matrix(~as.factor(group), data = pheno)
mod0 = model.matrix(~1, data = pheno)
system.time(p3 <- f.pvalue(ex, mod, mod0) ) # .001
# Method 4. limma
design <- model.matrix(~ factor(grp)) # number of samples x number of groups
colnames(design) <- c("intercept", "group")
system.time(fit <- lmFit(ex, design)) # .003
unmod.t <- fit$coefficients/fit$stdev.unscaled/fit$sigma
pval <- 2*pt(-abs(unmod.t), fit$df.residual)
p4 <- pval[, 2]
# Method 5. genefilter
system.time(r1 <- rowttests(ex, factor(grp)) ) # .002
p5 <- r1$p.value
all.equal(p1, p5)
all.equal(p2, p5)
all.equal(p3, p5)
all.equal(p4, p5)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment