Skip to content

Instantly share code, notes, and snippets.

@AkselA
Last active February 3, 2023 07:56
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 2 You must be signed in to fork a gist
  • Save AkselA/9dff13ef662f64188f708c22e0d455ad to your computer and use it in GitHub Desktop.
Save AkselA/9dff13ef662f64188f708c22e0d455ad to your computer and use it in GitHub Desktop.
T-test for partially paired data
set.seed(1)
n <- 40
p <- 0.3
r <- 0.5
m <- MASS::mvrnorm(n, c(0, 0), matrix(c(1, r, r, 1), 2))
m <- t(t(m) * c(1, 2))
m <- t(t(m) + c(51, 50))
m <- round(m, 1)
m[sample(n*2, p*n*2)] <- NA
colnames(m) <- c("test", "cont")
pair <- complete.cases(m)
t.test(m[,"test"][pair], m[,"cont"][pair], paired=TRUE, var.equal=FALSE)
t.test(m[,"test"], m[,"cont"], paired=FALSE, var.equal=FALSE)
t.test.partial(m)
list(paired = structure(list(test = c(52.87, 52.37, 47.39,
52.56, 54.43, 50.79), cont = c(52.51, 57.87, 51.41,
57.37, 56.84, 52.47)), class = "data.frame", row.names =
c(NA, -6L)), unpaired = structure(list(test = c(52.32, 53.26,
53.16, 50.23, 52.34, NA, NA, NA), cont = c(47.21, 46.38, 54.34,
60.15, 55.08, 55.4, 58.05, 55.88)), class = "data.frame",
row.names = c(NA, -8L))) -> m2
# m2 population ststistics
# mean test = 51
# mean cont = 54
# sd test = 2
# sd cont = 3.5
# cor = 0.8
# original n = 25
# % missing = 50
m2rb <- do.call(rbind, m2)
colMeans(m2rb, na.rm=TRUE)
apply(m2rb, 2, sd, na.rm=TRUE)
t.test(m2$paired[,1], m2$paired[,2], paired=TRUE)
t.test(m2rb[,1], m2rb[,2])
t.test.partial(m2$paired, m2$unpaired)
library(zoo)
library(MASS)
# parameters
set.seed(1)
rep <- 1e4
tty <- "samp"
if (tty == "cor") {
np <- floor(seq(6, 6, l=rep))
nu1 <- floor(seq(5, 5, l=rep))
nu2 <- floor(seq(8, 8, l=rep))
nu <- pmax(nu1, nu2)
mu1 <- seq(1, 1, l=rep)
mu2 <- seq(4, 4, l=rep)
sd1 <- seq(2, 2, l=rep)
sd2 <- seq(3.5, 3.5, l=rep)
r <- seq(0, 1, l=rep)
} else {
np <- floor(seq(2, 22, l=rep))
nu1 <- floor(seq(5, 5, l=rep))
nu2 <- floor(seq(8, 8, l=rep))
nu <- pmax(nu1, nu2)
mu1 <- seq(0, 2, l=rep)
mu2 <- seq(4, 4, l=rep)
sd1 <- seq(2, 2, l=rep)
sd2 <- seq(1, 5, l=rep)
r <- seq(0.6, 0.6, l=rep)
}
# test sequence
l <- list()
for (i in 1:rep) {
m <- mvrnorm(np[i], c(0, 0), matrix(c(1, r[i], r[i], 1), 2))
m[,1] <- m[,1] * sd1[i]
m[,2] <- m[,2] * sd2[i]
m[,1] <- m[,1] + mu1[i]
m[,2] <- m[,2] + mu2[i]
m <- as.data.frame(m, 2)
colnames(m) <- c("test", "cont")
m.u <- list(rnorm(nu1[i], mu1[i], sd1[i]), rnorm(nu2[i], mu2[i], sd2[i]))
m.u <- as.data.frame(lapply(m.u, "length<-", nu[i]))
colnames(m.u) <- c("test", "cont")
l[[i]] <- list(paired=m, unpaired=m.u)
}
t.l <- list()
for (i in 1:length(l)) {
t.l[[i]] <- with(l[[i]], list(
paired=t.test(paired[,1], paired[,2], paired=TRUE),
mix=t.test(c(paired[,1], unpaired[,1]), c(paired[,2], unpaired[,2])),
partial=t.test.partial(paired, unpaired)
)
)
}
col1 <- hsv(h=c(0, 240)/360, s=c(0.7, 0.7))
col2 <- hsv(h=c(0, 0, 240)/360, s=c(0.7, 0.7, 0.7), v=c(0, 1, 1))
col3 <- hsv(h=c(120, 30, 280)/360, s=c(0.7, 1, 0.8), v=c(0.8, 0.9, 0.9))
pvals <- function(x) {
do.call(rbind,
lapply(x, function(y) {
sapply(y, function(z) c(z$p.value))
}
))
}
# plot
layout(matrix(1:5), heights=c(1, 1, 1, 2, 2))
par(mar=c(0.1, 3, 0.2, 1.2), mgp=c(2, 0.6, 0), oma=c(1.5, 0, 0.2, 0),
xaxs="i", family="PT Mono", cex.lab=0.9, cex.axis=0.9)
matplot(cbind(mu1, mu2),
type="l", lty=1, lwd=1, col=col1, ylab="mean", xaxt="n", ylim=c(-0.2, 5))
matplot(cbind(sd1, sd2),
type="l", lty=1, lwd=1, col=col1, ylab="sd", xaxt="n", ylim=c(1, 5.3))
plot(r,
type="l", ylab="cor", xaxt="n", ylim=c(-0.05, 1))
matplot(cbind(np, nu1, nu2),
type="l", lty=1, lwd=1, col=col2, ylab="n samples", xaxt="n", ylim=c(2, 21.5),
frame.plot=FALSE)
box()
legend("topleft", c("paired", "unpaired1", "unpaired2"),
bty="n", text.col=col2, cex=0.9)
matplot(rollapply(pvals(t.l), 501, median, partial=TRUE),
type="l", lty=1, ylab="p-value (rolling median)", col=col3, ylim=c(0, 0.1),
frame.plot=FALSE)
abline(h=c(0.05, 0.01), col="#00000022")
text(rep, c(0.05, 0.01), c("5%", "1%"), xpd=TRUE, cex=0.9, col="grey", adj=-0.15)
box()
legend("topright", c("paired", "unpaired", "partial"),
bty="n", text.col=col3, cex=0.9)
quartz.save(paste0("ttp.", tty, ".pdf"), type="pdf")
t.test.partial <- function(paired, unpaired.x, unpaired.y,
alternative=c("two.sided", "less", "greater"), conf.level=0.95) {
# Also known as "Optimal pooled t-test"
# Beibei Guo and Ying Yuan (2015/2017)
# DOI: 10.1177/0962280215577111
if (any(is.na(paired)) & NCOL(paired) > 1) {
compc <- complete.cases(paired)
unpaired.x <- paired[!compc, 1]
unpaired.x <- unpaired.x[!is.na(unpaired.x)]
unpaired.y <- paired[!compc, 2]
unpaired.y <- unpaired.y[!is.na(unpaired.y)]
paired <- paired[compc, 1:2]
} else {
if (missing(unpaired.x) & missing(unpaired.y)) {
stop("missing unpaired samples")
}
}
# paired samples
if (NCOL(paired) > 1) {
xpd <- paired[, 1] - paired[, 2]
} else xpd <- paired
np <- length(xpd)
# unpaired samples
if (NCOL(unpaired.x) > 1) {
unpaired.y <- unpaired.x[, 2]
unpaired.x <- unpaired.x[, 1]
}
if (length(unpaired.x) < 2 | length(unpaired.y) < 2) {
stop("Too few unpaired samples")
}
xu1 <- unpaired.x
xu2 <- unpaired.y
xu1 <- xu1[!is.na(xu1)]
xu2 <- xu2[!is.na(xu2)]
nu1 <- length(xu1)
nu2 <- length(xu2)
vu1 <- var(xu1)
vu2 <- var(xu2)
# estimate variances
vp <- var(xpd)/np
vu <- vu1/nu1 + vu2/nu2
# weights
wt <- (1/vp) + (1/vu)
wp <- 1 / (wt * vp)
wu <- 1 / (wt * vu)
# estimates
ep <- mean(xpd)
eu <- mean(xu1) - mean(xu2)
et <- (ep*wp) + (eu*wu)
# degrees of freedom
dfp <- np - 1
dfu <- vu^2 / ( ((vu1/nu1)^2 / (nu1 - 1)) +
((vu2/nu2)^2 / (nu2 - 1)) )
dfd <- 1 / ((wp^2)/dfp + (wu^2)/dfu) # df of t-dist
# total estimate variance (standard error)
vt <- (1 + (4*wp*(1 - wp)/dfp) +
(4*wu*(1 - wu)/dfu))/wt
se <- sqrt(vt)
# t statistic
Ts <- et/se
# confidence interval
alpha <- 1 - conf.level
alt <- match.arg(alternative)
cint <- switch(alt,
"less" = c(-Inf, Ts + qt(conf.level, dfd)),
"greater" = c(Ts - qt(conf.level, dfd), Inf),
"two.sided" = {
ci <- qt(1 - alpha/2, dfd)
Ts + c(-ci, ci)
}
)
cint <- cint * se
# p-value
pval <- switch(alt,
"less" = pt(Ts, dfd),
"greater" = pt(Ts, dfd, lower.tail=FALSE),
"two.sided" = 2 * pt(-abs(Ts), dfd)
)
names(Ts) <- "t"
names(dfd) <- "df"
names(et) <- "mean of the differences"
attr(cint, "conf.level") <- conf.level
rval <- list(statistic=Ts, parameter=dfd, p.value=pval,
conf.int=cint, estimate=et, alternative=alt,
method="Partially paired t-test")
class(rval) <- "htest"
rval
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment