Skip to content

Instantly share code, notes, and snippets.

@brentp
Created May 3, 2013 19:52
Show Gist options
  • Save brentp/5513408 to your computer and use it in GitHub Desktop.
Save brentp/5513408 to your computer and use it in GitHub Desktop.
calculate p-value for a particular beta (or any measure), using a bootstrap-like measure. allows subsampling where as bootstrap is sampling with replacement. allows shuffling of residuals for linear model. the .iter version of each function will perform 200 iterations on the full dataset, then 5000 iterations on those with a simulated p-value < …
options(stringsAsFactors=FALSE)
library(limma)
library(parallel)
# fn returns the statistic of interest from a limma fit object.
bootstrip = function(mat, mod, fn, iterations=100, p_samples=0.5, mc.cores=12){
stopifnot(nrow(mod) == ncol(mat))
ta = mclapply(1:iterations, function(core_i){
idx = sample.int(ncol(mat), p_samples * ncol(mat), replace=TRUE)
msub = mat[,idx,drop=TRUE]
fit = lmFit(msub, mod[idx,,drop=TRUE])
fn(fit)
}, mc.cores=mc.cores)
beta = matrix(NA, nrow(mat), ncol=iterations)
for(i in 1:iterations){
beta[,i] = ta[[i]]
}
colnames(beta) = paste0("sample_", 1:iterations)
rownames(beta) = rownames(mat)
beta
}
bootstrip.iter = function(mat, mod, fn, iterations=c(200, 5000, 1e5), p_samples=0.5, mc.cores=12, smooth.sd=0){
lims = NA
subset = rep(TRUE, nrow(mat))
for(it in iterations){
message(paste("sampling", sum(subset), "rows", it, "times"))
beta = bootstrip(mat[subset,], mod, fn, it, p_samples, mc.cores)
if(any(is.na(lims))){
lims = bootstrip.limits(beta)
} else {
lims[subset,] = bootstrip.limits(beta)
}
lims[subset, "samples"] = it
# repeat on this subset. with a larger number of samples
subset = (lims$pvalue <= (2 / it))
if(sum(subset) < 2){ break }
}
lims$beta.orig = fn(lmFit(mat, mod))
lims
}
bootstrip.limits = function(beta, probs=c(0.025, 0.975)){
# pvalue is 2-tailed probability that the observed
# distribution overlaps 0.
pvalue = apply(beta, 1, function(row){
val=ecdf(row)(0); max(min(val, 1-val) * 2, 1/length(row))
})
df = data.frame(pvalue)
df$beta.mean = apply(beta, 1, mean)
df$beta.median = apply(beta, 1, median)
for(p in probs){
df[,paste0("beta.pct_", gsub(".", "p", p, fixed=T))] =
apply(beta, 1, function(row){ quantile(row, probs=p) })
}
df
}
permute.residuals = function(mat, mod, mod0, iterations=100, p_samples=1, mc.cores=12){
stopifnot(nrow(mod) == ncol(mat))
reduced_lm = lmFit(mat, mod0)
reduced_residuals = residuals(reduced_lm, mat)
reduced_fitted = fitted(reduced_lm)
fit = lmFit(mat, mod)
size = p_samples * nrow(mod)
coef.name = setdiff(colnames(mod), colnames(mod0))
beta.orig = coefficients(fit)[,coef.name]
rm(reduced_lm, fit); gc()
nc = ncol(reduced_residuals)
beta.list = mclapply(1:iterations, function(ix){
# TODO: make sure cols from fit match mod when taking subset
if( p_samples < 1){
sub_ids = sample.int(nc, size=size)
} else {
sub_ids = 1:nc
}
# take the original model and fitted data in same order, but
# shuffle residuals.
mat_sim = reduced_fitted[, sub_ids] + reduced_residuals[,sample(sub_ids)]
coefficients(lmFit(mat_sim, mod[sub_ids,]))[,coef.name]
}, mc.cores=mc.cores)
beta = matrix(NA, nrow(mat), ncol=iterations)
for(i in 1:iterations){
beta[,i] = beta.list[[i]]
}
df = data.frame(
pvalue = unlist(lapply(1:nrow(beta), function(i){
row = beta[i,]
val=ecdf(row)(beta.orig[i]);
max(min(val, 1-val) * 2, 1/length(row))
})), beta.orig = beta.orig)
rownames(df) = rownames(mat)
df
}
permute.residuals.iter = function(mat, mod, mod0, iterations=c(200, 5000, 1e5, 2e6),
p_samples=1, mc.cores=12){
stopifnot(nrow(mod) == ncol(mat))
subset = rep(TRUE, nrow(mat))
df = NA
for(it in iterations){
message(paste("sampling", sum(subset), "rows", it, "times"))
if(any(is.na(df))){
df = permute.residuals(mat, mod, mod0, it, p_samples, mc.cores)
} else {
# TODO: only sending in the most significant to permute. will bias results.
df[subset,] = permute.residuals(mat[subset,], mod, mod0, it, p_samples, mc.cores)
}
df[subset, "samples"] = it
# repeat on this subset. with a larger number of samples
subset = (df$pvalue <= (2 / it))
if(sum(subset) == 0){ break }
}
df
}
#library(multtest)
#mult.permute = function(mat, mod, coef){
#
# MTP(mat, Z=mod, test="lm.XvsZ", Z.incl=1:ncol(mod), Z.test=coef,
# get.adjp=TRUE, get.cr=TRUE)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment