Last active
December 13, 2021 17:13
-
-
Save mikelove/56e00143d486348c59885aed0d9cc4d2 to your computer and use it in GitHub Desktop.
bootRanges workflow thoughts
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# `nullranges` block bootstrapping demo | |
# Michael Love | |
# Dec 13 2021 | |
library(dplyr) | |
library(tidyr) | |
library(plyranges) | |
library(nullranges) | |
# script needed below to make uniformly shuffled features | |
makeUniform <- function(n, R, chrlen) { | |
l <- replicate(R, { | |
p <- sort(GRanges("chr1", IRanges(sample(chrlen-10, n), width=10), strand="+")) | |
p$id <- paste0("pk", 1:n) | |
p | |
}, simplify=FALSE) | |
lens <- lengths(l) | |
gr <- do.call(c, l) | |
mcols(gr)$iter <- Rle(seq_len(R), lens) | |
gr | |
} | |
# make a set of example genes and peaks, the peaks happen to clump, | |
# i.e. there is a region of peaks and a region of no peaks | |
# this is a fairly simplistic case of clumping, whereas | |
# genomes tend to have multi-scale structure | |
chrlen <- 1e5 | |
set.seed(1) | |
ngenes <- 300 | |
genes <- sort(GRanges("chr1", | |
IRanges(sample(chrlen-10,ngenes,TRUE), width=10), | |
strand="+")) | |
npeaks <- 1000 | |
pks <- sort(GRanges("chr1", | |
IRanges(sample(chrlen-10,npeaks,TRUE), width=10), | |
strand="+")) | |
exclude <- GRanges("chr1", | |
IRanges(chrlen*c(.3,.5,.7), | |
width=chrlen/10)) | |
pks <- pks[ !pks %over% exclude ] | |
length(pks) | |
pks$id <- paste0("pk", seq_along(pks)) | |
seqlengths(pks) <- seqlengths(genes) <- c("chr1"=chrlen) | |
# transcription start site for overlaps | |
tss <- genes %>% | |
anchor_5p %>% | |
mutate(width=1) | |
# how many peaks within 30bp of a TSS? | |
obs <- tss %>% | |
mutate(n_overlaps = count_overlaps(., pks, maxgap=30)) %>% | |
summarize(total = mean(n_overlaps)) | |
obs | |
# what if we shuffle peaks, using a block bootstrap | |
# block lengths here are 1/10 of chromosome (not recommended for real data) | |
# this returns a GRanges, with length ~ R * length(pks) | |
# (plus or minus due to random block sampling) | |
R <- 200 | |
length(pks) * R # expect length of bootstraps concatenated | |
system.time({ | |
pks_prime <- pks %>% | |
bootRanges(blockLength=chrlen/10, R=R, | |
type="bootstrap", withinChrom=TRUE) | |
}) | |
length(pks_prime) | |
length(pks_prime) / (length(pks) * R) | |
print(object.size(pks_prime), units="Mb") | |
if (FALSE) { | |
R <- 5 | |
library(profvis) | |
library(ggplot2) | |
p <- profvis(bootRanges(pks, blockLength=chrlen/10, R=R, | |
type="bootstrap", withinChrom=TRUE)) | |
p | |
} | |
# what is we shuffle peaks, just selecting uniform start positions | |
# (function definition is at top of script) | |
system.time({ | |
pks_unif <- makeUniform(n=length(pks), R=R, chrlen) | |
}) | |
# now we calculate number of overlaps using a join, | |
# followed by grouping and summarization of number of peaks nearby | |
all_pks <- bind_ranges(boot=pks_prime, unif=pks_unif, .id="type") | |
n_tss <- length(tss) | |
# enrichment analysis for both sets | |
dat <- tss %>% | |
join_overlap_inner(all_pks, maxgap=30) %>% | |
group_by(type, iter) %>% | |
summarize(rate=n() / n_tss) %>% | |
as.data.frame() %>% | |
complete(type, iter, fill=list(rate=0)) | |
# compare the histograms of the null distributions | |
library(ggplot2) | |
ggplot(dat, aes(rate, fill=type, col=type)) + | |
geom_histogram(position="identity", alpha=0.3) + | |
xlab("rate of overlaps") + | |
geom_vline(xintercept=obs[1,1], size=1.1, lty=2, alpha=.5) | |
################### | |
## parallel case ## | |
################### | |
library(BiocParallel) | |
ncores <- 4 | |
bp <- MulticoreParam(workers=ncores) | |
seeds <- sample(1e4, ncores) | |
res <- bplapply(seq_along(seeds), function(i) { | |
set.seed(seeds[i]) | |
R <- 50 | |
pks_prime <- pks %>% | |
bootRanges(blockLength=chrlen/10, R=R, | |
type="bootstrap", withinChrom=TRUE) | |
dat <- tss %>% | |
join_overlap_inner(pks_prime, maxgap=30) %>% | |
group_by(iter) %>% # group over bootstrap iterations | |
summarize(rate=n() / n_tss) %>% # mean overlap | |
as.data.frame() %>% # GRanges -> df | |
complete(iter, fill=list(rate=0)) %>% # complete empties | |
mutate(iter=as.numeric(iter) + (i-1)*R) # shift iter | |
dat | |
}) | |
dat <- do.call(rbind, res) | |
ggplot(dat, aes(rate)) + geom_histogram() |
I would recommend either using bind_ranges()
+ group_by()
or IRanges::stack()
Notes from Nov-3-2021 meeting with Stuart:
- Make
iter
a factor Rle within bootRanges to avoid doing it later, shouldn't be as slow as factor (test this) - Parallelization and reproducibility: either store seed in
metadata(x)
[because we can't store it reliably inmcols(x)
, as ranges may be missing in the bootstrap sample, or after the overlap join], or the user can store the seed per chunk
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
I guess the question for Stuart is, how does he recommend I manipulate the GRanges list to then perform grouped-by-iteration overlaps with plyranges.