mikelove/boot_workflow.R

## boot_workflow.R
# `nullranges` block bootstrapping demo
# Michael Love
# Dec 13 2021

library(dplyr)
library(tidyr)
library(plyranges)
library(nullranges)

# script needed below to make uniformly shuffled features
makeUniform <- function(n, R, chrlen) {
  l <- replicate(R, {
    p <- sort(GRanges("chr1", IRanges(sample(chrlen-10, n), width=10), strand="+"))
    p$id <- paste0("pk", 1:n)
    p
  }, simplify=FALSE)
  lens <- lengths(l)
  gr <- do.call(c, l)
  mcols(gr)$iter <- Rle(seq_len(R), lens)
  gr
}

# make a set of example genes and peaks, the peaks happen to clump,
# i.e. there is a region of peaks and a region of no peaks
# this is a fairly simplistic case of clumping, whereas
# genomes tend to have multi-scale structure
chrlen <- 1e5
set.seed(1)
ngenes <- 300
genes <- sort(GRanges("chr1",
                      IRanges(sample(chrlen-10,ngenes,TRUE), width=10),
                      strand="+"))
npeaks <- 1000
pks <- sort(GRanges("chr1",
                    IRanges(sample(chrlen-10,npeaks,TRUE), width=10),
                    strand="+"))
exclude <- GRanges("chr1",
                   IRanges(chrlen*c(.3,.5,.7),
                           width=chrlen/10))
pks <- pks[ !pks %over% exclude ]
length(pks)
pks$id <- paste0("pk", seq_along(pks))
seqlengths(pks) <- seqlengths(genes) <- c("chr1"=chrlen)

# transcription start site for overlaps
tss <- genes %>%
  anchor_5p %>%
  mutate(width=1)

# how many peaks within 30bp of a TSS?
obs <- tss %>%
  mutate(n_overlaps = count_overlaps(., pks, maxgap=30)) %>%
  summarize(total = mean(n_overlaps))
obs

# what if we shuffle peaks, using a block bootstrap
# block lengths here are 1/10 of chromosome (not recommended for real data)
# this returns a GRanges, with length ~ R * length(pks)
# (plus or minus due to random block sampling)
R <- 200
length(pks) * R # expect length of bootstraps concatenated
system.time({
  pks_prime <- pks %>%
    bootRanges(blockLength=chrlen/10, R=R,
               type="bootstrap", withinChrom=TRUE)
})
length(pks_prime)
length(pks_prime) / (length(pks) * R)
print(object.size(pks_prime), units="Mb")

if (FALSE) {
  R <- 5
  library(profvis)
  library(ggplot2)
  p <- profvis(bootRanges(pks, blockLength=chrlen/10, R=R,
                          type="bootstrap", withinChrom=TRUE))
  p
}

# what is we shuffle peaks, just selecting uniform start positions
# (function definition is at top of script)
system.time({
  pks_unif <- makeUniform(n=length(pks), R=R, chrlen)
})

# now we calculate number of overlaps using a join,
# followed by grouping and summarization of number of peaks nearby

all_pks <- bind_ranges(boot=pks_prime, unif=pks_unif, .id="type")

n_tss <- length(tss)

# enrichment analysis for both sets
dat <- tss %>%
  join_overlap_inner(all_pks, maxgap=30) %>%
  group_by(type, iter) %>%
  summarize(rate=n() / n_tss) %>%
  as.data.frame() %>%
  complete(type, iter, fill=list(rate=0))

# compare the histograms of the null distributions
library(ggplot2)
ggplot(dat, aes(rate, fill=type, col=type)) +
  geom_histogram(position="identity", alpha=0.3) +
  xlab("rate of overlaps") +
  geom_vline(xintercept=obs[1,1], size=1.1, lty=2, alpha=.5)

###################
## parallel case ##
###################

library(BiocParallel)
ncores <- 4
bp <- MulticoreParam(workers=ncores)
seeds <- sample(1e4, ncores)
res <- bplapply(seq_along(seeds), function(i) {
  set.seed(seeds[i])
  R <- 50
  pks_prime <- pks %>%
    bootRanges(blockLength=chrlen/10, R=R,
               type="bootstrap", withinChrom=TRUE)
  dat <- tss %>%
    join_overlap_inner(pks_prime, maxgap=30) %>%
    group_by(iter) %>% # group over bootstrap iterations
    summarize(rate=n() / n_tss) %>% # mean overlap
    as.data.frame() %>% # GRanges -> df
    complete(iter, fill=list(rate=0)) %>% # complete empties
    mutate(iter=as.numeric(iter) + (i-1)*R) # shift iter
  dat
})
dat <- do.call(rbind, res)

ggplot(dat, aes(rate)) + geom_histogram()
	# `nullranges` block bootstrapping demo
	# Michael Love
	# Dec 13 2021

	library(dplyr)
	library(tidyr)
	library(plyranges)
	library(nullranges)

	# script needed below to make uniformly shuffled features
	makeUniform <- function(n, R, chrlen) {
	l <- replicate(R, {
	p <- sort(GRanges("chr1", IRanges(sample(chrlen-10, n), width=10), strand="+"))
	p$id <- paste0("pk", 1:n)
	p
	}, simplify=FALSE)
	lens <- lengths(l)
	gr <- do.call(c, l)
	mcols(gr)$iter <- Rle(seq_len(R), lens)
	gr
	}

	# make a set of example genes and peaks, the peaks happen to clump,
	# i.e. there is a region of peaks and a region of no peaks
	# this is a fairly simplistic case of clumping, whereas
	# genomes tend to have multi-scale structure
	chrlen <- 1e5
	set.seed(1)
	ngenes <- 300
	genes <- sort(GRanges("chr1",
	IRanges(sample(chrlen-10,ngenes,TRUE), width=10),
	strand="+"))
	npeaks <- 1000
	pks <- sort(GRanges("chr1",
	IRanges(sample(chrlen-10,npeaks,TRUE), width=10),
	strand="+"))
	exclude <- GRanges("chr1",
	IRanges(chrlen*c(.3,.5,.7),
	width=chrlen/10))
	pks <- pks[ !pks %over% exclude ]
	length(pks)
	pks$id <- paste0("pk", seq_along(pks))
	seqlengths(pks) <- seqlengths(genes) <- c("chr1"=chrlen)

	# transcription start site for overlaps
	tss <- genes %>%
	anchor_5p %>%
	mutate(width=1)

	# how many peaks within 30bp of a TSS?
	obs <- tss %>%
	mutate(n_overlaps = count_overlaps(., pks, maxgap=30)) %>%
	summarize(total = mean(n_overlaps))
	obs

	# what if we shuffle peaks, using a block bootstrap
	# block lengths here are 1/10 of chromosome (not recommended for real data)
	# this returns a GRanges, with length ~ R * length(pks)
	# (plus or minus due to random block sampling)
	R <- 200
	length(pks) * R # expect length of bootstraps concatenated
	system.time({
	pks_prime <- pks %>%
	bootRanges(blockLength=chrlen/10, R=R,
	type="bootstrap", withinChrom=TRUE)
	})
	length(pks_prime)
	length(pks_prime) / (length(pks) * R)
	print(object.size(pks_prime), units="Mb")

	if (FALSE) {
	R <- 5
	library(profvis)
	library(ggplot2)
	p <- profvis(bootRanges(pks, blockLength=chrlen/10, R=R,
	type="bootstrap", withinChrom=TRUE))
	p
	}

	# what is we shuffle peaks, just selecting uniform start positions
	# (function definition is at top of script)
	system.time({
	pks_unif <- makeUniform(n=length(pks), R=R, chrlen)
	})

	# now we calculate number of overlaps using a join,
	# followed by grouping and summarization of number of peaks nearby

	all_pks <- bind_ranges(boot=pks_prime, unif=pks_unif, .id="type")

	n_tss <- length(tss)

	# enrichment analysis for both sets
	dat <- tss %>%
	join_overlap_inner(all_pks, maxgap=30) %>%
	group_by(type, iter) %>%
	summarize(rate=n() / n_tss) %>%
	as.data.frame() %>%
	complete(type, iter, fill=list(rate=0))

	# compare the histograms of the null distributions
	library(ggplot2)
	ggplot(dat, aes(rate, fill=type, col=type)) +
	geom_histogram(position="identity", alpha=0.3) +
	xlab("rate of overlaps") +
	geom_vline(xintercept=obs[1,1], size=1.1, lty=2, alpha=.5)

	###################
	## parallel case ##
	###################

	library(BiocParallel)
	ncores <- 4
	bp <- MulticoreParam(workers=ncores)
	seeds <- sample(1e4, ncores)
	res <- bplapply(seq_along(seeds), function(i) {
	set.seed(seeds[i])
	R <- 50
	pks_prime <- pks %>%
	bootRanges(blockLength=chrlen/10, R=R,
	type="bootstrap", withinChrom=TRUE)
	dat <- tss %>%
	join_overlap_inner(pks_prime, maxgap=30) %>%
	group_by(iter) %>% # group over bootstrap iterations
	summarize(rate=n() / n_tss) %>% # mean overlap
	as.data.frame() %>% # GRanges -> df
	complete(iter, fill=list(rate=0)) %>% # complete empties
	mutate(iter=as.numeric(iter) + (i-1)*R) # shift iter
	dat
	})
	dat <- do.call(rbind, res)

	ggplot(dat, aes(rate)) + geom_histogram()