Skip to content

Instantly share code, notes, and snippets.

@nvictus
Last active August 27, 2021 15:59
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save nvictus/460a6014a075156713fea5e09bdf3498 to your computer and use it in GitHub Desktop.
Save nvictus/460a6014a075156713fea5e09bdf3498 to your computer and use it in GitHub Desktop.
coolR: a cooler reader for R
# Notes
# -----
# * Cooler's stored bin IDs are 0-based. However, for consistency with R, this API should take
# 1-based indexing as input for table row and matrix range queries.
# * See Ilya's implementation: https://github.com/dozmorovlab/HiCcompare/issues/9
library(hdf5r)
library(dplyr)
library(tibble)
library(purrr)
library(data.table)
library(Matrix)
library(slam)
library(R6)
#' Return the search indexes of a cooler
#'
#' @param clr H5File or H5Group bearing a cooler data collection.
read_indexes <- function(clr) {
chrom_offset <- clr[['indexes/chrom_offset']][] + 1
bin1_offset <- clr[['indexes/bin1_offset']][] + 1
return (list("chrom_offset"=chrom_offset,
"bin1_offset"=bin1_offset))
}
#' Query a columnar table stored as 1D arrays in an HDF5 group
#'
#' @param clr H5File or H5Group
#' @param path character. Path to the group containing the columns.
#' @param columns list. List of columns to read from. Default is all columns.
#' @param span pair. Range of rows of the table to read. Default is all rows.
read_table <- function(clr, path, columns=NULL, span=NULL) {
grp <- clr[[path]]
if (is.null(columns)) {
columns <- grp$names
}
if (is.null(span)) {
dset <- grp[[columns[1]]]
len <- dset$dims[1]
span <- c(1, len)
}
n_cols <- length(columns)
x <- vector(mode="list")
for (col in columns) {
vals <- grp[[col]][span[1]:span[2]]
if (is.factor_ext(vals)) {
vals <- as.character(vals)
}
x[[col]] <- vals
}
return (cbind.data.frame(x))
}
#' Join bin information against a selection of pixels
#'
#' @param pixels data.frame. A table of pixel records containing bin1_id and
#' bin2_id columns.
#' @param bins data.frame. The entire bin table.
annotate <- function(pixels, bins) {
bins['idx'] <- as.numeric(rownames(bins))
pixels <- inner_join(bins, pixels, by=c("idx" = "bin2_id"))
pixels <- pixels[,-which(names(pixels) == "idx")]
pixels <- inner_join(bins, pixels, by=c("idx" = "bin1_id"), suffix=c("1", "2"))
pixels <- pixels[,-which(names(pixels) == "idx")]
return (pixels)
}
.arg.prune.partition <- function(x, step) {
lo <- x[[1]]
hi <- x[[length((x))]]
num <- 2 + (hi - lo) / step
cuts <- seq(lo, hi, length.out=num)
return ( unique(findInterval(cuts, x)) )
}
.comes.before <- function(a0, a1, b0, b1, strict=FALSE) {
if (a0 < b0) {
if (strict) {
return (a1 <= b0)
} else {
return (a1 <= b1)
}
} else {
return (FALSE)
}
}
.contains <- function(a0, a1, b0, b1, strict=FALSE) {
if (a0 > b0 || a1 < b1) {
return (FALSE)
}
if (strict && (a0 == b0 || a1 == b1)) {
return (FALSE)
}
return (a0 <= b0 && a1 >= b1)
}
#' 2D range query processor
#'
#' Designed to read the pixel data in manageable chunks.
#'
#' @section Public fields:
#' * `clr`: H5File or H5Group
#' * `i1`, `i2`: row range (1-based)
#' * `j1`, `j2`: column range (1-based)
#' * `field`: value column to read (default is "count")
#' * `chunksize`: number of pixel records per batch
RangeQuery <- R6Class("RangeQuery",
public = list(
clr=NULL,
i1=NULL,
i2=NULL,
j1=NULL,
j2=NULL,
field=NULL,
chunksize=NULL,
bin1_offsets=NULL,
loc_pruned_offsets=NULL,
n_chunks=NULL,
initialize = function(clr, i1, i2, j1, j2, field="count", chunksize=10000000) {
self$clr <- clr
self$i1 <- i1
self$i2 <- i2
self$j1 <- j1
self$j2 <- j2
self$field <- field
self$chunksize <- chunksize
indexes <- read_indexes(clr)
self$bin1_offsets <- indexes[["bin1_offset"]][self$i1:self$i2]
# subsample the offsets to produce batches of pixel records roughly
# `chunksize` in length corresponding to full matrix rows
self$loc_pruned_offsets <- .arg.prune.partition(self$bin1_offsets, chunksize)
self$n_chunks <- length(self$loc_pruned_offsets) - 1
},
read = function() {
out <- list()
for (i in seq_len(self$n_chunks)) {
out[[i]] <- self$read_chunk(i)
}
return (rbindlist(out))
},
read_chunk = function(chunk_id, mirror=FALSE) {
i1 <- self$i1
i2 <- self$i2
j1 <- self$j1
j2 <- self$j2
# extract data for a chunk of matrix rows
oi <- self$loc_pruned_offsets[chunk_id]
of <- self$loc_pruned_offsets[chunk_id + 1]
p1 <- self$bin1_offsets[oi]
p2 <- self$bin1_offsets[of] - 1
bin2_extracted <- clr[['pixels/bin2_id']][p1:p2]
data_extracted <- clr[['pixels/count']][p1:p2]
# iterate over matrix rows and filter
dfs <- list()
for (i in seq(oi, of-1)) {
# correct the offsets
lo <- self$bin1_offsets[i] - p1 + 1
hi <- self$bin1_offsets[i + 1] - p1
if ((hi - lo) > 0) {
# this row
bin2 <- bin2_extracted[lo:hi] + 1 # change to 1-based
datx <- data_extracted[lo:hi]
if (any(is.na(bin2))) {
print(length(bin2_extracted))
print(length(data_extracted))
print(self$bin1_offsets[i] - p1)
print(self$bin1_offsets[i + 1] - p1)
print(bin2)
print(datx)
}
# filter for the range of j values we want
mask <- ((bin2 >= j1) & (bin2 <= j2))
cols <- bin2[mask]
data <- datx[mask]
rows <- rep(i1 + i - 1, length(cols)) # 1-based
dfs[[i]] <- data.frame(
bin1_id=rows,
bin2_id=cols,
count=data)
} else {
dfs[[i]] <- data.frame(row.names=c("bin1_id", "bin2_id", "count"))
}
}
df <- rbindlist(dfs)
if (mirror) {
nodiag <- df[["bin1_id"]] != df[["bin2_id"]]
df <- rbind(df,
rename(df[nodiag,],
bin1_id=bin2_id, bin2_id=bin1_id))
}
return (df)
}
)
)
#' A mirroring 2D range query processor for a symmetric matrix stored as upper
#' triangular data.
#'
#' Designed to read the pixel data in manageable chunks and fill in the lower
#' triangular data. Mirroring requires breaking up the query into one or more
#' unmirrored subqueries.
#'
#' @section Public fields:
#' * `clr`: H5File or H5Group
#' * `i1`, `i2`: row range (1-based)
#' * `j1`, `j2`: column range (1-based)
#' * `field`: value column to read (default is "count")
#' * `chunksize`: number of pixel records per batch
SymmTriuRangeQuery <- R6Class("SymmTriuRangeQuery",
public = list(
clr=NULL,
i1=NULL,
i2=NULL,
j1=NULL,
j2=NULL,
field=NULL,
chunksize=NULL,
transpose=NULL,
mode=NULL,
queries=NULL,
initialize = function(clr, i1, i2, j1, j2,
field="count", chunksize=10000000) {
self$clr <- clr
self$field <- field
self$chunksize <- chunksize
self$transpose <- FALSE
if (i1 == j1 && i2 == j2) {
self$queries <- list(
RangeQuery$new(clr, i1, i2, j1, j2, field, chunksize)
)
self$mode = 1
} else {
if (j1 < i1 || (i1 == j1 && i2 < j2)) {
t <- i1; i1 <- j1; j1 <- t
t <- i2; i2 <- j2; j2 <- t
self$transpose <- TRUE
}
if (.comes.before(i1, i2, j1, j2, strict=TRUE)) {
self$queries <- list(
RangeQuery$new(clr, i1, i2, j1, j2, field, chunksize)
)
self$mode = 2
} else if (.comes.before(i1, i2, j1, j2)) {
self$queries <- list(
RangeQuery$new(clr, i1, j1, j1, i2, field, chunksize),
RangeQuery$new(clr, j1, i2, j1, i2, field, chunksize),
RangeQuery$new(clr, i1, i2, i2, j2, field, chunksize)
)
self$mode = 3
} else if (.contains(i1, i2, j1, j2)) {
self$queries <- list(
RangeQuery$new(clr, i1, j1, j1, j2, field, chunksize),
RangeQuery$new(clr, j1, j2, j1, j2, field, chunksize),
RangeQuery$new(clr, j1, j2, j2, i2, field, chunksize)
)
self$mode = 4
} else {
stop("this shouldn't happen")
}
}
},
read_chunk = function(q, i) {
#' @param q int. Index of range subquery object to use.
#' @param i int. Index of chunk to extract from subquery.
#' @return data.frame.
rq <- self$queries[[q]]
mirror <- if (self$mode == 1 || q == 2) TRUE else FALSE
out <- rq$read_chunk(i, mirror=mirror)
if (self$mode == 4 && q == 3) {
if (!self$transpose) {
out <- rename(out, bin1_id="bin2_id", bin2_id="bin1_id")
}
} else if (self$transpose) {
out <- rename(out, bin1_id="bin2_id", bin2_id="bin1_id")
}
return (out)
},
read = function() {
#' Process the query
#'
#' @return data.frame
out <- list()
k <- 1
n_queries <- length(self$queries)
for (q in seq_len(n_queries)) {
n_chunks <- self$queries[[q]]$n_chunks
for (i in seq_len(n_chunks)) {
out[[k]] <- self$read_chunk(q, i)
k <- k + 1
}
}
return (rbindlist(out, use.names=TRUE))
}
)
)
# table queries
clr <- H5File$new('~/tmp/coolers/Rao2014-GM12878-MboI-allreps-filtered.10kb.cool', mode='r')
bins <- read_table(clr, 'bins')
sel <- read_table(clr, 'pixels', span=c(1, 10))
sel <- annotate(sel, bins)
# matrix range query
# TODO: here, bin IDs are returned as 1-based, which is inconsistent with the table queries above.
i1 <- 3000
i2 <- 4000
j1 <- 3000
j2 <- 4000
rq <- SymmTriuRangeQuery$new(clr, i1, i2, j1, j2)
df <- rq$read()
m <- sparseMatrix(i=df[['bin1_id']] - i1 + 1,
j=df[['bin2_id']] - j1 + 1,
x=df[['count']] * (bins[df[['bin1_id']], 'weight']) * (bins[df[['bin2_id']], 'weight']),
dims=c(i2-i1+1, j2-j1+1))
M <- as.matrix(m)
image(
seq(j1, j2),
seq(i1, i2),
log10(t(M)),
useRaster=TRUE,
asp=1,
ylim=c(i2, i1),
col=hcl.colors(256, "YlOrRd", rev = TRUE)
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment