nvictus/cool.r

## cool.r
# Notes
# -----
# * Cooler's stored bin IDs are 0-based. However, for consistency with R, this API should take
#   1-based indexing as input for table row and matrix range queries.
# * See Ilya's implementation: https://github.com/dozmorovlab/HiCcompare/issues/9
library(hdf5r)
library(dplyr)
library(tibble)
library(purrr)
library(data.table)
library(Matrix)
library(slam)
library(R6)


#' Return the search indexes of a cooler
#'
#' @param clr H5File or H5Group bearing a cooler data collection.
read_indexes <- function(clr) {
  chrom_offset <- clr[['indexes/chrom_offset']][] + 1
  bin1_offset <- clr[['indexes/bin1_offset']][] + 1

  return (list("chrom_offset"=chrom_offset,
               "bin1_offset"=bin1_offset))
}


#' Query a columnar table stored as 1D arrays in an HDF5 group
#'
#' @param clr H5File or H5Group
#' @param path character. Path to the group containing the columns.
#' @param columns list. List of columns to read from. Default is all columns.
#' @param span pair. Range of rows of the table to read. Default is all rows.
read_table <- function(clr, path, columns=NULL, span=NULL) {

  grp <- clr[[path]]

  if (is.null(columns)) {
    columns <- grp$names
  }

  if (is.null(span)) {
    dset <- grp[[columns[1]]]
    len <- dset$dims[1]
    span <- c(1, len)
  }

  n_cols <- length(columns)
  x <- vector(mode="list")
  for (col in columns) {
    vals <- grp[[col]][span[1]:span[2]]

    if (is.factor_ext(vals)) {
      vals <- as.character(vals)
    }
    x[[col]] <- vals
  }

  return (cbind.data.frame(x))
}


#' Join bin information against a selection of pixels
#'
#' @param pixels data.frame. A table of pixel records containing bin1_id and
#' bin2_id columns.
#' @param bins data.frame. The entire bin table.
annotate <- function(pixels, bins) {
  bins['idx'] <- as.numeric(rownames(bins))
  pixels <- inner_join(bins, pixels, by=c("idx" = "bin2_id"))
  pixels <- pixels[,-which(names(pixels) == "idx")]
  pixels <- inner_join(bins, pixels, by=c("idx" = "bin1_id"), suffix=c("1", "2"))
  pixels <- pixels[,-which(names(pixels) == "idx")]
  return (pixels)
}


.arg.prune.partition <- function(x, step) {
  lo <- x[[1]]
  hi <- x[[length((x))]]
  num <- 2 + (hi - lo) / step
  cuts <- seq(lo, hi, length.out=num)
  return ( unique(findInterval(cuts, x)) )
}


.comes.before <- function(a0, a1, b0, b1, strict=FALSE) {
  if (a0 < b0) {
    if (strict) {
      return (a1 <= b0)
    } else {
      return (a1 <= b1)
    }
  } else {
    return (FALSE)
  }
}


.contains <- function(a0, a1, b0, b1, strict=FALSE) {
  if (a0 > b0 || a1 < b1) {
    return (FALSE)
  }
  if (strict && (a0 == b0 || a1 == b1)) {
    return (FALSE)
  }
  return (a0 <= b0 && a1 >= b1)
}


#' 2D range query processor
#'
#' Designed to read the pixel data in manageable chunks.
#'
#' @section Public fields:
#' * `clr`: H5File or H5Group
#' * `i1`, `i2`: row range (1-based)
#' * `j1`, `j2`: column range (1-based)
#' * `field`: value column to read (default is "count")
#' * `chunksize`: number of pixel records per batch
RangeQuery <- R6Class("RangeQuery",
  public = list(
    clr=NULL,
    i1=NULL,
    i2=NULL,
    j1=NULL,
    j2=NULL,
    field=NULL,
    chunksize=NULL,
    bin1_offsets=NULL,
    loc_pruned_offsets=NULL,
    n_chunks=NULL,
    initialize = function(clr, i1, i2, j1, j2, field="count", chunksize=10000000) {
      self$clr <- clr
      self$i1 <- i1
      self$i2 <- i2
      self$j1 <- j1
      self$j2 <- j2
      self$field <- field
      self$chunksize <- chunksize
      indexes <- read_indexes(clr)
      self$bin1_offsets <- indexes[["bin1_offset"]][self$i1:self$i2]

      # subsample the offsets to produce batches of pixel records roughly
      # `chunksize` in length corresponding to full matrix rows
      self$loc_pruned_offsets <- .arg.prune.partition(self$bin1_offsets, chunksize)
      self$n_chunks <- length(self$loc_pruned_offsets) - 1
    },
    read = function() {
      out <- list()
      for (i in seq_len(self$n_chunks)) {
        out[[i]] <- self$read_chunk(i)
      }
      return (rbindlist(out))
    },
    read_chunk = function(chunk_id, mirror=FALSE) {
      i1 <- self$i1
      i2 <- self$i2
      j1 <- self$j1
      j2 <- self$j2

      # extract data for a chunk of matrix rows
      oi <- self$loc_pruned_offsets[chunk_id]
      of <- self$loc_pruned_offsets[chunk_id + 1]
      p1 <- self$bin1_offsets[oi]
      p2 <- self$bin1_offsets[of] - 1
      bin2_extracted <- clr[['pixels/bin2_id']][p1:p2]
      data_extracted <- clr[['pixels/count']][p1:p2]

      # iterate over matrix rows and filter
      dfs <- list()
      for (i in seq(oi, of-1)) {
        # correct the offsets
        lo <- self$bin1_offsets[i] - p1 + 1
        hi <- self$bin1_offsets[i + 1] - p1

        if ((hi - lo) > 0) {
          # this row
          bin2 <- bin2_extracted[lo:hi] + 1  # change to 1-based
          datx <- data_extracted[lo:hi]

          if (any(is.na(bin2))) {
            print(length(bin2_extracted))
            print(length(data_extracted))

            print(self$bin1_offsets[i] - p1)
            print(self$bin1_offsets[i + 1] - p1)

            print(bin2)
            print(datx)
          }

          # filter for the range of j values we want
          mask <- ((bin2 >= j1) & (bin2 <= j2))
          cols <- bin2[mask]
          data <- datx[mask]
          rows <- rep(i1 + i - 1, length(cols))  # 1-based

          dfs[[i]] <- data.frame(
            bin1_id=rows,
            bin2_id=cols,
            count=data)
        } else {
          dfs[[i]] <- data.frame(row.names=c("bin1_id", "bin2_id", "count"))
        }
      }
      df <- rbindlist(dfs)
      if (mirror) {
        nodiag <- df[["bin1_id"]] != df[["bin2_id"]]
        df <- rbind(df,
                    rename(df[nodiag,],
                           bin1_id=bin2_id, bin2_id=bin1_id))
      }
      return (df)
    }
  )
)

#' A mirroring 2D range query processor for a symmetric matrix stored as upper
#' triangular data.
#'
#' Designed to read the pixel data in manageable chunks and fill in the lower
#' triangular data. Mirroring requires breaking up the query into one or more
#' unmirrored subqueries.
#'
#' @section Public fields:
#' * `clr`: H5File or H5Group
#' * `i1`, `i2`: row range (1-based)
#' * `j1`, `j2`: column range (1-based)
#' * `field`: value column to read (default is "count")
#' * `chunksize`: number of pixel records per batch
SymmTriuRangeQuery <- R6Class("SymmTriuRangeQuery",
  public = list(
    clr=NULL,
    i1=NULL,
    i2=NULL,
    j1=NULL,
    j2=NULL,
    field=NULL,
    chunksize=NULL,
    transpose=NULL,
    mode=NULL,
    queries=NULL,
    initialize = function(clr, i1, i2, j1, j2,
                          field="count", chunksize=10000000) {
      self$clr <- clr
      self$field <- field
      self$chunksize <- chunksize
      self$transpose <- FALSE
      if (i1 == j1 && i2 == j2) {
        self$queries <- list(
          RangeQuery$new(clr, i1, i2, j1, j2, field, chunksize)
        )
        self$mode = 1
      } else {
        if (j1 < i1 || (i1 == j1 && i2 < j2)) {
          t <- i1; i1 <- j1; j1 <- t
          t <- i2; i2 <- j2; j2 <- t
          self$transpose <- TRUE
        }
        if (.comes.before(i1, i2, j1, j2, strict=TRUE)) {
          self$queries <- list(
            RangeQuery$new(clr, i1, i2, j1, j2, field, chunksize)
          )
          self$mode = 2
        } else if (.comes.before(i1, i2, j1, j2)) {
          self$queries <- list(
            RangeQuery$new(clr, i1, j1, j1, i2, field, chunksize),
            RangeQuery$new(clr, j1, i2, j1, i2, field, chunksize),
            RangeQuery$new(clr, i1, i2, i2, j2, field, chunksize)
          )
          self$mode = 3
        } else if (.contains(i1, i2, j1, j2)) {
          self$queries <- list(
            RangeQuery$new(clr, i1, j1, j1, j2, field, chunksize),
            RangeQuery$new(clr, j1, j2, j1, j2, field, chunksize),
            RangeQuery$new(clr, j1, j2, j2, i2, field, chunksize)
          )
          self$mode = 4
        } else {
          stop("this shouldn't happen")
        }
      }
    },
    read_chunk = function(q, i) {
      #' @param q int. Index of range subquery object to use.
      #' @param i int. Index of chunk to extract from subquery.
      #' @return data.frame.
      rq <- self$queries[[q]]
      mirror <- if (self$mode == 1 || q == 2) TRUE else FALSE
      out <- rq$read_chunk(i, mirror=mirror)
      if (self$mode == 4 && q == 3) {
        if (!self$transpose) {
          out <- rename(out, bin1_id="bin2_id", bin2_id="bin1_id")
        }
      } else if (self$transpose) {
        out <- rename(out, bin1_id="bin2_id", bin2_id="bin1_id")
      }
      return (out)
    },
    read = function() {
      #' Process the query
      #'
      #' @return data.frame
      out <- list()
      k <- 1
      n_queries <- length(self$queries)
      for (q in seq_len(n_queries)) {
        n_chunks <- self$queries[[q]]$n_chunks
        for (i in seq_len(n_chunks)) {
          out[[k]] <- self$read_chunk(q, i)
          k <- k + 1
        }
      }
      return (rbindlist(out, use.names=TRUE))
    }
  )
)


# table queries
clr <- H5File$new('~/tmp/coolers/Rao2014-GM12878-MboI-allreps-filtered.10kb.cool', mode='r')
bins <- read_table(clr, 'bins')
sel <- read_table(clr, 'pixels', span=c(1, 10))
sel <- annotate(sel, bins)

# matrix range query
# TODO: here, bin IDs are returned as 1-based, which is inconsistent with the table queries above.
i1 <- 3000
i2 <- 4000
j1 <- 3000
j2 <- 4000
rq <- SymmTriuRangeQuery$new(clr, i1, i2, j1, j2)
df <- rq$read()
m <- sparseMatrix(i=df[['bin1_id']] - i1 + 1,
                  j=df[['bin2_id']] - j1 + 1,
                  x=df[['count']] * (bins[df[['bin1_id']], 'weight']) * (bins[df[['bin2_id']], 'weight']),
                  dims=c(i2-i1+1, j2-j1+1))
M <- as.matrix(m)
image(
    seq(j1, j2),
    seq(i1, i2),
    log10(t(M)),
    useRaster=TRUE,
    asp=1,
    ylim=c(i2, i1),
    col=hcl.colors(256, "YlOrRd", rev = TRUE)
)
	# Notes
	# -----
	# * Cooler's stored bin IDs are 0-based. However, for consistency with R, this API should take
	# 1-based indexing as input for table row and matrix range queries.
	# * See Ilya's implementation: https://github.com/dozmorovlab/HiCcompare/issues/9
	library(hdf5r)
	library(dplyr)
	library(tibble)
	library(purrr)
	library(data.table)
	library(Matrix)
	library(slam)
	library(R6)


	#' Return the search indexes of a cooler
	#'
	#' @param clr H5File or H5Group bearing a cooler data collection.
	read_indexes <- function(clr) {
	chrom_offset <- clr[['indexes/chrom_offset']][] + 1
	bin1_offset <- clr[['indexes/bin1_offset']][] + 1

	return (list("chrom_offset"=chrom_offset,
	"bin1_offset"=bin1_offset))
	}


	#' Query a columnar table stored as 1D arrays in an HDF5 group
	#'
	#' @param clr H5File or H5Group
	#' @param path character. Path to the group containing the columns.
	#' @param columns list. List of columns to read from. Default is all columns.
	#' @param span pair. Range of rows of the table to read. Default is all rows.
	read_table <- function(clr, path, columns=NULL, span=NULL) {

	grp <- clr[[path]]

	if (is.null(columns)) {
	columns <- grp$names
	}

	if (is.null(span)) {
	dset <- grp[[columns[1]]]
	len <- dset$dims[1]
	span <- c(1, len)
	}

	n_cols <- length(columns)
	x <- vector(mode="list")
	for (col in columns) {
	vals <- grp[[col]][span[1]:span[2]]

	if (is.factor_ext(vals)) {
	vals <- as.character(vals)
	}
	x[[col]] <- vals
	}

	return (cbind.data.frame(x))
	}


	#' Join bin information against a selection of pixels
	#'
	#' @param pixels data.frame. A table of pixel records containing bin1_id and
	#' bin2_id columns.
	#' @param bins data.frame. The entire bin table.
	annotate <- function(pixels, bins) {
	bins['idx'] <- as.numeric(rownames(bins))
	pixels <- inner_join(bins, pixels, by=c("idx" = "bin2_id"))
	pixels <- pixels[,-which(names(pixels) == "idx")]
	pixels <- inner_join(bins, pixels, by=c("idx" = "bin1_id"), suffix=c("1", "2"))
	pixels <- pixels[,-which(names(pixels) == "idx")]
	return (pixels)
	}


	.arg.prune.partition <- function(x, step) {
	lo <- x[[1]]
	hi <- x[[length((x))]]
	num <- 2 + (hi - lo) / step
	cuts <- seq(lo, hi, length.out=num)
	return ( unique(findInterval(cuts, x)) )
	}


	.comes.before <- function(a0, a1, b0, b1, strict=FALSE) {
	if (a0 < b0) {
	if (strict) {
	return (a1 <= b0)
	} else {
	return (a1 <= b1)
	}
	} else {
	return (FALSE)
	}
	}


	.contains <- function(a0, a1, b0, b1, strict=FALSE) {
	if (a0 > b0 \|\| a1 < b1) {
	return (FALSE)
	}
	if (strict && (a0 == b0 \|\| a1 == b1)) {
	return (FALSE)
	}
	return (a0 <= b0 && a1 >= b1)
	}


	#' 2D range query processor
	#'
	#' Designed to read the pixel data in manageable chunks.
	#'
	#' @section Public fields:
	#' * `clr`: H5File or H5Group
	#' * `i1`, `i2`: row range (1-based)
	#' * `j1`, `j2`: column range (1-based)
	#' * `field`: value column to read (default is "count")
	#' * `chunksize`: number of pixel records per batch
	RangeQuery <- R6Class("RangeQuery",
	public = list(
	clr=NULL,
	i1=NULL,
	i2=NULL,
	j1=NULL,
	j2=NULL,
	field=NULL,
	chunksize=NULL,
	bin1_offsets=NULL,
	loc_pruned_offsets=NULL,
	n_chunks=NULL,
	initialize = function(clr, i1, i2, j1, j2, field="count", chunksize=10000000) {
	self$clr <- clr
	self$i1 <- i1
	self$i2 <- i2
	self$j1 <- j1
	self$j2 <- j2
	self$field <- field
	self$chunksize <- chunksize
	indexes <- read_indexes(clr)
	self$bin1_offsets <- indexes[["bin1_offset"]][self$i1:self$i2]

	# subsample the offsets to produce batches of pixel records roughly
	# `chunksize` in length corresponding to full matrix rows
	self$loc_pruned_offsets <- .arg.prune.partition(self$bin1_offsets, chunksize)
	self$n_chunks <- length(self$loc_pruned_offsets) - 1
	},
	read = function() {
	out <- list()
	for (i in seq_len(self$n_chunks)) {
	out[[i]] <- self$read_chunk(i)
	}
	return (rbindlist(out))
	},
	read_chunk = function(chunk_id, mirror=FALSE) {
	i1 <- self$i1
	i2 <- self$i2
	j1 <- self$j1
	j2 <- self$j2

	# extract data for a chunk of matrix rows
	oi <- self$loc_pruned_offsets[chunk_id]
	of <- self$loc_pruned_offsets[chunk_id + 1]
	p1 <- self$bin1_offsets[oi]
	p2 <- self$bin1_offsets[of] - 1
	bin2_extracted <- clr[['pixels/bin2_id']][p1:p2]
	data_extracted <- clr[['pixels/count']][p1:p2]

	# iterate over matrix rows and filter
	dfs <- list()
	for (i in seq(oi, of-1)) {
	# correct the offsets
	lo <- self$bin1_offsets[i] - p1 + 1
	hi <- self$bin1_offsets[i + 1] - p1

	if ((hi - lo) > 0) {
	# this row
	bin2 <- bin2_extracted[lo:hi] + 1 # change to 1-based
	datx <- data_extracted[lo:hi]

	if (any(is.na(bin2))) {
	print(length(bin2_extracted))
	print(length(data_extracted))

	print(self$bin1_offsets[i] - p1)
	print(self$bin1_offsets[i + 1] - p1)

	print(bin2)
	print(datx)
	}

	# filter for the range of j values we want
	mask <- ((bin2 >= j1) & (bin2 <= j2))
	cols <- bin2[mask]
	data <- datx[mask]
	rows <- rep(i1 + i - 1, length(cols)) # 1-based

	dfs[[i]] <- data.frame(
	bin1_id=rows,
	bin2_id=cols,
	count=data)
	} else {
	dfs[[i]] <- data.frame(row.names=c("bin1_id", "bin2_id", "count"))
	}
	}
	df <- rbindlist(dfs)
	if (mirror) {
	nodiag <- df[["bin1_id"]] != df[["bin2_id"]]
	df <- rbind(df,
	rename(df[nodiag,],
	bin1_id=bin2_id, bin2_id=bin1_id))
	}
	return (df)
	}
	)
	)

	#' A mirroring 2D range query processor for a symmetric matrix stored as upper
	#' triangular data.
	#'
	#' Designed to read the pixel data in manageable chunks and fill in the lower
	#' triangular data. Mirroring requires breaking up the query into one or more
	#' unmirrored subqueries.
	#'
	#' @section Public fields:
	#' * `clr`: H5File or H5Group
	#' * `i1`, `i2`: row range (1-based)
	#' * `j1`, `j2`: column range (1-based)
	#' * `field`: value column to read (default is "count")
	#' * `chunksize`: number of pixel records per batch
	SymmTriuRangeQuery <- R6Class("SymmTriuRangeQuery",
	public = list(
	clr=NULL,
	i1=NULL,
	i2=NULL,
	j1=NULL,
	j2=NULL,
	field=NULL,
	chunksize=NULL,
	transpose=NULL,
	mode=NULL,
	queries=NULL,
	initialize = function(clr, i1, i2, j1, j2,
	field="count", chunksize=10000000) {
	self$clr <- clr
	self$field <- field
	self$chunksize <- chunksize
	self$transpose <- FALSE
	if (i1 == j1 && i2 == j2) {
	self$queries <- list(
	RangeQuery$new(clr, i1, i2, j1, j2, field, chunksize)
	)
	self$mode = 1
	} else {
	if (j1 < i1 \|\| (i1 == j1 && i2 < j2)) {
	t <- i1; i1 <- j1; j1 <- t
	t <- i2; i2 <- j2; j2 <- t
	self$transpose <- TRUE
	}
	if (.comes.before(i1, i2, j1, j2, strict=TRUE)) {
	self$queries <- list(
	RangeQuery$new(clr, i1, i2, j1, j2, field, chunksize)
	)
	self$mode = 2
	} else if (.comes.before(i1, i2, j1, j2)) {
	self$queries <- list(
	RangeQuery$new(clr, i1, j1, j1, i2, field, chunksize),
	RangeQuery$new(clr, j1, i2, j1, i2, field, chunksize),
	RangeQuery$new(clr, i1, i2, i2, j2, field, chunksize)
	)
	self$mode = 3
	} else if (.contains(i1, i2, j1, j2)) {
	self$queries <- list(
	RangeQuery$new(clr, i1, j1, j1, j2, field, chunksize),
	RangeQuery$new(clr, j1, j2, j1, j2, field, chunksize),
	RangeQuery$new(clr, j1, j2, j2, i2, field, chunksize)
	)
	self$mode = 4
	} else {
	stop("this shouldn't happen")
	}
	}
	},
	read_chunk = function(q, i) {
	#' @param q int. Index of range subquery object to use.
	#' @param i int. Index of chunk to extract from subquery.
	#' @return data.frame.
	rq <- self$queries[[q]]
	mirror <- if (self$mode == 1 \|\| q == 2) TRUE else FALSE
	out <- rq$read_chunk(i, mirror=mirror)
	if (self$mode == 4 && q == 3) {
	if (!self$transpose) {
	out <- rename(out, bin1_id="bin2_id", bin2_id="bin1_id")
	}
	} else if (self$transpose) {
	out <- rename(out, bin1_id="bin2_id", bin2_id="bin1_id")
	}
	return (out)
	},
	read = function() {
	#' Process the query
	#'
	#' @return data.frame
	out <- list()
	k <- 1
	n_queries <- length(self$queries)
	for (q in seq_len(n_queries)) {
	n_chunks <- self$queries[[q]]$n_chunks
	for (i in seq_len(n_chunks)) {
	out[[k]] <- self$read_chunk(q, i)
	k <- k + 1
	}
	}
	return (rbindlist(out, use.names=TRUE))
	}
	)
	)


	# table queries
	clr <- H5File$new('~/tmp/coolers/Rao2014-GM12878-MboI-allreps-filtered.10kb.cool', mode='r')
	bins <- read_table(clr, 'bins')
	sel <- read_table(clr, 'pixels', span=c(1, 10))
	sel <- annotate(sel, bins)

	# matrix range query
	# TODO: here, bin IDs are returned as 1-based, which is inconsistent with the table queries above.
	i1 <- 3000
	i2 <- 4000
	j1 <- 3000
	j2 <- 4000
	rq <- SymmTriuRangeQuery$new(clr, i1, i2, j1, j2)
	df <- rq$read()
	m <- sparseMatrix(i=df[['bin1_id']] - i1 + 1,
	j=df[['bin2_id']] - j1 + 1,
	x=df[['count']] * (bins[df[['bin1_id']], 'weight']) * (bins[df[['bin2_id']], 'weight']),
	dims=c(i2-i1+1, j2-j1+1))
	M <- as.matrix(m)
	image(
	seq(j1, j2),
	seq(i1, i2),
	log10(t(M)),
	useRaster=TRUE,
	asp=1,
	ylim=c(i2, i1),
	col=hcl.colors(256, "YlOrRd", rev = TRUE)
	)