Skip to content

Instantly share code, notes, and snippets.

@mtmorgan
Last active December 4, 2017 11:51
Show Gist options
  • Save mtmorgan/eaf456ad5b45c431c494 to your computer and use it in GitHub Desktop.
Save mtmorgan/eaf456ad5b45c431c494 to your computer and use it in GitHub Desktop.
read kallisto RNA-seq quantification into R / Bioconductor data structures
.require <-
function(pkg)
{
withCallingHandlers({
suppressPackageStartupMessages({
require(pkg, character.only=TRUE, quietly=TRUE)
})
}, warning=function(w) {
invokeRestart("muffleWarning")
}) || {
msg <- sprintf('install %s with
source("http://bioconductor.org/biocLite.R"); biocLite("%s")',
pkg, pkg)
stop(paste(strwrap(msg, exdent=2), collapse="\n"))
}
}
.open <- function(files, h5) {
lapply(files, function(file) {
if (h5 && H5Fis_hdf5(file))
H5Fopen(file)
else file(file, open="rt")
})
}
.close <- function(cons)
UseMethod(".close")
.close.list <- function(cons)
for (con in cons)
.close(con)
.close.connection <- function(cons)
close(cons)
.close.H5IdComponent <- function(cons)
H5Fclose(cons)
.colData <- function(jsonfile) {
json <- fromJSON(jsonfile)
do.call("data.frame", c(json, stringsAsFactors=FALSE))
}
.KALLISTO_COLCLASSES <-
c("character", "integer" , "numeric", "numeric", "numeric")
.KALLISTO_ROWDATA <- "length"
KALLISTO_ASSAYS <- c("est_counts", "tpm", "eff_length")
.read <- function(con)
UseMethod(".read")
.read.connection <- function(con)
read.delim(con, header=TRUE, colClasses=.KALLISTO_COLCLASSES, row.names=1)
.read.H5IdComponent <- function(con) {
eff_length <- h5read(con, "/aux/eff_lengths")
est_counts <- h5read(con, "/est_counts")
tpm0 <- est_counts / eff_length
data.frame(row.names=h5read(con, "/aux/ids"),
length=h5read(con, "/aux/lengths"),
eff_length=eff_length,
est_counts=est_counts,
tpm=tpm0 / (sum(tpm0) / 1e6))
}
readKallisto <-
function(files,
json=file.path(dirname(files), "run_info.json"),
h5=any(grepl("\\.h5$", files)),
what=KALLISTO_ASSAYS,
as=c("matrix", "list", "SummarizedExperiment"))
{
as <- match.arg(as)
if (missing(what))
what <- what[1]
else {
whats <- eval(formals()[["what"]])
if (!all(what %in% KALLISTO_ASSAYS))
stop("'what' must be in ",
paste(sQuote(KALLISTO_ASSAYS), collapse=", "),
call.=FALSE)
}
stopifnot(is.character(files))
test <- file.exists(files)
if (!all(test))
stop("file(s) do not exist:\n ",
paste(files[!test], collapse="\n "))
if (is.null(names(files)))
names(files) <- basename(dirname(files))
if (as != "matrix") {
.require("jsonlite")
stopifnot(length(files) == length(json))
if (!is.null(names(json)))
stopifnot(identical(names(json), names(files)))
else
names(json) <- names(files)
test <- file.exists(json)
if (!all(test))
stop("json file(s) do not exist:\n ",
paste(json[!test], collapse="\n "))
}
if (h5)
.require("rhdf5")
if (as == "SummarizedExperiment") {
if (BiocInstaller::biocVersion() >= '3.2')
.require("SummarizedExperiment")
else .require("GenomicRanges")
}
cons <- .open(files, h5)
value <- .read(cons[[1]])
rowData <- value[, .KALLISTO_ROWDATA, drop=FALSE]
assay <- matrix(0, nrow(rowData), length(cons),
dimnames=list(rownames(rowData), names(cons)))
assays <- setNames(replicate(length(what), assay, FALSE), what)
for (w in what)
assays[[w]][,1] <- value[[w]]
for (i in seq_along(cons)[-1]) {
value <- .read(cons[[i]])
if (!identical(rowData, value[, .KALLISTO_ROWDATA, drop=FALSE]))
stop("rowData differs between files:\n ",
paste(files[c(1, i)], collapse="\n "))
for (w in what)
assays[[w]][,i] <- value[[w]]
}
.close(cons)
if (as != "matrix")
colData <- do.call("rbind", lapply(json, .colData))
switch(as, matrix={
if (length(assays) == 1L)
assays[[1]]
else assays
}, list={
c(setNames(list(colData, rowData), c("colData", "rowData")), assays)
}, SummarizedExperiment={
partition <-
PartitioningByEnd(integer(nrow(rowData)), names=rownames(rowData))
rowRanges <- relist(GRanges(), partition)
mcols(rowRanges) <- as(rowData, "DataFrame")
SummarizedExperiment(assays=assays, rowRanges=rowRanges,
colData=as(colData, "DataFrame"))
})
}
.readIds <- function(con, i) {
if (!missing(i))
h5read(con, "/aux/ids", list(i))
else h5read(con, "/aux/ids")
}
readBootstrap <-
function(file, i, j)
{
.require("rhdf5")
stopifnot(length(file) == 1L, is.character(file))
stopifnot(file.exists(file))
stopifnot(H5Fis_hdf5(file))
con <- H5Fopen(file)
on.exit(H5Fclose(con))
nboot <- as.integer(h5read(con, "/aux/num_bootstrap"))
if (nboot == 0L)
stop("file contains no bootstraps:\n ", file)
if (!missing(i) && is.character(i)) {
idx <- match(i, .readIds(con))
if (anyNA(i))
stop("unknown target id(s)", i[is.na(idx)])
i <- idx
}
if (!missing(j) && is.numeric(j)) {
if (any((j < 1L) || any(j > nboot)))
stop("'j' must be >0 and <=", nboot)
j <- paste0("bs", as.integer(j) - 1L)
}
m <- if (missing(i) && missing(j)) {
simplify2array(h5read(con, "/bootstrap"))
} else if (missing(i)) {
query <- setNames(sprintf("/bootstrap/%s", j), j)
simplify2array(lapply(query, h5read, file=con))
} else if (missing(j)) {
group <- H5Gopen(con, "/bootstrap")
name <- h5ls(group)$name
H5Gclose(group)
query <- setNames(sprintf("/bootstrap/%s", name), name)
simplify2array(lapply(query, h5read, file=con, index=list(i)))
} else {
query <- setNames(sprintf("/bootstrap/%s", j), j)
simplify2array(lapply(query, h5read, file=con, index=list(i)))
}
rownames(m) <- .readIds(con, i)
if (missing(j)) {
o <- order(as.integer(sub("bs", "", colnames(m), fixed=TRUE)))
m[,o]
} else m
}
source("readKallisto.R")
files <- dir("~/a/kallisto/example", "abundance.tsv", full=TRUE,
recursive=TRUE)
stopifnot(all(file.exists(files)))
str(readKallisto(files, as="matrix"))
str(readKallisto(files, as="list"))
(se <- readKallisto(files, as="SummarizedExperiment"))
str(readKallisto(files, what="eff_length"))
readKallisto(files, what="eff_length", as="SummarizedExperiment")
files <- sub(".tsv", ".h5", files, fixed=TRUE)
str(readKallisto(files))
str(readKallisto(files, what="tpm"))
readKallisto(files, what="tpm", as="SummarizedExperiment")
readKallisto(files, what=c("tpm", "eff_length"), as="SummarizedExperiment")
xx <- readBootstrap(files[1])
ridx <- head(which(rowSums(xx) != 0), 3)
cidx <- c(1:5, 96:100)
readBootstrap(files[1])[ridx, cidx]
readBootstrap(files[1], i=c(ridx, rev(ridx)), j=cidx)
@vjcitn
Copy link

vjcitn commented Dec 16, 2015

i see now that sleuth handles parsing -> shiny. there may not be much scope for packaging the parsing code ... but still, the role of bioc object discipline warrants exploration.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment