Skip to content

Instantly share code, notes, and snippets.

What would you like to do?
read kallisto RNA-seq quantification into R / Bioconductor data structures
.require <-
require(pkg, character.only=TRUE, quietly=TRUE)
}, warning=function(w) {
}) || {
msg <- sprintf('install %s with
source(""); biocLite("%s")',
pkg, pkg)
stop(paste(strwrap(msg, exdent=2), collapse="\n"))
.open <- function(files, h5) {
lapply(files, function(file) {
if (h5 && H5Fis_hdf5(file))
else file(file, open="rt")
.close <- function(cons)
.close.list <- function(cons)
for (con in cons)
.close.connection <- function(cons)
.close.H5IdComponent <- function(cons)
.colData <- function(jsonfile) {
json <- fromJSON(jsonfile)"data.frame", c(json, stringsAsFactors=FALSE))
c("character", "integer" , "numeric", "numeric", "numeric")
KALLISTO_ASSAYS <- c("est_counts", "tpm", "eff_length")
.read <- function(con)
.read.connection <- function(con)
read.delim(con, header=TRUE, colClasses=.KALLISTO_COLCLASSES, row.names=1)
.read.H5IdComponent <- function(con) {
eff_length <- h5read(con, "/aux/eff_lengths")
est_counts <- h5read(con, "/est_counts")
tpm0 <- est_counts / eff_length
data.frame(row.names=h5read(con, "/aux/ids"),
length=h5read(con, "/aux/lengths"),
tpm=tpm0 / (sum(tpm0) / 1e6))
readKallisto <-
json=file.path(dirname(files), "run_info.json"),
h5=any(grepl("\\.h5$", files)),
as=c("matrix", "list", "SummarizedExperiment"))
as <- match.arg(as)
if (missing(what))
what <- what[1]
else {
whats <- eval(formals()[["what"]])
if (!all(what %in% KALLISTO_ASSAYS))
stop("'what' must be in ",
paste(sQuote(KALLISTO_ASSAYS), collapse=", "),
test <- file.exists(files)
if (!all(test))
stop("file(s) do not exist:\n ",
paste(files[!test], collapse="\n "))
if (is.null(names(files)))
names(files) <- basename(dirname(files))
if (as != "matrix") {
stopifnot(length(files) == length(json))
if (!is.null(names(json)))
stopifnot(identical(names(json), names(files)))
names(json) <- names(files)
test <- file.exists(json)
if (!all(test))
stop("json file(s) do not exist:\n ",
paste(json[!test], collapse="\n "))
if (h5)
if (as == "SummarizedExperiment") {
if (BiocInstaller::biocVersion() >= '3.2')
else .require("GenomicRanges")
cons <- .open(files, h5)
value <- .read(cons[[1]])
rowData <- value[, .KALLISTO_ROWDATA, drop=FALSE]
assay <- matrix(0, nrow(rowData), length(cons),
dimnames=list(rownames(rowData), names(cons)))
assays <- setNames(replicate(length(what), assay, FALSE), what)
for (w in what)
assays[[w]][,1] <- value[[w]]
for (i in seq_along(cons)[-1]) {
value <- .read(cons[[i]])
if (!identical(rowData, value[, .KALLISTO_ROWDATA, drop=FALSE]))
stop("rowData differs between files:\n ",
paste(files[c(1, i)], collapse="\n "))
for (w in what)
assays[[w]][,i] <- value[[w]]
if (as != "matrix")
colData <-"rbind", lapply(json, .colData))
switch(as, matrix={
if (length(assays) == 1L)
else assays
}, list={
c(setNames(list(colData, rowData), c("colData", "rowData")), assays)
}, SummarizedExperiment={
partition <-
PartitioningByEnd(integer(nrow(rowData)), names=rownames(rowData))
rowRanges <- relist(GRanges(), partition)
mcols(rowRanges) <- as(rowData, "DataFrame")
SummarizedExperiment(assays=assays, rowRanges=rowRanges,
colData=as(colData, "DataFrame"))
.readIds <- function(con, i) {
if (!missing(i))
h5read(con, "/aux/ids", list(i))
else h5read(con, "/aux/ids")
readBootstrap <-
function(file, i, j)
stopifnot(length(file) == 1L, is.character(file))
con <- H5Fopen(file)
nboot <- as.integer(h5read(con, "/aux/num_bootstrap"))
if (nboot == 0L)
stop("file contains no bootstraps:\n ", file)
if (!missing(i) && is.character(i)) {
idx <- match(i, .readIds(con))
if (anyNA(i))
stop("unknown target id(s)", i[])
i <- idx
if (!missing(j) && is.numeric(j)) {
if (any((j < 1L) || any(j > nboot)))
stop("'j' must be >0 and <=", nboot)
j <- paste0("bs", as.integer(j) - 1L)
m <- if (missing(i) && missing(j)) {
simplify2array(h5read(con, "/bootstrap"))
} else if (missing(i)) {
query <- setNames(sprintf("/bootstrap/%s", j), j)
simplify2array(lapply(query, h5read, file=con))
} else if (missing(j)) {
group <- H5Gopen(con, "/bootstrap")
name <- h5ls(group)$name
query <- setNames(sprintf("/bootstrap/%s", name), name)
simplify2array(lapply(query, h5read, file=con, index=list(i)))
} else {
query <- setNames(sprintf("/bootstrap/%s", j), j)
simplify2array(lapply(query, h5read, file=con, index=list(i)))
rownames(m) <- .readIds(con, i)
if (missing(j)) {
o <- order(as.integer(sub("bs", "", colnames(m), fixed=TRUE)))
} else m
files <- dir("~/a/kallisto/example", "abundance.tsv", full=TRUE,
str(readKallisto(files, as="matrix"))
str(readKallisto(files, as="list"))
(se <- readKallisto(files, as="SummarizedExperiment"))
str(readKallisto(files, what="eff_length"))
readKallisto(files, what="eff_length", as="SummarizedExperiment")
files <- sub(".tsv", ".h5", files, fixed=TRUE)
str(readKallisto(files, what="tpm"))
readKallisto(files, what="tpm", as="SummarizedExperiment")
readKallisto(files, what=c("tpm", "eff_length"), as="SummarizedExperiment")
xx <- readBootstrap(files[1])
ridx <- head(which(rowSums(xx) != 0), 3)
cidx <- c(1:5, 96:100)
readBootstrap(files[1])[ridx, cidx]
readBootstrap(files[1], i=c(ridx, rev(ridx)), j=cidx)
Copy link

seandavi commented Jun 9, 2015

Hi, Martin. Thanks for putting this together. I always learn a ton from reading your code.

The check on line 114 includes both "length" and "eff_length" in rowData. I think that the "eff_length" will, in general, vary between samples. This means that for more than one sample, readKallisto always stops at line 115. So, probably rowData should include just length and the "eff_length", if you want to keep it, would need to go in another 2-D container. See my fork of this gist for a quick fix.

Copy link

mtmorgan commented Jun 10, 2015

Thanks Sean I updated so that eff_length can be specified with 'what='

Copy link

seandavi commented Jul 18, 2015

Just a note on installing into R--very easy....


Copy link

vjcitn commented Dec 16, 2015

Is there any inclination to package this code? I have obtained the fastq files underlying the RNAseqData.HNRNPC... experiment data package with the aim of comparing a count-based analysis to a Kallisto-based analysis. Would a workflow document that carefully manages the fastq plus sample-level data and takes the information downstream to a SummarizedExperiment be of interest? Or perhaps one already exists?

Copy link

vjcitn commented Dec 16, 2015

i see now that sleuth handles parsing -> shiny. there may not be much scope for packaging the parsing code ... but still, the role of bioc object discipline warrants exploration.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment