Skip to content

Instantly share code, notes, and snippets.

@slowkow
Last active March 20, 2023 11:17
Show Gist options
  • Star 6 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save slowkow/d3c4b77c9bf2a75f6dad4843d7d3aefc to your computer and use it in GitHub Desktop.
Save slowkow/d3c4b77c9bf2a75f6dad4843d7d3aefc to your computer and use it in GitHub Desktop.
Read Cell Ranger HDF5 .h5 files in R
# install.packages(c("Matrix", "rhdf5", "tidyverse"))
library(Matrix)
library(rhdf5)
library(tidyverse)
library(glue)
my_h5_files <- Sys.glob(
"path/to/cellranger-per-channel/output/*/filtered_feature_bc_matrix.h5"
)
my_counts_list <- lapply(my_h5_files, function(my_h5_file) {
# Each cell gets a barcode nucleotide sequence.
# e.g. "AAACCTGAGACCGGAT-1"
my_barcodes <- as.character(h5read(my_h5_file, "matrix/barcodes"))
# We'll assume that the name of the folder containing the .h5 file
# is the name of the channel (sample).
my_channel <- basename(dirname(my_h5_file))
# Prepend the sample name to the barcodes, so we can safely
# merge multiple samples into a single matrix.
my_barcodes <- sprintf("%s|%s", my_channel, my_barcodes)
# Read the data into a dgCMatrix (sparse matrix).
h5 <- h5read(my_h5_file, "matrix")
counts <- sparseMatrix(
dims = h5$shape,
i = as.numeric(h5$indices),
p = as.numeric(h5$indptr),
x = as.numeric(h5$data),
index1 = FALSE
)
colnames(counts) <- my_barcodes
rownames(counts) <- as.data.frame(h5[["features"]])$id
return(counts)
})
# Make sure all the count matrices have the same dimensions
stopifnot(length(unique(lapply(my_counts_list, dim))) == 1)
# Concatenate the matrices into one big matrix.
counts <- do.call(cbind, my_counts_list)
rm(my_counts_list)
# Compute the sparsity of the matrix.
sparsity <- 1 - length(counts@x) / (as.numeric(nrow(counts)) * ncol(counts))
n_genes <- nrow(counts)
n_cells <- ncol(counts)
print(glue("Counts matrix has {comma(n_genes)} genes and {comma(n_cells)} cells"))
print(glue("Counts matrix is {signif(100 * sparsity, 3)}% sparse"))
# We'll also want to get the corresponding gene information from the .h5 file.
genes <- h5read(my_h5_files[1], "matrix/features")
genes <- tibble(ensembl_id = genes$id, symbol = genes$name)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment