Skip to content

Instantly share code, notes, and snippets.

@aaronwolen
Last active July 28, 2022 16:31
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save aaronwolen/79a61ed370460837304b37fcd32ee86a to your computer and use it in GitHub Desktop.
Save aaronwolen/79a61ed370460837304b37fcd32ee86a to your computer and use it in GitHub Desktop.
#!/usr/bin/env Rscript
library(docopt)
# Changelog
# ---------
# 0.1.1: Consolidate fragment metadata and commits after ingestion
# 0.1.0: Initial release
# docopt ------------------------------------------------------------------
doc <- "Ingest transcript coordinates into TileDB.
Usage: txingest.R --uri <uri> [--overwrite] [--verbose] [--help] <filepath>
-h --help Show this help text
-u --uri TileDB array URI
-v --verbose Verbose output
-o --overwrite Overwrite existing array
filepath Path to transcript coordinates file
"
version <- numeric_version('0.1.0')
opt <- docopt(doc, version = sprintf("Transcript coordinates ingestor. v%s", version))
# packages ----------------------------------------------------------------
library(tools)
library(tiledb)
library(prettyunits)
library(data.table)
# validate ----------------------------------------------------------------
stopifnot(
"No file exists at 'filepath'" = file.exists(opt$filepath)
)
# setup -------------------------------------------------------------------
if (opt$overwrite) {
if (tiledb_vfs_is_dir(opt$uri)) {
if (opt$verbose) {
message(sprintf("Deleting existing array at '%s'", opt$uri))
}
tiledb_vfs_remove_dir(opt$uri)
}
}
index_cols <- c("fov", "x_local_px", "y_local_px")
# main --------------------------------------------------------------------
if (opt$verbose) {
message(sprintf("Reading transcript coordinates from '%s'", opt$filepath))
}
tbl_tx <- data.table::fread(opt$filepath, key = index_cols)
stopifnot(
"Transcript coordinates file does not contain expected columns" =
all(index_cols %in% colnames(tbl_tx))
)
# Number of observations per fov
if (opt$verbose) {
message("Number of observations per fov:")
print(tbl_tx[, .N, by = fov])
}
# # fov/x/y
# tbl_tx_dupes <- tbl_tx[duplicated(tbl_tx, by = index_cols)]
# if (nrow(tbl_tx_dupes) > 0) {
# warning(
# sprintf("Removing %i duplicate rows found in %s", nrow(tbl_tx_dupes), input_file)
# )
# tbl_tx <- unique(tbl_tx, by = index_cols)
# }
# Data is highly sparse, nnz = 5%
# smat <- with(
# subset(tbl_tx, fov == 1),
# Matrix::sparseMatrix(i = x_local_px, j = y_local_px)
# )
# Matrix::nnzero(smat) / length(smat)
if (opt$verbose) {
message(sprintf("Creating array at '%s'", opt$uri))
}
tiledb::fromDataFrame(
obj = tbl_tx,
uri = opt$uri,
col_index = index_cols,
mode = "schema_only",
# transcript data contains duplicates
allows_dups = TRUE,
sparse = TRUE
)
if (opt$verbose) {
message("Ingesting data in fov-delimited chunks")
}
start_time <- Sys.time()
fovs <- unique(tbl_tx$fov)
for (i in seq_along(fovs)) {
if (opt$verbose) message(sprintf("...ingesting fov: %i", i))
tiledb::fromDataFrame(
obj = subset(tbl_tx, fov == fovs[i]),
uri = opt$uri,
col_index = index_cols,
mode = "append"
)
}
end_time <- Sys.time()
msg <- sprintf(
"Finished ingesting transcript coordinates %f sec. Array size: %s MB",
as.numeric(difftime(end_time, start_time, units = "secs")),
round(tiledb::tiledb_vfs_dir_size(opt$uri) * 1e-6, 2)
)
if (opt$verbose) {
message(msg)
message("Array schema:")
print(tiledb::schema(opt$uri))
}
if (opt$verbose) message("Consolidating fragment metadata")
cfg <- tiledb_config(config = c(sm.consolidation.mode = "fragment_meta"))
array_consolidate(opt$uri, cfg)
if (opt$verbose) message("Consolidating commits")
cfg <- tiledb_config(config = c(sm.consolidation.mode = "commits"))
array_consolidate(opt$uri, cfg)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment