allen-institute-merscope

README

Setup

Use conda (or mamba) to install the TileDB Python API and the other dependencies.

mamba env create -f environment.yaml
conda activate aind-demo

Instructions

The ingest-metadata.py script requires two arguments:

  1. metadata_dir: Directory containing processed metadata for a particular mouse.
  2. array_uri: Location where the new array will be created. This can be a local file path or an S3 URI (see the example below).
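
Writing to S3 requires AWS credentials to be available in your environment; the bucket name here is a hypothetical example:

> python ingest-metadata.py data/mouse_1 s3://my-bucket/arrays/mouse01_processed_metadata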

Each subdirectory within metadata_dir represents one coronal section of the mouse brain and is named according to the following convention: 202203030920_60988223_VMSC01601, where 609882 is the mouse ID and 23 is the section number. The metadata_processed.csv.gz file within each subdirectory contains the data to be ingested.
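
The script derives both identifiers from the middle token of the directory name. Conceptually (this mirrors parse_ids_from_filename in ingest-metadata.py below):

dirname = "202203030920_60988223_VMSC01601"
full_id = dirname.split("_")[1]   # "60988223"
mouse_id = int(full_id[:6])       # 609882
section_id = int(full_id[6:])     # 23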

Example directory:

> tree data/mouse_1
data/mouse_1
├── 202202170851_60988201_VMSC01001
│   └── metadata_processed.csv.gz
├── 202202170851_609882HK01_VMSC01001
│   └── metadata_processed.csv.gz
├── 202202170855_60988202_VMSC01601
│   └── metadata_processed.csv.gz
└── 202202170915_60988203_VMSC00401
    └── metadata_processed.csv.gz
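
Subdirectories whose names don't match the expected timestamp/ID pattern, such as 202202170851_609882HK01_VMSC01001 above, are skipped. The script filters with:

csvs = [f for f in csvs if re.match(r"^\d{12}_\d{8}", f.parent.name)]

This is why only three of the four sections appear in the log output below.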

You can run the ingest-metadata.py script with the following command:

> python ingest-metadata.py data/mouse_1 data/arrays/mouse01_processed_metadata

[2022-07-07 06:41:32,967][INFO]: Loading csv 0: data/mouse_1/202202170851_60988201_VMSC01001/metadata_processed.csv.gz
[2022-07-07 06:41:35,925][INFO]: Creating array data/arrays/mouse01_processed_metadata
[2022-07-07 06:41:36,203][INFO]: Ingesting metadata for section 0
[2022-07-07 06:41:38,549][INFO]: Ingested 198093 records
[2022-07-07 06:41:38,549][INFO]: Loading csv 1: data/mouse_1/202202170855_60988202_VMSC01601/metadata_processed.csv.gz
[2022-07-07 06:41:41,272][INFO]: Ingesting metadata for section 1
[2022-07-07 06:41:43,377][INFO]: Ingested 173331 records
[2022-07-07 06:41:43,377][INFO]: Loading csv 2: data/mouse_1/202202170915_60988203_VMSC00401/metadata_processed.csv.gz
[2022-07-07 06:41:45,511][INFO]: Ingesting metadata for section 2
[2022-07-07 06:41:47,169][INFO]: Ingested 137213 records
[2022-07-07 06:41:47,170][INFO]: Finished ingesting all csv files
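
After ingestion completes, the array can be opened and sliced by its (section_id, cell_id) index. A minimal sketch (the exact columns depend on the ingested metadata):

import tiledb

with tiledb.open("data/arrays/mouse01_processed_metadata") as A:
    # fetch all cells from section 1 as a Pandas dataframe
    section_1 = A.df[1, :]
    print(section_1.head())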
environment.yaml

name: aind-demo
dependencies:
  - python<3.8
  - pip
  - numpy
  - scipy
  - pandas
  - tiledb-py
  - pip:
      - tiledb-cloud
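
Note: tiledb-py is published on conda-forge; if the environment fails to solve with your default channels, adding a channels section should help (this assumes your local channel configuration lacks conda-forge):

channels:
  - conda-forge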
ingest-metadata.py

import argparse
from typing import Tuple


def ingest_metadata(
    metadata_dir: str,
    array_uri: str,
    verbose: bool = False,
) -> None:
    """
    Ingest each coronal section's metadata into TileDB.
    """
    # Imports are local to the function (e.g., so it could be registered as a
    # self-contained TileDB Cloud UDF).
    import re
    from pathlib import Path
    import logging as log

    import numpy
    import pandas
    import tiledb

    vfs = tiledb.VFS()
    def parse_ids_from_filename(dirname: str) -> Tuple[int, int]:
        """
        Parse the mouse/section identifiers from the directory name.
        e.g., 202202170851_[mouse:609882][section:01]_VMSC01001
        """
        full_id = dirname.split("_")[1]
        # mouse ID is the first 6 digits; the section number is the remainder
        return (int(full_id[:6]), int(full_id[6:]))
    def metadata_to_dataframe(metadata_path: Path) -> pandas.DataFrame:
        """
        Parse a metadata file into a Pandas dataframe suitable for ingesting
        into TileDB.
        """
        df = pandas.read_csv(metadata_path, index_col=0)

        # remove empty (or mostly empty) columns
        df.drop(labels=["seg_qc", "fb_ROI"], axis="columns", inplace=True)

        # add mouse/section identifiers parsed from the parent directory name
        mouse_id, section_id = parse_ids_from_filename(metadata_path.parent.name)
        df["mouse_id"] = mouse_id
        df["section_id"] = section_id

        # index by section and cell
        df.index.rename("cell_id", inplace=True)
        df.reset_index(inplace=True)
        df.set_index(["section_id", "cell_id"], inplace=True)

        # tiledb.from_pandas doesn't currently support nullable var-length
        # attributes, so fill in the missing values
        for col in (
            "wb_10x_cluster_label",
            "wb_10x_class_label",
            "wb_10x_subclass_label",
            "wb_10x_broad_region",
            "wb_10x_max.region",
        ):
            df[col].fillna(value="Other", inplace=True)
        for col in (
            "wb_10x_cluster_color",
            "wb_10x_class_color",
            "wb_10x_subclass_color",
        ):
            df[col].fillna(value="#000000", inplace=True)

        return df
    # set log formatting
    format_str = "[%(asctime)s][%(levelname)s]: %(message)s"
    log_level = log.DEBUG if verbose else log.INFO
    log.basicConfig(level=log_level, format=format_str)

    # remove the array if it already exists so ingestion starts fresh
    if vfs.is_dir(array_uri):
        log.info("Array already exists. Deleting.")
        vfs.remove_dir(array_uri)

    # assert the metadata directory exists
    metadata_dir = Path(metadata_dir)
    assert metadata_dir.exists(), "Metadata directory does not exist."

    csvs = sorted(metadata_dir.glob("**/metadata_processed.csv.gz"))

    # keep only csvs in directories matching the expected naming pattern,
    # e.g., 202202170851_60988201_VMSC01001
    csvs = [f for f in csvs if re.match(r"^\d{12}_\d{8}", f.parent.name)]
    assert len(csvs) > 0, "No metadata files found in directory."
    for i, csv in enumerate(csvs):
        log.info(f"Loading csv {i}: {csv}")
        df = metadata_to_dataframe(csv)

        # TODO: remove this once UTF-8 attributes are queryable using
        # TileDB-Py's QueryCondition API.
        # Force ASCII storage for string columns so they remain queryable.
        col_types = {
            col: "ascii"
            for col in df.columns
            if df[col].dtype.name == "object"
        }

        if i == 0:
            # the first section's dataframe defines the array schema
            log.info(f"Creating array {array_uri}")
            tiledb.from_pandas(
                uri=array_uri,
                dataframe=df,
                mode="schema_only",
                sparse=True,
                allows_duplicates=False,
                full_domain=True,
                column_types=col_types,
            )
            if verbose:
                log.debug(f"Array {array_uri} schema:")
                with tiledb.open(array_uri) as A:
                    print(A.schema)

        log.info(f"Ingesting metadata for section {i}")
        tiledb.from_pandas(
            uri=array_uri,
            dataframe=df,
            mode="append",
            column_types=col_types,
        )
        log.info(f"Ingested {df.shape[0]} records")

    log.info("Finished ingesting all csv files")
def main():
    """Ingest processed metadata into TileDB."""
    p = argparse.ArgumentParser()
    p.add_argument(
        "metadata_dir",
        type=str,
        help="Directory containing processed metadata files",
    )
    p.add_argument("array_uri", type=str, help="URI for the new TileDB array")
    p.add_argument("--verbose", help="verbose logging", action="store_true")
    args = p.parse_args()

    ingest_metadata(
        metadata_dir=args.metadata_dir,
        array_uri=args.array_uri,
        verbose=args.verbose,
    )


if __name__ == "__main__":
    main()