Skip to content

Instantly share code, notes, and snippets.

@grst
Created June 13, 2023 17:55
Show Gist options
  • Save grst/652f0881d8c2db71c42260d892b73cb7 to your computer and use it in GitHub Desktop.
Save grst/652f0881d8c2db71c42260d892b73cb7 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "b1a72797-9348-4d50-bcd8-41fdb6e50e3a",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"import json\n",
"from itertools import islice\n",
"from pathlib import Path\n",
"\n",
"import anndata\n",
"import awkward as ak\n",
"import ijson\n",
"import mudata\n",
"import pandas as pd\n",
"import scirpy as ir\n",
"from scipy.sparse import coo_matrix"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "68dd0c76-05eb-43ab-8160-aee2fc5b6f39",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"input_dir = Path(\"/home/sturm/Downloads/Yost-et-al-airr/\")\n",
"output_dir = input_dir / \"out\"\n",
"output_dir.mkdir(exist_ok=True)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "2f1076c4-e84b-423e-ae20-8f862d7f31e3",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"total 4.6G\n",
"-rw-r--r-- 1 sturm sturm 744 Apr 18 16:34 AIRR-manifest.json\n",
"-rw-r--r-- 1 sturm sturm 378 Apr 18 16:34 info.txt\n",
"-rw-r--r-- 1 sturm sturm 2.4M Apr 18 16:31 ireceptor-public-archive-cell.json\n",
"-rw-r--r-- 1 sturm sturm 4.6G Apr 18 16:34 ireceptor-public-archive-gex.json\n",
"-rw-r--r-- 1 sturm sturm 45K Apr 18 16:34 ireceptor-public-archive-metadata.json\n",
"-rw-r--r-- 1 sturm sturm 21M Apr 18 16:34 ireceptor-public-archive-rearrangement.tsv\n",
"drwxr-xr-x 2 sturm sturm 4.0K Jun 13 19:38 out\n"
]
}
],
"source": [
"!ls -lh {input_dir}"
]
},
{
"cell_type": "markdown",
"id": "49add0dc-61da-418e-8c18-9c14fc034bb0",
"metadata": {},
"source": [
"Let's read in the following data: \n",
" * ireceptor-public-archive-cell.json -> Cell-level metadata\n",
" * ireceptor-public-archive-gex.json -> Gene Expression AnnData\n",
" * ireceptor-public-archive-rearrangement.tsv -> AIRR AnnData\n",
" * ireceptor-public-archive-metadata.json -> Sample-level metadata"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "93285834-b121-4ceb-aab9-a27b78916336",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"def make_gex_anndata(objects, cell_info):\n",
" var_symbol = {}\n",
" var_index = {}\n",
" obs_index = {}\n",
"\n",
" def _make_tuples():\n",
" for o in objects:\n",
" cell_id = o[\"cell_id\"]\n",
" ensg = o[\"property\"][\"id\"]\n",
" umi_count = o[\"value\"]\n",
" try:\n",
" obs_idx = obs_index[cell_id]\n",
" except KeyError:\n",
" obs_idx = obs_index[cell_id] = len(obs_index)\n",
" try:\n",
" var_idx = var_index[ensg]\n",
" except KeyError:\n",
" var_idx = var_index[ensg] = len(var_index)\n",
" var_symbol[ensg] = o[\"property\"][\"label\"]\n",
" yield umi_count, (obs_idx, var_idx)\n",
"\n",
" data, indices = zip(*_make_tuples())\n",
" obs_indices, var_indices = zip(*indices)\n",
" X = coo_matrix(\n",
" (data, (obs_indices, var_indices)), shape=(len(obs_index), len(var_index))\n",
" ).tocsr()\n",
" var = pd.DataFrame({\"idx\": var_index, \"symbol\": var_symbol}).sort_values(\"idx\")\n",
" var.index = var.index.map(lambda x: x.split(\":\")[1])\n",
" obs = (\n",
" pd.DataFrame({\"idx\": obs_index})\n",
" .sort_values(\"idx\")\n",
" .join(cell_info, how=\"left\", validate=\"one_to_one\")\n",
" )\n",
" adata = anndata.AnnData(X=X, obs=obs, var=var)\n",
" adata.strings_to_categoricals()\n",
" return adata"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "4244b9ff-5232-4d29-a315-f507489ec984",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 17.7 ms, sys: 1.85 ms, total: 19.6 ms\n",
"Wall time: 19.2 ms\n"
]
}
],
"source": [
"%%time\n",
"with open(input_dir / \"ireceptor-public-archive-cell.json\", \"rb\") as f:\n",
" objects = ijson.items(f, \"Cell.item\")\n",
" cell_info = pd.DataFrame(objects).set_index(\"cell_id\")"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "fd6336b4-625a-4131-a7b8-6491068d6c95",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 259 µs, sys: 170 µs, total: 429 µs\n",
"Wall time: 416 µs\n"
]
}
],
"source": [
"%%time\n",
"with open(input_dir / \"ireceptor-public-archive-metadata.json\", \"rb\") as f:\n",
" metadata = json.load(f)\n",
" # metadata = ak.from_json(f)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "32f907f9-a886-4178-80b9-e9914d100501",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 29.5 s, sys: 864 ms, total: 30.3 s\n",
"Wall time: 30.4 s\n"
]
}
],
"source": [
"%%time\n",
"with open(input_dir / \"ireceptor-public-archive-gex.json\", \"rb\") as f:\n",
" objects = ijson.items(f, \"CellExpression.item\")\n",
" adata = make_gex_anndata(objects, cell_info)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "9493fa31-3945-4d15-b3cd-5faaf3fab0f0",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 1.28 s, sys: 72.5 ms, total: 1.35 s\n",
"Wall time: 1.33 s\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/sturm/anaconda3/envs/scirpy_dev/lib/python3.9/site-packages/anndata/_core/aligned_mapping.py:54: ExperimentalFeatureWarning: Support for Awkward Arrays is currently experimental. Behavior may change in the future. Please report any issues you may encounter!\n",
" warnings.warn(\n"
]
}
],
"source": [
"%%time\n",
"adata_ir = ir.io.read_airr(input_dir / \"ireceptor-public-archive-rearrangement.tsv\")"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "1135ba21-a56f-4c49-b1a0-f6f64f298a1e",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"mdata = mudata.MuData(\n",
" {\"gex\": adata, \"airr\": adata_ir},\n",
" # store full metadata as json string\n",
" uns={\"metadata\": json.dumps(metadata)},\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "e08b3255-7e76-4001-991d-ed06acce9f78",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 125 ms, sys: 76.4 ms, total: 202 ms\n",
"Wall time: 201 ms\n"
]
}
],
"source": [
"%%time\n",
"mdata.write_h5mu(output_dir / \"mdata.h5mu\")"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "8cdcff26-700b-49f8-a9b9-90f7b07a4c4b",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"total 122M\n",
"-rw-r--r-- 1 sturm sturm 122M Jun 13 19:54 mdata.h5mu\n"
]
}
],
"source": [
"!ls -lh {output_dir}"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python [conda env:scirpy_dev]",
"language": "python",
"name": "conda-env-scirpy_dev-py"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.9"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
@grst
Copy link
Author

grst commented Jul 23, 2023

This is a prototype of a streaming parser of the AIRR JSON format leveraging the ijson library.

It takes ~35 seconds on my laptop to create a mudata object including gene expression and
AIRR data on my laptop.

What is currently not handled properly is the sample-level metadata. It is just dumped as a json string
in mudata.uns["metadata"]​. Ann/MuData doesn't have the concept of having sample rather than cell metadata.
I think for now the best solution would be to transfer the metadata of interest (say, the patient's sex and age) to
adata.obs​ as cell-level metadata. Since this will be stored as categoricals, it will be quite space efficent, also for larger datasets.

@grst
Copy link
Author

grst commented Jul 23, 2023

Tagging @bcorrie here, since I'm still unsure if you received my email.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment