grst/airr-json.ipynb

## airr-json.ipynb
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "b1a72797-9348-4d50-bcd8-41fdb6e50e3a",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "import json\n",
    "from itertools import islice\n",
    "from pathlib import Path\n",
    "\n",
    "import anndata\n",
    "import awkward as ak\n",
    "import ijson\n",
    "import mudata\n",
    "import pandas as pd\n",
    "import scirpy as ir\n",
    "from scipy.sparse import coo_matrix"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "68dd0c76-05eb-43ab-8160-aee2fc5b6f39",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "input_dir = Path(\"/home/sturm/Downloads/Yost-et-al-airr/\")\n",
    "output_dir = input_dir / \"out\"\n",
    "output_dir.mkdir(exist_ok=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "2f1076c4-e84b-423e-ae20-8f862d7f31e3",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "total 4.6G\n",
      "-rw-r--r-- 1 sturm sturm  744 Apr 18 16:34 AIRR-manifest.json\n",
      "-rw-r--r-- 1 sturm sturm  378 Apr 18 16:34 info.txt\n",
      "-rw-r--r-- 1 sturm sturm 2.4M Apr 18 16:31 ireceptor-public-archive-cell.json\n",
      "-rw-r--r-- 1 sturm sturm 4.6G Apr 18 16:34 ireceptor-public-archive-gex.json\n",
      "-rw-r--r-- 1 sturm sturm  45K Apr 18 16:34 ireceptor-public-archive-metadata.json\n",
      "-rw-r--r-- 1 sturm sturm  21M Apr 18 16:34 ireceptor-public-archive-rearrangement.tsv\n",
      "drwxr-xr-x 2 sturm sturm 4.0K Jun 13 19:38 out\n"
     ]
    }
   ],
   "source": [
    "!ls -lh {input_dir}"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "49add0dc-61da-418e-8c18-9c14fc034bb0",
   "metadata": {},
   "source": [
    "Let's read in the following data: \n",
    " * ireceptor-public-archive-cell.json -> Cell-level metadata\n",
    " * ireceptor-public-archive-gex.json -> Gene Expression AnnData\n",
    " * ireceptor-public-archive-rearrangement.tsv -> AIRR AnnData\n",
    " * ireceptor-public-archive-metadata.json -> Sample-level metadata"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "93285834-b121-4ceb-aab9-a27b78916336",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "def make_gex_anndata(objects, cell_info):\n",
    "    var_symbol = {}\n",
    "    var_index = {}\n",
    "    obs_index = {}\n",
    "\n",
    "    def _make_tuples():\n",
    "        for o in objects:\n",
    "            cell_id = o[\"cell_id\"]\n",
    "            ensg = o[\"property\"][\"id\"]\n",
    "            umi_count = o[\"value\"]\n",
    "            try:\n",
    "                obs_idx = obs_index[cell_id]\n",
    "            except KeyError:\n",
    "                obs_idx = obs_index[cell_id] = len(obs_index)\n",
    "            try:\n",
    "                var_idx = var_index[ensg]\n",
    "            except KeyError:\n",
    "                var_idx = var_index[ensg] = len(var_index)\n",
    "                var_symbol[ensg] = o[\"property\"][\"label\"]\n",
    "            yield umi_count, (obs_idx, var_idx)\n",
    "\n",
    "    data, indices = zip(*_make_tuples())\n",
    "    obs_indices, var_indices = zip(*indices)\n",
    "    X = coo_matrix(\n",
    "        (data, (obs_indices, var_indices)), shape=(len(obs_index), len(var_index))\n",
    "    ).tocsr()\n",
    "    var = pd.DataFrame({\"idx\": var_index, \"symbol\": var_symbol}).sort_values(\"idx\")\n",
    "    var.index = var.index.map(lambda x: x.split(\":\")[1])\n",
    "    obs = (\n",
    "        pd.DataFrame({\"idx\": obs_index})\n",
    "        .sort_values(\"idx\")\n",
    "        .join(cell_info, how=\"left\", validate=\"one_to_one\")\n",
    "    )\n",
    "    adata = anndata.AnnData(X=X, obs=obs, var=var)\n",
    "    adata.strings_to_categoricals()\n",
    "    return adata"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "4244b9ff-5232-4d29-a315-f507489ec984",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 17.7 ms, sys: 1.85 ms, total: 19.6 ms\n",
      "Wall time: 19.2 ms\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "with open(input_dir / \"ireceptor-public-archive-cell.json\", \"rb\") as f:\n",
    "    objects = ijson.items(f, \"Cell.item\")\n",
    "    cell_info = pd.DataFrame(objects).set_index(\"cell_id\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "fd6336b4-625a-4131-a7b8-6491068d6c95",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 259 µs, sys: 170 µs, total: 429 µs\n",
      "Wall time: 416 µs\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "with open(input_dir / \"ireceptor-public-archive-metadata.json\", \"rb\") as f:\n",
    "    metadata = json.load(f)\n",
    "    # metadata = ak.from_json(f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "32f907f9-a886-4178-80b9-e9914d100501",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 29.5 s, sys: 864 ms, total: 30.3 s\n",
      "Wall time: 30.4 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "with open(input_dir / \"ireceptor-public-archive-gex.json\", \"rb\") as f:\n",
    "    objects = ijson.items(f, \"CellExpression.item\")\n",
    "    adata = make_gex_anndata(objects, cell_info)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "9493fa31-3945-4d15-b3cd-5faaf3fab0f0",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 1.28 s, sys: 72.5 ms, total: 1.35 s\n",
      "Wall time: 1.33 s\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/sturm/anaconda3/envs/scirpy_dev/lib/python3.9/site-packages/anndata/_core/aligned_mapping.py:54: ExperimentalFeatureWarning: Support for Awkward Arrays is currently experimental. Behavior may change in the future. Please report any issues you may encounter!\n",
      "  warnings.warn(\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "adata_ir = ir.io.read_airr(input_dir / \"ireceptor-public-archive-rearrangement.tsv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "1135ba21-a56f-4c49-b1a0-f6f64f298a1e",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "mdata = mudata.MuData(\n",
    "    {\"gex\": adata, \"airr\": adata_ir},\n",
    "    # store full metadata as json string\n",
    "    uns={\"metadata\": json.dumps(metadata)},\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "e08b3255-7e76-4001-991d-ed06acce9f78",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 125 ms, sys: 76.4 ms, total: 202 ms\n",
      "Wall time: 201 ms\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "mdata.write_h5mu(output_dir / \"mdata.h5mu\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "8cdcff26-700b-49f8-a9b9-90f7b07a4c4b",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "total 122M\n",
      "-rw-r--r-- 1 sturm sturm 122M Jun 13 19:54 mdata.h5mu\n"
     ]
    }
   ],
   "source": [
    "!ls -lh {output_dir}"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [conda env:scirpy_dev]",
   "language": "python",
   "name": "conda-env-scirpy_dev-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": 1,
	"id": "b1a72797-9348-4d50-bcd8-41fdb6e50e3a",
	"metadata": {
	"tags": []
	},
	"outputs": [],
	"source": [
	"import json\n",
	"from itertools import islice\n",
	"from pathlib import Path\n",
	"\n",
	"import anndata\n",
	"import awkward as ak\n",
	"import ijson\n",
	"import mudata\n",
	"import pandas as pd\n",
	"import scirpy as ir\n",
	"from scipy.sparse import coo_matrix"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"id": "68dd0c76-05eb-43ab-8160-aee2fc5b6f39",
	"metadata": {
	"tags": []
	},
	"outputs": [],
	"source": [
	"input_dir = Path(\"/home/sturm/Downloads/Yost-et-al-airr/\")\n",
	"output_dir = input_dir / \"out\"\n",
	"output_dir.mkdir(exist_ok=True)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"id": "2f1076c4-e84b-423e-ae20-8f862d7f31e3",
	"metadata": {
	"tags": []
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"total 4.6G\n",
	"-rw-r--r-- 1 sturm sturm 744 Apr 18 16:34 AIRR-manifest.json\n",
	"-rw-r--r-- 1 sturm sturm 378 Apr 18 16:34 info.txt\n",
	"-rw-r--r-- 1 sturm sturm 2.4M Apr 18 16:31 ireceptor-public-archive-cell.json\n",
	"-rw-r--r-- 1 sturm sturm 4.6G Apr 18 16:34 ireceptor-public-archive-gex.json\n",
	"-rw-r--r-- 1 sturm sturm 45K Apr 18 16:34 ireceptor-public-archive-metadata.json\n",
	"-rw-r--r-- 1 sturm sturm 21M Apr 18 16:34 ireceptor-public-archive-rearrangement.tsv\n",
	"drwxr-xr-x 2 sturm sturm 4.0K Jun 13 19:38 out\n"
	]
	}
	],
	"source": [
	"!ls -lh {input_dir}"
	]
	},
	{
	"cell_type": "markdown",
	"id": "49add0dc-61da-418e-8c18-9c14fc034bb0",
	"metadata": {},
	"source": [
	"Let's read in the following data: \n",
	" * ireceptor-public-archive-cell.json -> Cell-level metadata\n",
	" * ireceptor-public-archive-gex.json -> Gene Expression AnnData\n",
	" * ireceptor-public-archive-rearrangement.tsv -> AIRR AnnData\n",
	" * ireceptor-public-archive-metadata.json -> Sample-level metadata"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"id": "93285834-b121-4ceb-aab9-a27b78916336",
	"metadata": {
	"tags": []
	},
	"outputs": [],
	"source": [
	"def make_gex_anndata(objects, cell_info):\n",
	" var_symbol = {}\n",
	" var_index = {}\n",
	" obs_index = {}\n",
	"\n",
	" def _make_tuples():\n",
	" for o in objects:\n",
	" cell_id = o[\"cell_id\"]\n",
	" ensg = o[\"property\"][\"id\"]\n",
	" umi_count = o[\"value\"]\n",
	" try:\n",
	" obs_idx = obs_index[cell_id]\n",
	" except KeyError:\n",
	" obs_idx = obs_index[cell_id] = len(obs_index)\n",
	" try:\n",
	" var_idx = var_index[ensg]\n",
	" except KeyError:\n",
	" var_idx = var_index[ensg] = len(var_index)\n",
	" var_symbol[ensg] = o[\"property\"][\"label\"]\n",
	" yield umi_count, (obs_idx, var_idx)\n",
	"\n",
	" data, indices = zip(*_make_tuples())\n",
	" obs_indices, var_indices = zip(*indices)\n",
	" X = coo_matrix(\n",
	" (data, (obs_indices, var_indices)), shape=(len(obs_index), len(var_index))\n",
	" ).tocsr()\n",
	" var = pd.DataFrame({\"idx\": var_index, \"symbol\": var_symbol}).sort_values(\"idx\")\n",
	" var.index = var.index.map(lambda x: x.split(\":\")[1])\n",
	" obs = (\n",
	" pd.DataFrame({\"idx\": obs_index})\n",
	" .sort_values(\"idx\")\n",
	" .join(cell_info, how=\"left\", validate=\"one_to_one\")\n",
	" )\n",
	" adata = anndata.AnnData(X=X, obs=obs, var=var)\n",
	" adata.strings_to_categoricals()\n",
	" return adata"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 5,
	"id": "4244b9ff-5232-4d29-a315-f507489ec984",
	"metadata": {
	"tags": []
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"CPU times: user 17.7 ms, sys: 1.85 ms, total: 19.6 ms\n",
	"Wall time: 19.2 ms\n"
	]
	}
	],
	"source": [
	"%%time\n",
	"with open(input_dir / \"ireceptor-public-archive-cell.json\", \"rb\") as f:\n",
	" objects = ijson.items(f, \"Cell.item\")\n",
	" cell_info = pd.DataFrame(objects).set_index(\"cell_id\")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 6,
	"id": "fd6336b4-625a-4131-a7b8-6491068d6c95",
	"metadata": {
	"tags": []
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"CPU times: user 259 µs, sys: 170 µs, total: 429 µs\n",
	"Wall time: 416 µs\n"
	]
	}
	],
	"source": [
	"%%time\n",
	"with open(input_dir / \"ireceptor-public-archive-metadata.json\", \"rb\") as f:\n",
	" metadata = json.load(f)\n",
	" # metadata = ak.from_json(f)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 7,
	"id": "32f907f9-a886-4178-80b9-e9914d100501",
	"metadata": {
	"tags": []
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"CPU times: user 29.5 s, sys: 864 ms, total: 30.3 s\n",
	"Wall time: 30.4 s\n"
	]
	}
	],
	"source": [
	"%%time\n",
	"with open(input_dir / \"ireceptor-public-archive-gex.json\", \"rb\") as f:\n",
	" objects = ijson.items(f, \"CellExpression.item\")\n",
	" adata = make_gex_anndata(objects, cell_info)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 8,
	"id": "9493fa31-3945-4d15-b3cd-5faaf3fab0f0",
	"metadata": {
	"tags": []
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"CPU times: user 1.28 s, sys: 72.5 ms, total: 1.35 s\n",
	"Wall time: 1.33 s\n"
	]
	},
	{
	"name": "stderr",
	"output_type": "stream",
	"text": [
	"/home/sturm/anaconda3/envs/scirpy_dev/lib/python3.9/site-packages/anndata/_core/aligned_mapping.py:54: ExperimentalFeatureWarning: Support for Awkward Arrays is currently experimental. Behavior may change in the future. Please report any issues you may encounter!\n",
	" warnings.warn(\n"
	]
	}
	],
	"source": [
	"%%time\n",
	"adata_ir = ir.io.read_airr(input_dir / \"ireceptor-public-archive-rearrangement.tsv\")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 9,
	"id": "1135ba21-a56f-4c49-b1a0-f6f64f298a1e",
	"metadata": {
	"tags": []
	},
	"outputs": [],
	"source": [
	"mdata = mudata.MuData(\n",
	" {\"gex\": adata, \"airr\": adata_ir},\n",
	" # store full metadata as json string\n",
	" uns={\"metadata\": json.dumps(metadata)},\n",
	")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 10,
	"id": "e08b3255-7e76-4001-991d-ed06acce9f78",
	"metadata": {
	"tags": []
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"CPU times: user 125 ms, sys: 76.4 ms, total: 202 ms\n",
	"Wall time: 201 ms\n"
	]
	}
	],
	"source": [
	"%%time\n",
	"mdata.write_h5mu(output_dir / \"mdata.h5mu\")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 11,
	"id": "8cdcff26-700b-49f8-a9b9-90f7b07a4c4b",
	"metadata": {
	"tags": []
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"total 122M\n",
	"-rw-r--r-- 1 sturm sturm 122M Jun 13 19:54 mdata.h5mu\n"
	]
	}
	],
	"source": [
	"!ls -lh {output_dir}"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python [conda env:scirpy_dev]",
	"language": "python",
	"name": "conda-env-scirpy_dev-py"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.9.9"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 5
	}