Created
November 2, 2023 14:06
-
-
Save ivirshup/c29c9fb0b5b21a9c290cf621e4e68b18 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# AnnData 0.10 OOC demo" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"For Theislab meeting 2023-11-02" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"%load_ext memory_profiler" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"from pathlib import Path\n", | |
"import warnings\n", | |
"\n", | |
"import h5py\n", | |
"from scipy import sparse\n", | |
"\n", | |
"import anndata as ad\n", | |
"from anndata.experimental import read_elem, sparse_dataset\n", | |
"\n", | |
"warnings.filterwarnings('ignore', category=UserWarning)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"DATA_DIR = Path(\"/mnt/workspace/data/no_raw/\")\n", | |
"PTHS = [DATA_DIR / p for p in [\"cd19-carT-atlas-164k.h5ad\", \"cd19-carT-atlas-185k.h5ad\", \"cd19-carT-atlas-417k.h5ad\"]]" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## `concat_on_disk`" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Combining anndata on disk with a lower memory overhead:" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"peak memory: 5048.45 MiB, increment: 4755.60 MiB\n" | |
] | |
} | |
], | |
"source": [ | |
"%%memit\n", | |
"ad.experimental.concat_on_disk(PTHS, \"combined.h5ad\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"17G\tcombined.h5ad\n" | |
] | |
} | |
], | |
"source": [ | |
"!du -hs combined.h5ad" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## New backed interface" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"* No longer tied to a single file\n", | |
"* Can work with backed sparse matrices directly more easily" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Backed with `sparse_dataset`" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def read_backed(group):\n", | |
" return ad.AnnData(\n", | |
" sparse_dataset(group[\"X\"]),\n", | |
" **{\n", | |
" k: read_elem(group[k]) if k in group else {}\n", | |
" for k in [\"layers\", \"obs\", \"var\", \"obsm\", \"varm\", \"uns\", \"obsp\", \"varp\"]\n", | |
" }\n", | |
" )" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 952 ms, sys: 112 ms, total: 1.06 s\n", | |
"Wall time: 1.06 s\n" | |
] | |
} | |
], | |
"source": [ | |
"%%time\n", | |
"backed_adata = read_backed(h5py.File(\"combined.h5ad\"))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"AnnData object with n_obs × n_vars = 766507 × 40145\n", | |
" obs: 'S_score', 'G2M_score', 'cell_cycle_phase', 'sample_id', 'CAR_expression', 'CAR_status', 'CRS max grade', '3mo PET/CT', 'ICANS group', 'prolonged cytopenia', 'response3m', 'organism_ontology_term_id', 'donor_id', 'disease_ontology_term_id', 'tissue_ontology_term_id', 'assay_ontology_term_id', 'suspension_type', 'self_reported_ethnicity_ontology_term_id', 'sex_ontology_term_id', 'development_stage_ontology_term_id', 'cell_type_ontology_term_id', 'is_primary_data', 'cell_type', 'assay', 'disease', 'organism', 'sex', 'tissue', 'self_reported_ethnicity', 'development_stage'\n", | |
" obsm: 'X_umap'" | |
] | |
}, | |
"execution_count": 8, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"backed_adata" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 3.82 s, sys: 1.86 s, total: 5.68 s\n", | |
"Wall time: 5.68 s\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/plain": [ | |
"AnnData object with n_obs × n_vars = 176221 × 40145\n", | |
" obs: 'S_score', 'G2M_score', 'cell_cycle_phase', 'sample_id', 'CAR_expression', 'CAR_status', 'CRS max grade', '3mo PET/CT', 'ICANS group', 'prolonged cytopenia', 'response3m', 'organism_ontology_term_id', 'donor_id', 'disease_ontology_term_id', 'tissue_ontology_term_id', 'assay_ontology_term_id', 'suspension_type', 'self_reported_ethnicity_ontology_term_id', 'sex_ontology_term_id', 'development_stage_ontology_term_id', 'cell_type_ontology_term_id', 'is_primary_data', 'cell_type', 'assay', 'disease', 'organism', 'sex', 'tissue', 'self_reported_ethnicity', 'development_stage'\n", | |
" obsm: 'X_umap'" | |
] | |
}, | |
"execution_count": 9, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"%%time\n", | |
"subset = backed_adata[\n", | |
" (backed_adata.obs[\"self_reported_ethnicity\"] == \"European\") & \\\n", | |
" (backed_adata.obs[\"sex\"] == \"female\")\n", | |
"].copy()\n", | |
"subset" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Sparse `dask` chunks\n", | |
"\n", | |
"**NOTE:** Dask's support for sparse chunks is still limited, so the helpers below include a workaround (`CSRCallable`) to get past its checks." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import dask.array as da\n", | |
"from dask import delayed\n", | |
"import zarr" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def csr_callable(shape: tuple[int, int], dtype) -> sparse.csr_matrix:\n", | |
" if len(shape) == 0:\n", | |
" shape = (0, 0)\n", | |
" if len(shape) == 1:\n", | |
" shape = (shape[0], 0)\n", | |
" elif len(shape) == 2:\n", | |
" pass\n", | |
" else:\n", | |
" raise ValueError(shape)\n", | |
"\n", | |
" return sparse.csr_matrix(shape, dtype=dtype)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"class CSRCallable:\n", | |
" \"\"\"Dummy class to bypass dask checks\"\"\"\n", | |
" def __new__(cls, shape, dtype):\n", | |
" return csr_callable(shape, dtype)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def make_dask_chunk(x: \"SparseDataset\", start: int, end: int) -> da.Array:\n", | |
" def take_slice(x, idx):\n", | |
" return x[idx]\n", | |
"\n", | |
" return da.from_delayed(\n", | |
" delayed(take_slice)(x, slice(start, end)),\n", | |
" dtype=x.dtype,\n", | |
" shape=(end - start, x.shape[1]),\n", | |
" meta=CSRCallable,\n", | |
" )" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 14, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def sparse_dataset_as_dask(x, stride: int):\n", | |
" n_chunks, rem = divmod(x.shape[0], stride)\n", | |
"\n", | |
" chunks = []\n", | |
" cur_pos = 0\n", | |
" for i in range(n_chunks):\n", | |
" chunks.append(make_dask_chunk(x, cur_pos, cur_pos + stride))\n", | |
" cur_pos += stride\n", | |
" if rem:\n", | |
" chunks.append(make_dask_chunk(x, cur_pos, x.shape[0]))\n", | |
"\n", | |
" return da.concatenate(chunks, axis=0)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 15, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def read_w_sparse_dask(group: h5py.Group | zarr.Group, obs_chunk: int = 1000) -> ad.AnnData:\n", | |
" return ad.AnnData(\n", | |
" X=sparse_dataset_as_dask(sparse_dataset(group[\"X\"]), obs_chunk),\n", | |
" **{\n", | |
" k: read_elem(group[k]) if k in group else {}\n", | |
" for k in [\"layers\", \"obs\", \"var\", \"obsm\", \"varm\", \"uns\", \"obsp\", \"varp\"]\n", | |
" }\n", | |
" )" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 16, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 1.13 s, sys: 48.6 ms, total: 1.18 s\n", | |
"Wall time: 1.17 s\n" | |
] | |
} | |
], | |
"source": [ | |
"%%time\n", | |
"dask_adatas = [read_w_sparse_dask(h5py.File(p), 10_000) for p in PTHS]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 17, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 201 ms, sys: 6.46 ms, total: 207 ms\n", | |
"Wall time: 208 ms\n" | |
] | |
} | |
], | |
"source": [ | |
"%%time\n", | |
"combined = ad.concat(dask_adatas)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 18, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"AnnData object with n_obs × n_vars = 766507 × 40145\n", | |
" obs: 'S_score', 'G2M_score', 'cell_cycle_phase', 'sample_id', 'CAR_expression', 'CAR_status', 'CRS max grade', '3mo PET/CT', 'ICANS group', 'prolonged cytopenia', 'response3m', 'organism_ontology_term_id', 'donor_id', 'disease_ontology_term_id', 'tissue_ontology_term_id', 'assay_ontology_term_id', 'suspension_type', 'self_reported_ethnicity_ontology_term_id', 'sex_ontology_term_id', 'development_stage_ontology_term_id', 'cell_type_ontology_term_id', 'is_primary_data', 'cell_type', 'assay', 'disease', 'organism', 'sex', 'tissue', 'self_reported_ethnicity', 'development_stage'\n", | |
" obsm: 'X_umap'" | |
] | |
}, | |
"execution_count": 18, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"combined" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 19, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<table>\n", | |
" <tr>\n", | |
" <td>\n", | |
" <table style=\"border-collapse: collapse;\">\n", | |
" <thead>\n", | |
" <tr>\n", | |
" <td> </td>\n", | |
" <th> Array </th>\n", | |
" <th> Chunk </th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" \n", | |
" <tr>\n", | |
" <th> Shape </th>\n", | |
" <td> (766507, 40145) </td>\n", | |
" <td> (10000, 40145) </td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th> Dask graph </th>\n", | |
" <td colspan=\"2\"> 78 chunks in 160 graph layers </td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th> Data type </th>\n", | |
" <td colspan=\"2\"> float32 scipy.sparse._csr.csr_matrix </td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
" </table>\n", | |
" </td>\n", | |
" <td>\n", | |
" <svg width=\"84\" height=\"170\" style=\"stroke:rgb(0,0,0);stroke-width:1\" >\n", | |
"\n", | |
" <!-- Horizontal lines -->\n", | |
" <line x1=\"0\" y1=\"0\" x2=\"34\" y2=\"0\" style=\"stroke-width:2\" />\n", | |
" <line x1=\"0\" y1=\"6\" x2=\"34\" y2=\"6\" />\n", | |
" <line x1=\"0\" y1=\"12\" x2=\"34\" y2=\"12\" />\n", | |
" <line x1=\"0\" y1=\"18\" x2=\"34\" y2=\"18\" />\n", | |
" <line x1=\"0\" y1=\"25\" x2=\"34\" y2=\"25\" />\n", | |
" <line x1=\"0\" y1=\"30\" x2=\"34\" y2=\"30\" />\n", | |
" <line x1=\"0\" y1=\"36\" x2=\"34\" y2=\"36\" />\n", | |
" <line x1=\"0\" y1=\"42\" x2=\"34\" y2=\"42\" />\n", | |
" <line x1=\"0\" y1=\"49\" x2=\"34\" y2=\"49\" />\n", | |
" <line x1=\"0\" y1=\"54\" x2=\"34\" y2=\"54\" />\n", | |
" <line x1=\"0\" y1=\"62\" x2=\"34\" y2=\"62\" />\n", | |
" <line x1=\"0\" y1=\"68\" x2=\"34\" y2=\"68\" />\n", | |
" <line x1=\"0\" y1=\"75\" x2=\"34\" y2=\"75\" />\n", | |
" <line x1=\"0\" y1=\"81\" x2=\"34\" y2=\"81\" />\n", | |
" <line x1=\"0\" y1=\"87\" x2=\"34\" y2=\"87\" />\n", | |
" <line x1=\"0\" y1=\"93\" x2=\"34\" y2=\"93\" />\n", | |
" <line x1=\"0\" y1=\"100\" x2=\"34\" y2=\"100\" />\n", | |
" <line x1=\"0\" y1=\"106\" x2=\"34\" y2=\"106\" />\n", | |
" <line x1=\"0\" y1=\"112\" x2=\"34\" y2=\"112\" />\n", | |
" <line x1=\"0\" y1=\"120\" x2=\"34\" y2=\"120\" style=\"stroke-width:2\" />\n", | |
"\n", | |
" <!-- Vertical lines -->\n", | |
" <line x1=\"0\" y1=\"0\" x2=\"0\" y2=\"120\" style=\"stroke-width:2\" />\n", | |
" <line x1=\"34\" y1=\"0\" x2=\"34\" y2=\"120\" style=\"stroke-width:2\" />\n", | |
"\n", | |
" <!-- Colored Rectangle -->\n", | |
" <polygon points=\"0.0,0.0 34.78482742892137,0.0 34.78482742892137,120.0 0.0,120.0\" style=\"fill:#8B4903A0;stroke-width:0\"/>\n", | |
"\n", | |
" <!-- Text -->\n", | |
" <text x=\"17.392414\" y=\"140.000000\" font-size=\"1.0rem\" font-weight=\"100\" text-anchor=\"middle\" >40145</text>\n", | |
" <text x=\"54.784827\" y=\"60.000000\" font-size=\"1.0rem\" font-weight=\"100\" text-anchor=\"middle\" transform=\"rotate(-90,54.784827,60.000000)\">766507</text>\n", | |
"</svg>\n", | |
" </td>\n", | |
" </tr>\n", | |
"</table>" | |
], | |
"text/plain": [ | |
"dask.array<concatenate, shape=(766507, 40145), dtype=float32, chunksize=(10000, 40145), chunktype=scipy.csr_matrix>" | |
] | |
}, | |
"execution_count": 19, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"combined.X" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 20, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 151 ms, sys: 0 ns, total: 151 ms\n", | |
"Wall time: 152 ms\n" | |
] | |
}, | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"/mnt/workspace/mambaforge/envs/anndata-dev/lib/python3.11/site-packages/anndata/_core/index.py:158: PerformanceWarning: Slicing is producing a large chunk. To accept the large\n", | |
"chunk and silence this warning, set the option\n", | |
" >>> with dask.config.set(**{'array.slicing.split_large_chunks': False}):\n", | |
" ... array[indexer]\n", | |
"\n", | |
"To avoid creating the large chunks, set the option\n", | |
" >>> with dask.config.set(**{'array.slicing.split_large_chunks': True}):\n", | |
" ... array[indexer]\n", | |
" return a[subset_idx]\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/plain": [ | |
"AnnData object with n_obs × n_vars = 176221 × 40145\n", | |
" obs: 'S_score', 'G2M_score', 'cell_cycle_phase', 'sample_id', 'CAR_expression', 'CAR_status', 'CRS max grade', '3mo PET/CT', 'ICANS group', 'prolonged cytopenia', 'response3m', 'organism_ontology_term_id', 'donor_id', 'disease_ontology_term_id', 'tissue_ontology_term_id', 'assay_ontology_term_id', 'suspension_type', 'self_reported_ethnicity_ontology_term_id', 'sex_ontology_term_id', 'development_stage_ontology_term_id', 'cell_type_ontology_term_id', 'is_primary_data', 'cell_type', 'assay', 'disease', 'organism', 'sex', 'tissue', 'self_reported_ethnicity', 'development_stage'\n", | |
" obsm: 'X_umap'" | |
] | |
}, | |
"execution_count": 20, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"%%time\n", | |
"subset = combined[\n", | |
" (combined.obs[\"self_reported_ethnicity\"] == \"European\") & \\\n", | |
" (combined.obs[\"sex\"] == \"female\")\n", | |
"].copy()\n", | |
"\n", | |
"subset" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 21, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<table>\n", | |
" <tr>\n", | |
" <td>\n", | |
" <table style=\"border-collapse: collapse;\">\n", | |
" <thead>\n", | |
" <tr>\n", | |
" <td> </td>\n", | |
" <th> Array </th>\n", | |
" <th> Chunk </th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" \n", | |
" <tr>\n", | |
" <th> Shape </th>\n", | |
" <td> (176221, 40145) </td>\n", | |
" <td> (10000, 40145) </td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th> Dask graph </th>\n", | |
" <td colspan=\"2\"> 43 chunks in 161 graph layers </td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th> Data type </th>\n", | |
" <td colspan=\"2\"> float32 scipy.sparse._csr.csr_matrix </td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
" </table>\n", | |
" </td>\n", | |
" <td>\n", | |
" <svg width=\"92\" height=\"170\" style=\"stroke:rgb(0,0,0);stroke-width:1\" >\n", | |
"\n", | |
" <!-- Horizontal lines -->\n", | |
" <line x1=\"0\" y1=\"0\" x2=\"42\" y2=\"0\" style=\"stroke-width:2\" />\n", | |
" <line x1=\"0\" y1=\"6\" x2=\"42\" y2=\"6\" />\n", | |
" <line x1=\"0\" y1=\"12\" x2=\"42\" y2=\"12\" />\n", | |
" <line x1=\"0\" y1=\"13\" x2=\"42\" y2=\"13\" />\n", | |
" <line x1=\"0\" y1=\"19\" x2=\"42\" y2=\"19\" />\n", | |
" <line x1=\"0\" y1=\"23\" x2=\"42\" y2=\"23\" />\n", | |
" <line x1=\"0\" y1=\"26\" x2=\"42\" y2=\"26\" />\n", | |
" <line x1=\"0\" y1=\"33\" x2=\"42\" y2=\"33\" />\n", | |
" <line x1=\"0\" y1=\"43\" x2=\"42\" y2=\"43\" />\n", | |
" <line x1=\"0\" y1=\"44\" x2=\"42\" y2=\"44\" />\n", | |
" <line x1=\"0\" y1=\"49\" x2=\"42\" y2=\"49\" />\n", | |
" <line x1=\"0\" y1=\"54\" x2=\"42\" y2=\"54\" />\n", | |
" <line x1=\"0\" y1=\"64\" x2=\"42\" y2=\"64\" />\n", | |
" <line x1=\"0\" y1=\"70\" x2=\"42\" y2=\"70\" />\n", | |
" <line x1=\"0\" y1=\"81\" x2=\"42\" y2=\"81\" />\n", | |
" <line x1=\"0\" y1=\"86\" x2=\"42\" y2=\"86\" />\n", | |
" <line x1=\"0\" y1=\"99\" x2=\"42\" y2=\"99\" />\n", | |
" <line x1=\"0\" y1=\"105\" x2=\"42\" y2=\"105\" />\n", | |
" <line x1=\"0\" y1=\"110\" x2=\"42\" y2=\"110\" />\n", | |
" <line x1=\"0\" y1=\"120\" x2=\"42\" y2=\"120\" style=\"stroke-width:2\" />\n", | |
"\n", | |
" <!-- Vertical lines -->\n", | |
" <line x1=\"0\" y1=\"0\" x2=\"0\" y2=\"120\" style=\"stroke-width:2\" />\n", | |
" <line x1=\"42\" y1=\"0\" x2=\"42\" y2=\"120\" style=\"stroke-width:2\" />\n", | |
"\n", | |
" <!-- Colored Rectangle -->\n", | |
" <polygon points=\"0.0,0.0 42.54172029869016,0.0 42.54172029869016,120.0 0.0,120.0\" style=\"fill:#8B4903A0;stroke-width:0\"/>\n", | |
"\n", | |
" <!-- Text -->\n", | |
" <text x=\"21.270860\" y=\"140.000000\" font-size=\"1.0rem\" font-weight=\"100\" text-anchor=\"middle\" >40145</text>\n", | |
" <text x=\"62.541720\" y=\"60.000000\" font-size=\"1.0rem\" font-weight=\"100\" text-anchor=\"middle\" transform=\"rotate(-90,62.541720,60.000000)\">176221</text>\n", | |
"</svg>\n", | |
" </td>\n", | |
" </tr>\n", | |
"</table>" | |
], | |
"text/plain": [ | |
"dask.array<getitem, shape=(176221, 40145), dtype=float32, chunksize=(10000, 40145), chunktype=scipy.csr_matrix>" | |
] | |
}, | |
"execution_count": 21, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"subset.X" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 22, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 9.55 s, sys: 7.77 s, total: 17.3 s\n", | |
"Wall time: 13.4 s\n" | |
] | |
} | |
], | |
"source": [ | |
"%%time\n", | |
"subset.write_zarr(\"analysis_subset.zarr\")" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"The new backed interface also works with `zarr` stores:" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 23, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"backed_from_zarr = read_backed(zarr.open(\"analysis_subset.zarr\"))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 24, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"AnnData object with n_obs × n_vars = 176221 × 40145\n", | |
" obs: 'S_score', 'G2M_score', 'cell_cycle_phase', 'sample_id', 'CAR_expression', 'CAR_status', 'CRS max grade', '3mo PET/CT', 'ICANS group', 'prolonged cytopenia', 'response3m', 'organism_ontology_term_id', 'donor_id', 'disease_ontology_term_id', 'tissue_ontology_term_id', 'assay_ontology_term_id', 'suspension_type', 'self_reported_ethnicity_ontology_term_id', 'sex_ontology_term_id', 'development_stage_ontology_term_id', 'cell_type_ontology_term_id', 'is_primary_data', 'cell_type', 'assay', 'disease', 'organism', 'sex', 'tissue', 'self_reported_ethnicity', 'development_stage'\n", | |
" obsm: 'X_umap'" | |
] | |
}, | |
"execution_count": 24, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"backed_from_zarr" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Is this better?" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## In memory" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 25, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"peak memory: 6226.51 MiB, increment: 0.10 MiB\n" | |
] | |
} | |
], | |
"source": [ | |
"%memit" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 26, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"peak memory: 22770.68 MiB, increment: 16544.11 MiB\n" | |
] | |
} | |
], | |
"source": [ | |
"%%memit\n", | |
"mem_adatas = [ad.read_h5ad(p) for p in PTHS]\n", | |
"mem_adata = mem_adatas[-1]\n", | |
"mem_adata" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 27, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"peak memory: 47432.70 MiB, increment: 24662.02 MiB\n" | |
] | |
} | |
], | |
"source": [ | |
"%%memit\n", | |
"mem_combined = ad.concat(mem_adatas)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 28, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"peak memory: 54683.82 MiB, increment: 7251.12 MiB\n" | |
] | |
} | |
], | |
"source": [ | |
"%%memit\n", | |
"subset = mem_combined[\n", | |
" (mem_combined.obs[\"self_reported_ethnicity\"] == \"European\") & \\\n", | |
" (mem_combined.obs[\"sex\"] == \"female\")\n", | |
"].copy()\n", | |
"subset" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"In memory, loading, combining, and subsetting take about 1 min in total. With dask, the same steps plus writing out the results took about 12 seconds." | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Backed" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 29, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 1.05 s, sys: 174 ms, total: 1.22 s\n", | |
"Wall time: 1.4 s\n" | |
] | |
} | |
], | |
"source": [ | |
"%%time\n", | |
"backed_adatas = [read_backed(h5py.File(p)) for p in PTHS]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 30, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"AnnData object with n_obs × n_vars = 417167 × 40145\n", | |
" obs: 'S_score', 'G2M_score', 'cell_cycle_phase', 'sample_id', 'knnCD3', 'knnCD8', 'knnCD4', 'knnIAC', 'cell_compartment', 'CAR_expression', 'CAR_status', 'CRS max grade', '3mo PET/CT', 'ICANS group', 'prolonged cytopenia', 'response3m', 'organism_ontology_term_id', 'donor_id', 'disease_ontology_term_id', 'tissue_ontology_term_id', 'assay_ontology_term_id', 'suspension_type', 'self_reported_ethnicity_ontology_term_id', 'sex_ontology_term_id', 'development_stage_ontology_term_id', 'cell_type_ontology_term_id', 'is_primary_data', 'cell_type', 'assay', 'disease', 'organism', 'sex', 'tissue', 'self_reported_ethnicity', 'development_stage'\n", | |
" var: 'highly_variable', 'means', 'variances', 'variances_norm', 'feature_is_filtered', 'feature_name', 'feature_reference', 'feature_biotype'\n", | |
" uns: 'CAR_status_colors', 'cell_compartment_colors', 'cell_cycle_phase_colors', 'response3m_colors', 'schema_version', 'title', 'umap'\n", | |
" obsm: 'X_umap'" | |
] | |
}, | |
"execution_count": 30, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"backed_adata = backed_adatas[-1]\n", | |
"backed_adata" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 31, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"<96260x40145 sparse matrix of type '<class 'numpy.float32'>'\n", | |
"\twith 254787323 stored elements in Compressed Sparse Row format>" | |
] | |
}, | |
"execution_count": 31, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"backed_adata[\n", | |
" (backed_adata.obs[\"self_reported_ethnicity\"] == \"European\") & \\\n", | |
" (backed_adata.obs[\"sex\"] == \"female\")\n", | |
"].X.copy()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 32, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"peak memory: 53142.22 MiB, increment: 0.05 MiB\n" | |
] | |
} | |
], | |
"source": [ | |
"%memit" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# What to look out for:" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"* More backed improvements\n", | |
"    * Backed dataframes\n", | |
"    * Better chunking schemes for backed sparse datasets\n", | |
"* Using these features downstream\n", | |
"    * E.g. out-of-core `scanpy` + `rapids-singlecell`" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "anndata-dev", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.11.6" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment