ivirshup/anndata-0-10-demo-theislab.ipynb

## anndata-0-10-demo-theislab.ipynb
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# AnnData 0.10 out-of-core (OOC) demo"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "For Theislab meeting 2023-11-02"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "%load_ext memory_profiler"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "from pathlib import Path\n",
    "import warnings\n",
    "\n",
    "import h5py\n",
    "from scipy import sparse\n",
    "\n",
    "import anndata as ad\n",
    "from anndata.experimental import read_elem, sparse_dataset\n",
    "\n",
    "warnings.filterwarnings('ignore', category=UserWarning)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "DATA_DIR = Path(\"/mnt/workspace/data/no_raw/\")\n",
    "PTHS = [DATA_DIR / p for p in [\"cd19-carT-atlas-164k.h5ad\", \"cd19-carT-atlas-185k.h5ad\", \"cd19-carT-atlas-417k.h5ad\"]]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## `concat_on_disk`"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Combining anndata on disk with a lower memory overhead:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "peak memory: 5048.45 MiB, increment: 4755.60 MiB\n"
     ]
    }
   ],
   "source": [
    "%%memit\n",
    "ad.experimental.concat_on_disk(PTHS, \"combined.h5ad\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "17G\tcombined.h5ad\n"
     ]
    }
   ],
   "source": [
    "!du -hs combined.h5ad"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## New backed interface"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "* No longer tied to a single file\n",
    "* Can work with backed sparse matrices directly more easily"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Backed with `sparse_dataset`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "def read_backed(group):\n",
    "    \"\"\"Assemble an AnnData from an open store group without reading ``X`` eagerly.\n",
    "\n",
    "    ``group[\"X\"]`` is wrapped with ``sparse_dataset`` instead of being read,\n",
    "    while each of the other standard elements is loaded with ``read_elem``\n",
    "    when the group contains it and replaced by an empty dict when it does not.\n",
    "    This notebook calls the helper with both h5py files and zarr groups.\n",
    "    \"\"\"\n",
    "    # Wrap X first so a missing \"X\" fails before any other element is read.\n",
    "    backed_x = sparse_dataset(group[\"X\"])\n",
    "    element_names = (\"layers\", \"obs\", \"var\", \"obsm\", \"varm\", \"uns\", \"obsp\", \"varp\")\n",
    "    elements = {}\n",
    "    for name in element_names:\n",
    "        elements[name] = read_elem(group[name]) if name in group else {}\n",
    "    return ad.AnnData(backed_x, **elements)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 952 ms, sys: 112 ms, total: 1.06 s\n",
      "Wall time: 1.06 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "backed_adata = read_backed(h5py.File(\"combined.h5ad\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "AnnData object with n_obs × n_vars = 766507 × 40145\n",
       "    obs: 'S_score', 'G2M_score', 'cell_cycle_phase', 'sample_id', 'CAR_expression', 'CAR_status', 'CRS max grade', '3mo PET/CT', 'ICANS group', 'prolonged cytopenia', 'response3m', 'organism_ontology_term_id', 'donor_id', 'disease_ontology_term_id', 'tissue_ontology_term_id', 'assay_ontology_term_id', 'suspension_type', 'self_reported_ethnicity_ontology_term_id', 'sex_ontology_term_id', 'development_stage_ontology_term_id', 'cell_type_ontology_term_id', 'is_primary_data', 'cell_type', 'assay', 'disease', 'organism', 'sex', 'tissue', 'self_reported_ethnicity', 'development_stage'\n",
       "    obsm: 'X_umap'"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "backed_adata"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 3.82 s, sys: 1.86 s, total: 5.68 s\n",
      "Wall time: 5.68 s\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "AnnData object with n_obs × n_vars = 176221 × 40145\n",
       "    obs: 'S_score', 'G2M_score', 'cell_cycle_phase', 'sample_id', 'CAR_expression', 'CAR_status', 'CRS max grade', '3mo PET/CT', 'ICANS group', 'prolonged cytopenia', 'response3m', 'organism_ontology_term_id', 'donor_id', 'disease_ontology_term_id', 'tissue_ontology_term_id', 'assay_ontology_term_id', 'suspension_type', 'self_reported_ethnicity_ontology_term_id', 'sex_ontology_term_id', 'development_stage_ontology_term_id', 'cell_type_ontology_term_id', 'is_primary_data', 'cell_type', 'assay', 'disease', 'organism', 'sex', 'tissue', 'self_reported_ethnicity', 'development_stage'\n",
       "    obsm: 'X_umap'"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%time\n",
    "# Select the cells annotated as European and female, then copy the selection out of the view.\n",
    "subset = backed_adata[\n",
    "    backed_adata.obs[\"self_reported_ethnicity\"].eq(\"European\")\n",
    "    & backed_adata.obs[\"sex\"].eq(\"female\")\n",
    "].copy()\n",
    "subset"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Sparse `dask` chunks\n",
    "\n",
    "**NOTE** Dask's support for sparse chunks isn't great"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "import dask.array as da\n",
    "from dask import delayed\n",
    "import zarr"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "def csr_callable(shape: tuple[int, int], dtype) -> sparse.csr_matrix:\n",
    "    if len(shape) == 0:\n",
    "        shape = (0, 0)\n",
    "    if len(shape) == 1:\n",
    "        shape = (shape[0], 0)\n",
    "    elif len(shape) == 2:\n",
    "        pass\n",
    "    else:\n",
    "        raise ValueError(shape)\n",
    "\n",
    "    return sparse.csr_matrix(shape, dtype=dtype)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "class CSRCallable:\n",
    "    \"\"\"Type-shaped wrapper around ``csr_callable``, used as dask ``meta``.\n",
    "\n",
    "    Calling ``CSRCallable(shape, dtype)`` does not create a ``CSRCallable``\n",
    "    instance: ``__new__`` returns whatever ``csr_callable`` builds for the same\n",
    "    arguments. The class exists to bypass dask checks\n",
    "    (NOTE(review): which dask check requires a class is not visible here).\n",
    "    \"\"\"\n",
    "\n",
    "    def __new__(cls, shape, dtype):\n",
    "        empty_csr = csr_callable(shape, dtype)\n",
    "        return empty_csr"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "def make_dask_chunk(x: \"SparseDataset\", start: int, end: int) -> da.Array:\n",
    "    \"\"\"Wrap the row window ``x[start:end]`` as a single lazily evaluated dask array.\n",
    "\n",
    "    The slice is taken inside a ``delayed`` call, so nothing is read from ``x``\n",
    "    when this function returns. The declared shape is ``(end - start, x.shape[1])``,\n",
    "    the dtype is ``x.dtype`` and ``CSRCallable`` is supplied as ``meta``.\n",
    "    \"\"\"\n",
    "\n",
    "    def take_slice(dataset, rows):\n",
    "        return dataset[rows]\n",
    "\n",
    "    row_window = slice(start, end)\n",
    "    lazy_rows = delayed(take_slice)(x, row_window)\n",
    "    n_rows = end - start\n",
    "    return da.from_delayed(\n",
    "        lazy_rows,\n",
    "        shape=(n_rows, x.shape[1]),\n",
    "        dtype=x.dtype,\n",
    "        meta=CSRCallable,\n",
    "    )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "def sparse_dataset_as_dask(x, stride: int):\n",
    "    \"\"\"Expose ``x`` as a dask array split into row chunks of ``stride`` rows.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    x\n",
    "        Object providing ``shape``, ``dtype`` and row slicing, as used by\n",
    "        ``make_dask_chunk``.\n",
    "    stride\n",
    "        Number of rows per chunk; must be at least 1. When the row count is not\n",
    "        a multiple of ``stride`` the last chunk holds the remaining rows.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    The per-window chunks concatenated along axis 0.\n",
    "\n",
    "    Raises\n",
    "    ------\n",
    "    ValueError\n",
    "        If ``stride`` is smaller than 1. Previously 0 surfaced as a\n",
    "        ZeroDivisionError and negative values silently produced one big chunk.\n",
    "    \"\"\"\n",
    "    if stride < 1:\n",
    "        raise ValueError(f\"stride must be a positive integer, got {stride!r}\")\n",
    "\n",
    "    n_rows = x.shape[0]\n",
    "    # range() yields every chunk start; min() trims the final, possibly shorter, chunk.\n",
    "    # NOTE(review): with zero rows no chunk is produced and da.concatenate receives an\n",
    "    # empty list, exactly as before this change.\n",
    "    chunks = [\n",
    "        make_dask_chunk(x, start, min(start + stride, n_rows))\n",
    "        for start in range(0, n_rows, stride)\n",
    "    ]\n",
    "    return da.concatenate(chunks, axis=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "def read_w_sparse_dask(group: h5py.Group | zarr.Group, obs_chunk: int = 1000) -> ad.AnnData:\n",
    "    \"\"\"Build an AnnData whose ``X`` is a dask array over the group's sparse ``X``.\n",
    "\n",
    "    ``group[\"X\"]`` is wrapped with ``sparse_dataset`` and split into row chunks of\n",
    "    ``obs_chunk`` rows by ``sparse_dataset_as_dask``. Every other standard element\n",
    "    is loaded with ``read_elem`` when present in ``group`` and defaults to an\n",
    "    empty dict otherwise.\n",
    "    \"\"\"\n",
    "    # Build X first so a missing \"X\" fails before any other element is read.\n",
    "    lazy_x = sparse_dataset_as_dask(sparse_dataset(group[\"X\"]), obs_chunk)\n",
    "    element_names = (\"layers\", \"obs\", \"var\", \"obsm\", \"varm\", \"uns\", \"obsp\", \"varp\")\n",
    "    elements = {}\n",
    "    for name in element_names:\n",
    "        elements[name] = read_elem(group[name]) if name in group else {}\n",
    "    return ad.AnnData(X=lazy_x, **elements)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 1.13 s, sys: 48.6 ms, total: 1.18 s\n",
      "Wall time: 1.17 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "dask_adatas = [read_w_sparse_dask(h5py.File(p), 10_000) for p in PTHS]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 201 ms, sys: 6.46 ms, total: 207 ms\n",
      "Wall time: 208 ms\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "combined = ad.concat(dask_adatas)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "AnnData object with n_obs × n_vars = 766507 × 40145\n",
       "    obs: 'S_score', 'G2M_score', 'cell_cycle_phase', 'sample_id', 'CAR_expression', 'CAR_status', 'CRS max grade', '3mo PET/CT', 'ICANS group', 'prolonged cytopenia', 'response3m', 'organism_ontology_term_id', 'donor_id', 'disease_ontology_term_id', 'tissue_ontology_term_id', 'assay_ontology_term_id', 'suspension_type', 'self_reported_ethnicity_ontology_term_id', 'sex_ontology_term_id', 'development_stage_ontology_term_id', 'cell_type_ontology_term_id', 'is_primary_data', 'cell_type', 'assay', 'disease', 'organism', 'sex', 'tissue', 'self_reported_ethnicity', 'development_stage'\n",
       "    obsm: 'X_umap'"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "combined"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<table>\n",
       "    <tr>\n",
       "        <td>\n",
       "            <table style=\"border-collapse: collapse;\">\n",
       "                <thead>\n",
       "                    <tr>\n",
       "                        <td> </td>\n",
       "                        <th> Array </th>\n",
       "                        <th> Chunk </th>\n",
       "                    </tr>\n",
       "                </thead>\n",
       "                <tbody>\n",
       "                    \n",
       "                    <tr>\n",
       "                        <th> Shape </th>\n",
       "                        <td> (766507, 40145) </td>\n",
       "                        <td> (10000, 40145) </td>\n",
       "                    </tr>\n",
       "                    <tr>\n",
       "                        <th> Dask graph </th>\n",
       "                        <td colspan=\"2\"> 78 chunks in 160 graph layers </td>\n",
       "                    </tr>\n",
       "                    <tr>\n",
       "                        <th> Data type </th>\n",
       "                        <td colspan=\"2\"> float32 scipy.sparse._csr.csr_matrix </td>\n",
       "                    </tr>\n",
       "                </tbody>\n",
       "            </table>\n",
       "        </td>\n",
       "        <td>\n",
       "        <svg width=\"84\" height=\"170\" style=\"stroke:rgb(0,0,0);stroke-width:1\" >\n",
       "\n",
       "  <!-- Horizontal lines -->\n",
       "  <line x1=\"0\" y1=\"0\" x2=\"34\" y2=\"0\" style=\"stroke-width:2\" />\n",
       "  <line x1=\"0\" y1=\"6\" x2=\"34\" y2=\"6\" />\n",
       "  <line x1=\"0\" y1=\"12\" x2=\"34\" y2=\"12\" />\n",
       "  <line x1=\"0\" y1=\"18\" x2=\"34\" y2=\"18\" />\n",
       "  <line x1=\"0\" y1=\"25\" x2=\"34\" y2=\"25\" />\n",
       "  <line x1=\"0\" y1=\"30\" x2=\"34\" y2=\"30\" />\n",
       "  <line x1=\"0\" y1=\"36\" x2=\"34\" y2=\"36\" />\n",
       "  <line x1=\"0\" y1=\"42\" x2=\"34\" y2=\"42\" />\n",
       "  <line x1=\"0\" y1=\"49\" x2=\"34\" y2=\"49\" />\n",
       "  <line x1=\"0\" y1=\"54\" x2=\"34\" y2=\"54\" />\n",
       "  <line x1=\"0\" y1=\"62\" x2=\"34\" y2=\"62\" />\n",
       "  <line x1=\"0\" y1=\"68\" x2=\"34\" y2=\"68\" />\n",
       "  <line x1=\"0\" y1=\"75\" x2=\"34\" y2=\"75\" />\n",
       "  <line x1=\"0\" y1=\"81\" x2=\"34\" y2=\"81\" />\n",
       "  <line x1=\"0\" y1=\"87\" x2=\"34\" y2=\"87\" />\n",
       "  <line x1=\"0\" y1=\"93\" x2=\"34\" y2=\"93\" />\n",
       "  <line x1=\"0\" y1=\"100\" x2=\"34\" y2=\"100\" />\n",
       "  <line x1=\"0\" y1=\"106\" x2=\"34\" y2=\"106\" />\n",
       "  <line x1=\"0\" y1=\"112\" x2=\"34\" y2=\"112\" />\n",
       "  <line x1=\"0\" y1=\"120\" x2=\"34\" y2=\"120\" style=\"stroke-width:2\" />\n",
       "\n",
       "  <!-- Vertical lines -->\n",
       "  <line x1=\"0\" y1=\"0\" x2=\"0\" y2=\"120\" style=\"stroke-width:2\" />\n",
       "  <line x1=\"34\" y1=\"0\" x2=\"34\" y2=\"120\" style=\"stroke-width:2\" />\n",
       "\n",
       "  <!-- Colored Rectangle -->\n",
       "  <polygon points=\"0.0,0.0 34.78482742892137,0.0 34.78482742892137,120.0 0.0,120.0\" style=\"fill:#8B4903A0;stroke-width:0\"/>\n",
       "\n",
       "  <!-- Text -->\n",
       "  <text x=\"17.392414\" y=\"140.000000\" font-size=\"1.0rem\" font-weight=\"100\" text-anchor=\"middle\" >40145</text>\n",
       "  <text x=\"54.784827\" y=\"60.000000\" font-size=\"1.0rem\" font-weight=\"100\" text-anchor=\"middle\" transform=\"rotate(-90,54.784827,60.000000)\">766507</text>\n",
       "</svg>\n",
       "        </td>\n",
       "    </tr>\n",
       "</table>"
      ],
      "text/plain": [
       "dask.array<concatenate, shape=(766507, 40145), dtype=float32, chunksize=(10000, 40145), chunktype=scipy.csr_matrix>"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "combined.X"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 151 ms, sys: 0 ns, total: 151 ms\n",
      "Wall time: 152 ms\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/mnt/workspace/mambaforge/envs/anndata-dev/lib/python3.11/site-packages/anndata/_core/index.py:158: PerformanceWarning: Slicing is producing a large chunk. To accept the large\n",
      "chunk and silence this warning, set the option\n",
      "    >>> with dask.config.set(**{'array.slicing.split_large_chunks': False}):\n",
      "    ...     array[indexer]\n",
      "\n",
      "To avoid creating the large chunks, set the option\n",
      "    >>> with dask.config.set(**{'array.slicing.split_large_chunks': True}):\n",
      "    ...     array[indexer]\n",
      "  return a[subset_idx]\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "AnnData object with n_obs × n_vars = 176221 × 40145\n",
       "    obs: 'S_score', 'G2M_score', 'cell_cycle_phase', 'sample_id', 'CAR_expression', 'CAR_status', 'CRS max grade', '3mo PET/CT', 'ICANS group', 'prolonged cytopenia', 'response3m', 'organism_ontology_term_id', 'donor_id', 'disease_ontology_term_id', 'tissue_ontology_term_id', 'assay_ontology_term_id', 'suspension_type', 'self_reported_ethnicity_ontology_term_id', 'sex_ontology_term_id', 'development_stage_ontology_term_id', 'cell_type_ontology_term_id', 'is_primary_data', 'cell_type', 'assay', 'disease', 'organism', 'sex', 'tissue', 'self_reported_ethnicity', 'development_stage'\n",
       "    obsm: 'X_umap'"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%time\n",
    "# Same European/female selection as before, now applied to the dask-backed concatenation.\n",
    "subset = combined[\n",
    "    combined.obs[\"self_reported_ethnicity\"].eq(\"European\")\n",
    "    & combined.obs[\"sex\"].eq(\"female\")\n",
    "].copy()\n",
    "\n",
    "subset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<table>\n",
       "    <tr>\n",
       "        <td>\n",
       "            <table style=\"border-collapse: collapse;\">\n",
       "                <thead>\n",
       "                    <tr>\n",
       "                        <td> </td>\n",
       "                        <th> Array </th>\n",
       "                        <th> Chunk </th>\n",
       "                    </tr>\n",
       "                </thead>\n",
       "                <tbody>\n",
       "                    \n",
       "                    <tr>\n",
       "                        <th> Shape </th>\n",
       "                        <td> (176221, 40145) </td>\n",
       "                        <td> (10000, 40145) </td>\n",
       "                    </tr>\n",
       "                    <tr>\n",
       "                        <th> Dask graph </th>\n",
       "                        <td colspan=\"2\"> 43 chunks in 161 graph layers </td>\n",
       "                    </tr>\n",
       "                    <tr>\n",
       "                        <th> Data type </th>\n",
       "                        <td colspan=\"2\"> float32 scipy.sparse._csr.csr_matrix </td>\n",
       "                    </tr>\n",
       "                </tbody>\n",
       "            </table>\n",
       "        </td>\n",
       "        <td>\n",
       "        <svg width=\"92\" height=\"170\" style=\"stroke:rgb(0,0,0);stroke-width:1\" >\n",
       "\n",
       "  <!-- Horizontal lines -->\n",
       "  <line x1=\"0\" y1=\"0\" x2=\"42\" y2=\"0\" style=\"stroke-width:2\" />\n",
       "  <line x1=\"0\" y1=\"6\" x2=\"42\" y2=\"6\" />\n",
       "  <line x1=\"0\" y1=\"12\" x2=\"42\" y2=\"12\" />\n",
       "  <line x1=\"0\" y1=\"13\" x2=\"42\" y2=\"13\" />\n",
       "  <line x1=\"0\" y1=\"19\" x2=\"42\" y2=\"19\" />\n",
       "  <line x1=\"0\" y1=\"23\" x2=\"42\" y2=\"23\" />\n",
       "  <line x1=\"0\" y1=\"26\" x2=\"42\" y2=\"26\" />\n",
       "  <line x1=\"0\" y1=\"33\" x2=\"42\" y2=\"33\" />\n",
       "  <line x1=\"0\" y1=\"43\" x2=\"42\" y2=\"43\" />\n",
       "  <line x1=\"0\" y1=\"44\" x2=\"42\" y2=\"44\" />\n",
       "  <line x1=\"0\" y1=\"49\" x2=\"42\" y2=\"49\" />\n",
       "  <line x1=\"0\" y1=\"54\" x2=\"42\" y2=\"54\" />\n",
       "  <line x1=\"0\" y1=\"64\" x2=\"42\" y2=\"64\" />\n",
       "  <line x1=\"0\" y1=\"70\" x2=\"42\" y2=\"70\" />\n",
       "  <line x1=\"0\" y1=\"81\" x2=\"42\" y2=\"81\" />\n",
       "  <line x1=\"0\" y1=\"86\" x2=\"42\" y2=\"86\" />\n",
       "  <line x1=\"0\" y1=\"99\" x2=\"42\" y2=\"99\" />\n",
       "  <line x1=\"0\" y1=\"105\" x2=\"42\" y2=\"105\" />\n",
       "  <line x1=\"0\" y1=\"110\" x2=\"42\" y2=\"110\" />\n",
       "  <line x1=\"0\" y1=\"120\" x2=\"42\" y2=\"120\" style=\"stroke-width:2\" />\n",
       "\n",
       "  <!-- Vertical lines -->\n",
       "  <line x1=\"0\" y1=\"0\" x2=\"0\" y2=\"120\" style=\"stroke-width:2\" />\n",
       "  <line x1=\"42\" y1=\"0\" x2=\"42\" y2=\"120\" style=\"stroke-width:2\" />\n",
       "\n",
       "  <!-- Colored Rectangle -->\n",
       "  <polygon points=\"0.0,0.0 42.54172029869016,0.0 42.54172029869016,120.0 0.0,120.0\" style=\"fill:#8B4903A0;stroke-width:0\"/>\n",
       "\n",
       "  <!-- Text -->\n",
       "  <text x=\"21.270860\" y=\"140.000000\" font-size=\"1.0rem\" font-weight=\"100\" text-anchor=\"middle\" >40145</text>\n",
       "  <text x=\"62.541720\" y=\"60.000000\" font-size=\"1.0rem\" font-weight=\"100\" text-anchor=\"middle\" transform=\"rotate(-90,62.541720,60.000000)\">176221</text>\n",
       "</svg>\n",
       "        </td>\n",
       "    </tr>\n",
       "</table>"
      ],
      "text/plain": [
       "dask.array<getitem, shape=(176221, 40145), dtype=float32, chunksize=(10000, 40145), chunktype=scipy.csr_matrix>"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "subset.X"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 9.55 s, sys: 7.77 s, total: 17.3 s\n",
      "Wall time: 13.4 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "subset.write_zarr(\"analysis_subset.zarr\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "And the new form of backed works with `Zarr`:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "backed_from_zarr = read_backed(zarr.open(\"analysis_subset.zarr\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "AnnData object with n_obs × n_vars = 176221 × 40145\n",
       "    obs: 'S_score', 'G2M_score', 'cell_cycle_phase', 'sample_id', 'CAR_expression', 'CAR_status', 'CRS max grade', '3mo PET/CT', 'ICANS group', 'prolonged cytopenia', 'response3m', 'organism_ontology_term_id', 'donor_id', 'disease_ontology_term_id', 'tissue_ontology_term_id', 'assay_ontology_term_id', 'suspension_type', 'self_reported_ethnicity_ontology_term_id', 'sex_ontology_term_id', 'development_stage_ontology_term_id', 'cell_type_ontology_term_id', 'is_primary_data', 'cell_type', 'assay', 'disease', 'organism', 'sex', 'tissue', 'self_reported_ethnicity', 'development_stage'\n",
       "    obsm: 'X_umap'"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "backed_from_zarr"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Is this better?"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## In memory"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "peak memory: 6226.51 MiB, increment: 0.10 MiB\n"
     ]
    }
   ],
   "source": [
    "%memit"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "peak memory: 22770.68 MiB, increment: 16544.11 MiB\n"
     ]
    }
   ],
   "source": [
    "%%memit\n",
    "# In-memory baseline: read each file in PTHS with ad.read_h5ad.\n",
    "mem_adatas = [ad.read_h5ad(p) for p in PTHS]\n",
    "# Separate name for the dataset read from the last path in PTHS.\n",
    "mem_adata = mem_adatas[-1]\n",
    "mem_adata"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "peak memory: 47432.70 MiB, increment: 24662.02 MiB\n"
     ]
    }
   ],
   "source": [
    "%%memit\n",
    "mem_combined = ad.concat(mem_adatas)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "peak memory: 54683.82 MiB, increment: 7251.12 MiB\n"
     ]
    }
   ],
   "source": [
    "%%memit\n",
    "# Same European/female selection, applied to the fully in-memory concatenation.\n",
    "subset = mem_combined[\n",
    "    mem_combined.obs[\"self_reported_ethnicity\"].eq(\"European\")\n",
    "    & mem_combined.obs[\"sex\"].eq(\"female\")\n",
    "].copy()\n",
    "subset"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "This takes about 1 min in total to load, combine, and subset. Doing the same with dask took about 12 seconds, including writing out the results."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Backed"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 1.05 s, sys: 174 ms, total: 1.22 s\n",
      "Wall time: 1.4 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "backed_adatas = [read_backed(h5py.File(p)) for p in PTHS]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "AnnData object with n_obs × n_vars = 417167 × 40145\n",
       "    obs: 'S_score', 'G2M_score', 'cell_cycle_phase', 'sample_id', 'knnCD3', 'knnCD8', 'knnCD4', 'knnIAC', 'cell_compartment', 'CAR_expression', 'CAR_status', 'CRS max grade', '3mo PET/CT', 'ICANS group', 'prolonged cytopenia', 'response3m', 'organism_ontology_term_id', 'donor_id', 'disease_ontology_term_id', 'tissue_ontology_term_id', 'assay_ontology_term_id', 'suspension_type', 'self_reported_ethnicity_ontology_term_id', 'sex_ontology_term_id', 'development_stage_ontology_term_id', 'cell_type_ontology_term_id', 'is_primary_data', 'cell_type', 'assay', 'disease', 'organism', 'sex', 'tissue', 'self_reported_ethnicity', 'development_stage'\n",
       "    var: 'highly_variable', 'means', 'variances', 'variances_norm', 'feature_is_filtered', 'feature_name', 'feature_reference', 'feature_biotype'\n",
       "    uns: 'CAR_status_colors', 'cell_compartment_colors', 'cell_cycle_phase_colors', 'response3m_colors', 'schema_version', 'title', 'umap'\n",
       "    obsm: 'X_umap'"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "backed_adata = backed_adatas[-1]\n",
    "backed_adata"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<96260x40145 sparse matrix of type '<class 'numpy.float32'>'\n",
       "\twith 254787323 stored elements in Compressed Sparse Row format>"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# European/female selection on the single backed file; only the selected rows of X are copied.\n",
    "backed_adata[\n",
    "    backed_adata.obs[\"self_reported_ethnicity\"].eq(\"European\")\n",
    "    & backed_adata.obs[\"sex\"].eq(\"female\")\n",
    "].X.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "peak memory: 53142.22 MiB, increment: 0.05 MiB\n"
     ]
    }
   ],
   "source": [
    "%memit"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# What to look out for:"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "* More backed improvements\n",
    "    * Backed dataframes\n",
    "    * Better chunking schemes for backed sparse datasets\n",
    "* Using these features downstream\n",
    "    * E.g. out-of-core `scanpy` + `rapids-singlecell` "
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "anndata-dev",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}