Skip to content

Instantly share code, notes, and snippets.

@ivirshup
Created July 2, 2024 22:51
Show Gist options
  • Save ivirshup/8500d9a874ea9313ca87c0d5e46886e9 to your computer and use it in GitHub Desktop.
Save ivirshup/8500d9a874ea9313ca87c0d5e46886e9 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Access census with dask"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/ubuntu/miniforge3/envs/cellxgene-census-dev/lib/python3.11/site-packages/tiledb/cloud/config.py:96: UserWarning: You must first login before you can run commands. Please run tiledb.cloud.login.\n",
" warnings.warn(\n"
]
}
],
"source": [
"import multiprocessing\n",
"\n",
"multiprocessing.set_start_method(\"spawn\")\n",
"\n",
"from pathlib import Path\n",
"\n",
"import tiledb\n",
"import cellxgene_census\n",
"from tiledbsoma import SOMATileDBContext\n",
"\n",
"from dask.array.core import normalize_chunks, slices_from_chunks\n",
"import dask.array as da\n",
"from dask import delayed\n",
"import dask.distributed as dd\n",
"\n",
"from scipy import sparse\n",
"import numpy as np\n",
"\n",
"import anndata as ad, scanpy as sc\n",
"import zarr"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"SPARSE_CHUNK_SIZE = 10_000\n",
"DENSE_CHUNK_SIZE = 5_000"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/ubuntu/miniforge3/envs/cellxgene-census-dev/lib/python3.11/site-packages/distributed/node.py:182: UserWarning: Port 8787 is already in use.\n",
"Perhaps you already have a cluster running?\n",
"Hosting the HTTP server on port 36497 instead\n",
" warnings.warn(\n",
"/home/ubuntu/miniforge3/envs/cellxgene-census-dev/lib/python3.11/contextlib.py:144: UserWarning: Creating scratch directories is taking a surprisingly long time. (2.76s) This is often due to running workers on a network file system. Consider specifying a local-directory to point workers to write scratch data to a local disk.\n",
" next(self.gen)\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
" <div style=\"width: 24px; height: 24px; background-color: #e1e1e1; border: 3px solid #9D9D9D; border-radius: 5px; position: absolute;\"> </div>\n",
" <div style=\"margin-left: 48px;\">\n",
" <h3 style=\"margin-bottom: 0px;\">Client</h3>\n",
" <p style=\"color: #9D9D9D; margin-bottom: 0px;\">Client-d0f0773b-38c2-11ef-8b21-023ca6c22285</p>\n",
" <table style=\"width: 100%; text-align: left;\">\n",
"\n",
" <tr>\n",
" \n",
" <td style=\"text-align: left;\"><strong>Connection method:</strong> Cluster object</td>\n",
" <td style=\"text-align: left;\"><strong>Cluster type:</strong> distributed.LocalCluster</td>\n",
" \n",
" </tr>\n",
"\n",
" \n",
" <tr>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Dashboard: </strong> <a href=\"http://127.0.0.1:36497/status\" target=\"_blank\">http://127.0.0.1:36497/status</a>\n",
" </td>\n",
" <td style=\"text-align: left;\"></td>\n",
" </tr>\n",
" \n",
"\n",
" </table>\n",
"\n",
" \n",
"\n",
" \n",
" <details>\n",
" <summary style=\"margin-bottom: 20px;\"><h3 style=\"display: inline;\">Cluster Info</h3></summary>\n",
" <div class=\"jp-RenderedHTMLCommon jp-RenderedHTML jp-mod-trusted jp-OutputArea-output\">\n",
" <div style=\"width: 24px; height: 24px; background-color: #e1e1e1; border: 3px solid #9D9D9D; border-radius: 5px; position: absolute;\">\n",
" </div>\n",
" <div style=\"margin-left: 48px;\">\n",
" <h3 style=\"margin-bottom: 0px; margin-top: 0px;\">LocalCluster</h3>\n",
" <p style=\"color: #9D9D9D; margin-bottom: 0px;\">a498bb4b</p>\n",
" <table style=\"width: 100%; text-align: left;\">\n",
" <tr>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Dashboard:</strong> <a href=\"http://127.0.0.1:36497/status\" target=\"_blank\">http://127.0.0.1:36497/status</a>\n",
" </td>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Workers:</strong> 3\n",
" </td>\n",
" </tr>\n",
" <tr>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Total threads:</strong> 33\n",
" </td>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Total memory:</strong> 123.85 GiB\n",
" </td>\n",
" </tr>\n",
" \n",
" <tr>\n",
" <td style=\"text-align: left;\"><strong>Status:</strong> running</td>\n",
" <td style=\"text-align: left;\"><strong>Using processes:</strong> True</td>\n",
"</tr>\n",
"\n",
" \n",
" </table>\n",
"\n",
" <details>\n",
" <summary style=\"margin-bottom: 20px;\">\n",
" <h3 style=\"display: inline;\">Scheduler Info</h3>\n",
" </summary>\n",
"\n",
" <div style=\"\">\n",
" <div>\n",
" <div style=\"width: 24px; height: 24px; background-color: #FFF7E5; border: 3px solid #FF6132; border-radius: 5px; position: absolute;\"> </div>\n",
" <div style=\"margin-left: 48px;\">\n",
" <h3 style=\"margin-bottom: 0px;\">Scheduler</h3>\n",
" <p style=\"color: #9D9D9D; margin-bottom: 0px;\">Scheduler-60c3290e-c8f5-4459-b96d-7202052144fc</p>\n",
" <table style=\"width: 100%; text-align: left;\">\n",
" <tr>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Comm:</strong> tcp://127.0.0.1:36791\n",
" </td>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Workers:</strong> 3\n",
" </td>\n",
" </tr>\n",
" <tr>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Dashboard:</strong> <a href=\"http://127.0.0.1:36497/status\" target=\"_blank\">http://127.0.0.1:36497/status</a>\n",
" </td>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Total threads:</strong> 33\n",
" </td>\n",
" </tr>\n",
" <tr>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Started:</strong> Just now\n",
" </td>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Total memory:</strong> 123.85 GiB\n",
" </td>\n",
" </tr>\n",
" </table>\n",
" </div>\n",
" </div>\n",
"\n",
" <details style=\"margin-left: 48px;\">\n",
" <summary style=\"margin-bottom: 20px;\">\n",
" <h3 style=\"display: inline;\">Workers</h3>\n",
" </summary>\n",
"\n",
" \n",
" <div style=\"margin-bottom: 20px;\">\n",
" <div style=\"width: 24px; height: 24px; background-color: #DBF5FF; border: 3px solid #4CC9FF; border-radius: 5px; position: absolute;\"> </div>\n",
" <div style=\"margin-left: 48px;\">\n",
" <details>\n",
" <summary>\n",
" <h4 style=\"margin-bottom: 0px; display: inline;\">Worker: 0</h4>\n",
" </summary>\n",
" <table style=\"width: 100%; text-align: left;\">\n",
" <tr>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Comm: </strong> tcp://127.0.0.1:35759\n",
" </td>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Total threads: </strong> 11\n",
" </td>\n",
" </tr>\n",
" <tr>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Dashboard: </strong> <a href=\"http://127.0.0.1:45939/status\" target=\"_blank\">http://127.0.0.1:45939/status</a>\n",
" </td>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Memory: </strong> 41.28 GiB\n",
" </td>\n",
" </tr>\n",
" <tr>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Nanny: </strong> tcp://127.0.0.1:35515\n",
" </td>\n",
" <td style=\"text-align: left;\"></td>\n",
" </tr>\n",
" <tr>\n",
" <td colspan=\"2\" style=\"text-align: left;\">\n",
" <strong>Local directory: </strong> /tmp/dask-scratch-space/worker-k33soctb\n",
" </td>\n",
" </tr>\n",
"\n",
" \n",
"\n",
" \n",
"\n",
" </table>\n",
" </details>\n",
" </div>\n",
" </div>\n",
" \n",
" <div style=\"margin-bottom: 20px;\">\n",
" <div style=\"width: 24px; height: 24px; background-color: #DBF5FF; border: 3px solid #4CC9FF; border-radius: 5px; position: absolute;\"> </div>\n",
" <div style=\"margin-left: 48px;\">\n",
" <details>\n",
" <summary>\n",
" <h4 style=\"margin-bottom: 0px; display: inline;\">Worker: 1</h4>\n",
" </summary>\n",
" <table style=\"width: 100%; text-align: left;\">\n",
" <tr>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Comm: </strong> tcp://127.0.0.1:33917\n",
" </td>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Total threads: </strong> 11\n",
" </td>\n",
" </tr>\n",
" <tr>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Dashboard: </strong> <a href=\"http://127.0.0.1:40619/status\" target=\"_blank\">http://127.0.0.1:40619/status</a>\n",
" </td>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Memory: </strong> 41.28 GiB\n",
" </td>\n",
" </tr>\n",
" <tr>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Nanny: </strong> tcp://127.0.0.1:45185\n",
" </td>\n",
" <td style=\"text-align: left;\"></td>\n",
" </tr>\n",
" <tr>\n",
" <td colspan=\"2\" style=\"text-align: left;\">\n",
" <strong>Local directory: </strong> /tmp/dask-scratch-space/worker-rxeqpn9c\n",
" </td>\n",
" </tr>\n",
"\n",
" \n",
"\n",
" \n",
"\n",
" </table>\n",
" </details>\n",
" </div>\n",
" </div>\n",
" \n",
" <div style=\"margin-bottom: 20px;\">\n",
" <div style=\"width: 24px; height: 24px; background-color: #DBF5FF; border: 3px solid #4CC9FF; border-radius: 5px; position: absolute;\"> </div>\n",
" <div style=\"margin-left: 48px;\">\n",
" <details>\n",
" <summary>\n",
" <h4 style=\"margin-bottom: 0px; display: inline;\">Worker: 2</h4>\n",
" </summary>\n",
" <table style=\"width: 100%; text-align: left;\">\n",
" <tr>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Comm: </strong> tcp://127.0.0.1:38625\n",
" </td>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Total threads: </strong> 11\n",
" </td>\n",
" </tr>\n",
" <tr>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Dashboard: </strong> <a href=\"http://127.0.0.1:46603/status\" target=\"_blank\">http://127.0.0.1:46603/status</a>\n",
" </td>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Memory: </strong> 41.28 GiB\n",
" </td>\n",
" </tr>\n",
" <tr>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Nanny: </strong> tcp://127.0.0.1:41677\n",
" </td>\n",
" <td style=\"text-align: left;\"></td>\n",
" </tr>\n",
" <tr>\n",
" <td colspan=\"2\" style=\"text-align: left;\">\n",
" <strong>Local directory: </strong> /tmp/dask-scratch-space/worker-usajy86k\n",
" </td>\n",
" </tr>\n",
"\n",
" \n",
"\n",
" \n",
"\n",
" </table>\n",
" </details>\n",
" </div>\n",
" </div>\n",
" \n",
"\n",
" </details>\n",
"</div>\n",
"\n",
" </details>\n",
" </div>\n",
"</div>\n",
" </details>\n",
" \n",
"\n",
" </div>\n",
"</div>"
],
"text/plain": [
"<Client: 'tcp://127.0.0.1:36791' processes=3 threads=33, memory=123.85 GiB>"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"cluster = dd.LocalCluster(n_workers=3)\n",
"client = dd.Client(cluster)\n",
"client"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This code wraps the tiledb array in a dask array. I am creating a delayed function for each dask chunk, then reinterpreting that collection of delayed functions as a single dask array."
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"def to_listed_chunks(chunk_size: int, dim_size: int) -> list[int]:\n",
" n_full, rem = divmod(dim_size, chunk_size)\n",
" chunk_list = [chunk_size] * n_full\n",
" if rem:\n",
" chunk_list += [rem]\n",
" return chunk_list\n",
"\n",
"\n",
"def make_sparse_chunk(array: tiledb.Array, tile_slices: list[slice]) -> da.Array:\n",
" shape = [(s.stop - s.start) for s in tile_slices]\n",
" def _inner(array, tile_slices):\n",
" res = array[tile_slices]\n",
" offsets = [s.start for s in tile_slices]\n",
" res[\"soma_dim_0\"] -= offsets[0]\n",
" res[\"soma_dim_1\"] -= offsets[1]\n",
" return sparse.csr_matrix((res[\"soma_data\"], (res[\"soma_dim_0\"], res[\"soma_dim_1\"])), shape=shape)\n",
" # return sparse.csr_matrix((res[\"soma_data\"], (res[\"soma_dim_0\"] - offsets[0], res[\"soma_dim_1\"] - offsets[1])), shape=shape)\n",
" return da.from_delayed(delayed(_inner)(array, tile_slices), shape=shape, meta=sparse.csr_matrix((0, 0), dtype=tiledb_array.dtype))\n",
"\n",
"def tiledb_sparse_as_dask(tdb_array: tiledb.Array) -> da.Array:\n",
" schema = tdb_array.schema\n",
" # chunks = list(schema.domain.dim(i).tile for i in range(schema.ndim))\n",
" chunks = [SPARSE_CHUNK_SIZE, tdb_array.shape[1]]\n",
" # Simplifying to have complete slices across rows\n",
" slices: list[list[slice]] = slices_from_chunks((to_listed_chunks(chunks[0], tdb_array.shape[0]), [tdb_array.shape[1]]))\n",
" return da.concatenate(\n",
" [make_sparse_chunk(tdb_array, s) for s in slices], axis=0\n",
" )"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"SOMA_URI= \"s3://cellxgene-census-public-us-west-2/cell-census/2023-12-15/soma\"\n",
"SPECIES = \"mus_musculus\"\n",
"\n",
"# need to specify anonymous access\n",
"ctx = {\n",
" \"vfs.s3.no_sign_request\": \"true\",\n",
" \"vfs.s3.region\": \"us-west-2\"\n",
"}\n",
"\n",
"tiledb_array = tiledb.open(\n",
" f\"{SOMA_URI}/census_data/{SPECIES}/ms/RNA/X/raw/\",\n",
" ctx=tiledb.Ctx(ctx),\n",
")\n",
"\n",
"X = tiledb_sparse_as_dask(tiledb_array)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"For the sake of time, I am going to slim this down to the first million cells"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"X = X[:1_000_000]"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"# Retrieving obs and var as pandas dataframes\n",
"census = cellxgene_census.open_soma(uri=SOMA_URI, context=SOMATileDBContext(tiledb_config=ctx))\n",
"obs = cellxgene_census.get_obs(census, SPECIES, coords=slice(X.shape[0] - 1))\n",
"var = cellxgene_census.get_var(census, SPECIES)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/ubuntu/github/anndata/src/anndata/_core/aligned_df.py:67: ImplicitModificationWarning: Transforming to str index.\n",
" warnings.warn(\"Transforming to str index.\", ImplicitModificationWarning)\n",
"/home/ubuntu/github/anndata/src/anndata/_core/aligned_df.py:67: ImplicitModificationWarning: Transforming to str index.\n",
" warnings.warn(\"Transforming to str index.\", ImplicitModificationWarning)\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 553 ms, sys: 102 ms, total: 655 ms\n",
"Wall time: 646 ms\n"
]
},
{
"data": {
"text/plain": [
"AnnData object with n_obs × n_vars = 1000000 × 52417\n",
" obs: 'soma_joinid', 'dataset_id', 'assay', 'assay_ontology_term_id', 'cell_type', 'cell_type_ontology_term_id', 'development_stage', 'development_stage_ontology_term_id', 'disease', 'disease_ontology_term_id', 'donor_id', 'is_primary_data', 'self_reported_ethnicity', 'self_reported_ethnicity_ontology_term_id', 'sex', 'sex_ontology_term_id', 'suspension_type', 'tissue', 'tissue_ontology_term_id', 'tissue_general', 'tissue_general_ontology_term_id', 'raw_sum', 'nnz', 'raw_mean_nnz', 'raw_variance_nnz', 'n_measured_vars'\n",
" var: 'soma_joinid', 'feature_id', 'feature_name', 'feature_length', 'nnz', 'n_measured_obs'"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%time\n",
"adata = ad.AnnData(X=X, obs=obs, var=var)\n",
"adata"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"# Writing out as zarr to compare compute times\n",
"if not Path(\"adata.zarr\").is_dir():\n",
" adata.write_zarr(\"adata.zarr\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Running preprocessing with zarr\n",
"\n",
"I am running this one first because it is faster and runs more reliably"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"# Using the helper function from https://github.com/scverse/anndata/pull/1469 at commit 717b997d0e33ddae066f72cc6495cdb64b88d175\n",
"from anndata._io.specs import read_elem_as_dask\n",
"from anndata.experimental import read_elem\n",
"import zarr"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"z = zarr.open(\"adata.zarr\")"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 750 ms, sys: 320 ms, total: 1.07 s\n",
"Wall time: 1.11 s\n"
]
},
{
"data": {
"text/plain": [
"AnnData object with n_obs × n_vars = 1000000 × 52417\n",
" obs: 'soma_joinid', 'dataset_id', 'assay', 'assay_ontology_term_id', 'cell_type', 'cell_type_ontology_term_id', 'development_stage', 'development_stage_ontology_term_id', 'disease', 'disease_ontology_term_id', 'donor_id', 'is_primary_data', 'self_reported_ethnicity', 'self_reported_ethnicity_ontology_term_id', 'sex', 'sex_ontology_term_id', 'suspension_type', 'tissue', 'tissue_ontology_term_id', 'tissue_general', 'tissue_general_ontology_term_id', 'raw_sum', 'nnz', 'raw_mean_nnz', 'raw_variance_nnz', 'n_measured_vars'\n",
" var: 'soma_joinid', 'feature_id', 'feature_name', 'feature_length', 'nnz', 'n_measured_obs'"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%time\n",
"adata_zarr = ad.AnnData(\n",
" X=read_elem_as_dask(z[\"X\"], dataset_kwargs={\"chunks\": (SPARSE_CHUNK_SIZE, z[\"X\"].attrs[\"shape\"][1])}),\n",
" obs=read_elem(z[\"obs\"]),\n",
" var=read_elem(z[\"var\"]),\n",
")\n",
"adata_zarr"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 15.5 ms, sys: 6.61 ms, total: 22.1 ms\n",
"Wall time: 20.2 ms\n"
]
}
],
"source": [
"%%time\n",
"adata_zarr.layers[\"counts\"] = adata_zarr.X.copy()\n",
"sc.pp.normalize_total(adata_zarr, target_sum=10_000)\n",
"sc.pp.log1p(adata_zarr)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 13.4 s, sys: 945 ms, total: 14.4 s\n",
"Wall time: 36.9 s\n"
]
}
],
"source": [
"%%time\n",
"sc.pp.highly_variable_genes(adata_zarr)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2024-07-02 22:34:10,867 - distributed.worker.memory - WARNING - Worker is at 80% memory usage. Pausing worker. Process memory: 33.03 GiB -- Worker memory limit: 41.28 GiB\n",
"2024-07-02 22:34:30,372 - distributed.worker.memory - WARNING - Worker is at 61% memory usage. Resuming worker. Process memory: 25.21 GiB -- Worker memory limit: 41.28 GiB\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 55.4 s, sys: 8 s, total: 1min 3s\n",
"Wall time: 2min 35s\n"
]
}
],
"source": [
"%%time\n",
"adata_zarr.layers[\"dense\"] = adata_zarr.X.rechunk((DENSE_CHUNK_SIZE, -1)).map_blocks(\n",
" lambda x: x.toarray(), dtype=adata_zarr.X.dtype, meta=np.array([])\n",
")\n",
"sc.pp.pca(adata_zarr, layer=\"dense\")"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 42.5 s, sys: 6.15 s, total: 48.7 s\n",
"Wall time: 1min 44s\n"
]
}
],
"source": [
"%%time\n",
"# dask-ml immediatley computes the variable embeddings, but not the observation ones\n",
"adata_zarr.obsm[\"X_pca\"] = adata_zarr.obsm[\"X_pca\"].compute()"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "",
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 6.92 s, sys: 786 ms, total: 7.71 s\n",
"Wall time: 7.1 s\n"
]
}
],
"source": [
"%%time\n",
"sc.pl.pca(adata_zarr)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Running with tiledb"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"adata.layers[\"counts\"] = adata.X.copy()\n",
"sc.pp.normalize_total(adata, target_sum=10_000)\n",
"sc.pp.log1p(adata)"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/ubuntu/miniforge3/envs/cellxgene-census-dev/lib/python3.11/site-packages/tiledb/cloud/config.py:96: UserWarning: You must first login before you can run commands. Please run tiledb.cloud.login.\n",
" warnings.warn(\n",
"/home/ubuntu/miniforge3/envs/cellxgene-census-dev/lib/python3.11/site-packages/tiledb/cloud/config.py:96: UserWarning: You must first login before you can run commands. Please run tiledb.cloud.login.\n",
" warnings.warn(\n",
"/home/ubuntu/miniforge3/envs/cellxgene-census-dev/lib/python3.11/site-packages/tiledb/cloud/config.py:96: UserWarning: You must first login before you can run commands. Please run tiledb.cloud.login.\n",
" warnings.warn(\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 3min 10s, sys: 5.47 s, total: 3min 15s\n",
"Wall time: 3min 48s\n"
]
}
],
"source": [
"%%time\n",
"sc.pp.highly_variable_genes(adata)"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2024-07-02 22:42:19,087 - distributed.worker.memory - WARNING - Unmanaged memory use is high. This may indicate a memory leak or the memory may not be released to the OS; see https://distributed.dask.org/en/latest/worker-memory.html#memory-not-released-back-to-the-os for more information. -- Unmanaged memory: 29.90 GiB -- Worker memory limit: 41.28 GiB\n",
"2024-07-02 22:43:23,469 - distributed.worker.memory - WARNING - Unmanaged memory use is high. This may indicate a memory leak or the memory may not be released to the OS; see https://distributed.dask.org/en/latest/worker-memory.html#memory-not-released-back-to-the-os for more information. -- Unmanaged memory: 29.69 GiB -- Worker memory limit: 41.28 GiB\n",
"2024-07-02 22:43:31,000 - distributed.worker.memory - WARNING - Unmanaged memory use is high. This may indicate a memory leak or the memory may not be released to the OS; see https://distributed.dask.org/en/latest/worker-memory.html#memory-not-released-back-to-the-os for more information. -- Unmanaged memory: 25.34 GiB -- Worker memory limit: 41.28 GiB\n",
"/home/ubuntu/miniforge3/envs/cellxgene-census-dev/lib/python3.11/site-packages/tiledb/cloud/config.py:96: UserWarning: You must first login before you can run commands. Please run tiledb.cloud.login.\n",
" warnings.warn(\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"2024-07-02 22:49:08,601 - distributed.nanny.memory - WARNING - Worker tcp://127.0.0.1:33917 (pid=1903480) exceeded 95% memory budget. Restarting...\n",
"2024-07-02 22:49:13,548 - distributed.scheduler - WARNING - Removing worker 'tcp://127.0.0.1:33917' caused the cluster to lose already computed task(s), which will be recomputed elsewhere: {('getitem-4c3865ad76f5b918ea4bff00e9cc559f-q1', 14, 0), ('sub-75443a7e38afa768e3212e0543fc2179', 72, 0), ('sub-75443a7e38afa768e3212e0543fc2179', 61, 0), ('getitem-4c3865ad76f5b918ea4bff00e9cc559f-q1', 146, 0), ('getitem-4c3865ad76f5b918ea4bff00e9cc559f-q1', 135, 0), ('getitem-4c3865ad76f5b918ea4bff00e9cc559f-q1', 170, 0), ('getitem-4c3865ad76f5b918ea4bff00e9cc559f-r1', 89, 0), ('getitem-4c3865ad76f5b918ea4bff00e9cc559f-r1', 128, 0), ('getitem-4c3865ad76f5b918ea4bff00e9cc559f-r1', 115, 0), ('getitem-4c3865ad76f5b918ea4bff00e9cc559f-q1', 38, 0), ('sub-75443a7e38afa768e3212e0543fc2179', 153, 0), ('sub-75443a7e38afa768e3212e0543fc2179', 164, 0), ('sub-75443a7e38afa768e3212e0543fc2179', 89, 0), ('sub-75443a7e38afa768e3212e0543fc2179', 179, 0), ('sub-75443a7e38afa768e3212e0543fc2179', 78, 0), ('getitem-4c3865ad76f5b918ea4bff00e9cc559f-q1', 165, 0), ('getitem-4c3865ad76f5b918ea4bff00e9cc559f-q1', 176, 0), ('getitem-4c3865ad76f5b918ea4bff00e9cc559f-q1', 4, 0), ('sub-75443a7e38afa768e3212e0543fc2179', 62, 0), ('sub-75443a7e38afa768e3212e0543fc2179', 196, 0), ('getitem-4c3865ad76f5b918ea4bff00e9cc559f-r1', 147, 0), ('getitem-4c3865ad76f5b918ea4bff00e9cc559f-r1', 134, 0), ('getitem-4c3865ad76f5b918ea4bff00e9cc559f-r1', 107, 0), ('standard_normal-ac93b4f2e5193fa6ccb32d644c625de2', 0, 0), ('getitem-4c3865ad76f5b918ea4bff00e9cc559f-q1', 72, 0), ('getitem-4c3865ad76f5b918ea4bff00e9cc559f-q1', 127, 0), ('getitem-4c3865ad76f5b918ea4bff00e9cc559f-q1', 193, 0), ('sub-75443a7e38afa768e3212e0543fc2179', 13, 0), ('sub-75443a7e38afa768e3212e0543fc2179', 79, 0), ('sub-75443a7e38afa768e3212e0543fc2179', 147, 0), ('getitem-4c3865ad76f5b918ea4bff00e9cc559f-q1', 153, 0), ('getitem-4c3865ad76f5b918ea4bff00e9cc559f-r1', 85, 0), ('getitem-4c3865ad76f5b918ea4bff00e9cc559f-q1', 89, 0), 
('getitem-4c3865ad76f5b918ea4bff00e9cc559f-r1', 164, 0), ('sub-75443a7e38afa768e3212e0543fc2179', 120, 0), ('sub-75443a7e38afa768e3212e0543fc2179', 85, 0), ('getitem-4c3865ad76f5b918ea4bff00e9cc559f-r1', 126, 0), ('getitem-4c3865ad76f5b918ea4bff00e9cc559f-r1', 113, 0), ('getitem-4c3865ad76f5b918ea4bff00e9cc559f-q1', 62, 0), ('getitem-4c3865ad76f5b918ea4bff00e9cc559f-q1', 128, 0), ('getitem-4c3865ad76f5b918ea4bff00e9cc559f-q1', 196, 0), ('getitem-4c3865ad76f5b918ea4bff00e9cc559f-q1', 106, 0), ('getitem-4c3865ad76f5b918ea4bff00e9cc559f-q1', 11, 0), ('sub-75443a7e38afa768e3212e0543fc2179', 192, 0), ('getitem-4c3865ad76f5b918ea4bff00e9cc559f-r1', 165, 0), ('getitem-4c3865ad76f5b918ea4bff00e9cc559f-q1', 119, 0), ('sub-75443a7e38afa768e3212e0543fc2179', 190, 0), ('getitem-4c3865ad76f5b918ea4bff00e9cc559f-q1', 185, 0), ('getitem-4c3865ad76f5b918ea4bff00e9cc559f-q1', 13, 0), ('getitem-4c3865ad76f5b918ea4bff00e9cc559f-q1', 79, 0), ('sub-75443a7e38afa768e3212e0543fc2179', 60, 0), ('sub-75443a7e38afa768e3212e0543fc2179', 128, 0), ('sub-75443a7e38afa768e3212e0543fc2179', 126, 0), ('sub-75443a7e38afa768e3212e0543fc2179', 115, 0), ('getitem-4c3865ad76f5b918ea4bff00e9cc559f-q1', 134, 0), ('getitem-4c3865ad76f5b918ea4bff00e9cc559f-q1', 68, 0), ('sub-75443a7e38afa768e3212e0543fc2179', 9, 0), ('sub-75443a7e38afa768e3212e0543fc2179', 64, 0), ('sub-75443a7e38afa768e3212e0543fc2179', 75, 0), ('getitem-4c3865ad76f5b918ea4bff00e9cc559f-r1', 127, 0), ('getitem-4c3865ad76f5b918ea4bff00e9cc559f-q1', 147, 0), ('sub-75443a7e38afa768e3212e0543fc2179', 11, 0), ('getitem-4c3865ad76f5b918ea4bff00e9cc559f-q1', 85, 0), ('sub-75443a7e38afa768e3212e0543fc2179', 88, 0), ('sub-75443a7e38afa768e3212e0543fc2179', 77, 0), ('getitem-4c3865ad76f5b918ea4bff00e9cc559f-q1', 107, 0), ('getitem-4c3865ad76f5b918ea4bff00e9cc559f-r1', 105, 0), ('getitem-4c3865ad76f5b918ea4bff00e9cc559f-q1', 111, 0), ('getitem-4c3865ad76f5b918ea4bff00e9cc559f-r1', 120, 0), ('getitem-4c3865ad76f5b918ea4bff00e9cc559f-q1', 43, 0), 
('getitem-4c3865ad76f5b918ea4bff00e9cc559f-q1', 164, 0), ('sub-75443a7e38afa768e3212e0543fc2179', 129, 0), ('getitem-4c3865ad76f5b918ea4bff00e9cc559f-q1', 113, 0), ('getitem-4c3865ad76f5b918ea4bff00e9cc559f-q1', 179, 0), ('getitem-4c3865ad76f5b918ea4bff00e9cc559f-q1', 168, 0), ('getitem-4c3865ad76f5b918ea4bff00e9cc559f-q1', 7, 0), ('getitem-4c3865ad76f5b918ea4bff00e9cc559f-r1', 106, 0), ('getitem-4c3865ad76f5b918ea4bff00e9cc559f-q1', 60, 0), ('getitem-4c3865ad76f5b918ea4bff00e9cc559f-q1', 126, 0), ('getitem-4c3865ad76f5b918ea4bff00e9cc559f-q1', 115, 0), ('getitem-4c3865ad76f5b918ea4bff00e9cc559f-q1', 192, 0), ('getitem-4c3865ad76f5b918ea4bff00e9cc559f-q1', 9, 0), ('getitem-4c3865ad76f5b918ea4bff00e9cc559f-q1', 75, 0), ('getitem-4c3865ad76f5b918ea4bff00e9cc559f-q1', 64, 0), ('sub-75443a7e38afa768e3212e0543fc2179', 146, 0), ('sub-75443a7e38afa768e3212e0543fc2179', 135, 0), ('getitem-4c3865ad76f5b918ea4bff00e9cc559f-r1', 84, 0), ('getitem-4c3865ad76f5b918ea4bff00e9cc559f-r1', 88, 0), ('getitem-4c3865ad76f5b918ea4bff00e9cc559f-q1', 88, 0), ('getitem-4c3865ad76f5b918ea4bff00e9cc559f-q1', 77, 0), ('getitem-4c3865ad76f5b918ea4bff00e9cc559f-q1', 37, 0), ('sub-75443a7e38afa768e3212e0543fc2179', 7, 0), ('sub-75443a7e38afa768e3212e0543fc2179', 84, 0), ('sub-75443a7e38afa768e3212e0543fc2179', 73, 0), ('getitem-4c3865ad76f5b918ea4bff00e9cc559f-r1', 112, 0), ('getitem-4c3865ad76f5b918ea4bff00e9cc559f-q1', 105, 0), ('sub-75443a7e38afa768e3212e0543fc2179', 165, 0), ('getitem-4c3865ad76f5b918ea4bff00e9cc559f-q1', 65, 0), ('getitem-4c3865ad76f5b918ea4bff00e9cc559f-q1', 120, 0), ('getitem-4c3865ad76f5b918ea4bff00e9cc559f-q1', 199, 0), ('getitem-4c3865ad76f5b918ea4bff00e9cc559f-r1', 153, 0), ('getitem-4c3865ad76f5b918ea4bff00e9cc559f-r1', 129, 0), ('sub-75443a7e38afa768e3212e0543fc2179', 178, 0), ('getitem-4c3865ad76f5b918ea4bff00e9cc559f-q1', 12, 0), ('sub-75443a7e38afa768e3212e0543fc2179', 127, 0), ('sub-75443a7e38afa768e3212e0543fc2179', 193, 0), 
('sub-75443a7e38afa768e3212e0543fc2179', 8, 0), ('sub-75443a7e38afa768e3212e0543fc2179', 76, 0), ('sub-75443a7e38afa768e3212e0543fc2179', 74, 0), ('sub-75443a7e38afa768e3212e0543fc2179', 63, 0), ('sub-75443a7e38afa768e3212e0543fc2179', 12, 0), ('getitem-4c3865ad76f5b918ea4bff00e9cc559f-r1', 146, 0), ('getitem-4c3865ad76f5b918ea4bff00e9cc559f-r1', 135, 0), ('getitem-4c3865ad76f5b918ea4bff00e9cc559f-q1', 190, 0), ('sub-75443a7e38afa768e3212e0543fc2179', 10, 0), ('getitem-4c3865ad76f5b918ea4bff00e9cc559f-q1', 84, 0), ('sub-75443a7e38afa768e3212e0543fc2179', 65, 0), ('getitem-4c3865ad76f5b918ea4bff00e9cc559f-q1', 73, 0), ('sub-75443a7e38afa768e3212e0543fc2179', 199, 0), ('sub-75443a7e38afa768e3212e0543fc2179', 14, 0), ('sub-75443a7e38afa768e3212e0543fc2179', 170, 0), ('getitem-4c3865ad76f5b918ea4bff00e9cc559f-r1', 119, 0), ('getitem-4c3865ad76f5b918ea4bff00e9cc559f-q1', 112, 0), ('getitem-4c3865ad76f5b918ea4bff00e9cc559f-q1', 178, 0), ('getitem-4c3865ad76f5b918ea4bff00e9cc559f-q1', 61, 0), ('sub-75443a7e38afa768e3212e0543fc2179', 185, 0), ('getitem-4c3865ad76f5b918ea4bff00e9cc559f-q1', 8, 0), ('getitem-4c3865ad76f5b918ea4bff00e9cc559f-q1', 74, 0), ('getitem-4c3865ad76f5b918ea4bff00e9cc559f-q1', 63, 0), ('sub-75443a7e38afa768e3212e0543fc2179', 134, 0), ('getitem-4c3865ad76f5b918ea4bff00e9cc559f-q1', 129, 0), ('sub-75443a7e38afa768e3212e0543fc2179', 4, 0), ('getitem-4c3865ad76f5b918ea4bff00e9cc559f-q1', 78, 0), ('getitem-4c3865ad76f5b918ea4bff00e9cc559f-r1', 111, 0), ('getitem-4c3865ad76f5b918ea4bff00e9cc559f-q1', 10, 0), ('getitem-4c3865ad76f5b918ea4bff00e9cc559f-q1', 76, 0)} (stimulus_id='handle-worker-cleanup-1719960553.5466065')\n",
"2024-07-02 22:49:13,980 - distributed.nanny - WARNING - Restarting worker\n",
"Task exception was never retrieved\n",
"future: <Task finished name='Task-403478' coro=<Client._gather.<locals>.wait() done, defined at /home/ubuntu/miniforge3/envs/cellxgene-census-dev/lib/python3.11/site-packages/distributed/client.py:2199> exception=AllExit()>\n",
"Traceback (most recent call last):\n",
" File \"/home/ubuntu/miniforge3/envs/cellxgene-census-dev/lib/python3.11/site-packages/distributed/client.py\", line 2208, in wait\n",
" raise AllExit()\n",
"distributed.client.AllExit\n",
"Task exception was never retrieved\n",
"future: <Task finished name='Task-403475' coro=<Client._gather.<locals>.wait() done, defined at /home/ubuntu/miniforge3/envs/cellxgene-census-dev/lib/python3.11/site-packages/distributed/client.py:2199> exception=AllExit()>\n",
"Traceback (most recent call last):\n",
" File \"/home/ubuntu/miniforge3/envs/cellxgene-census-dev/lib/python3.11/site-packages/distributed/client.py\", line 2208, in wait\n",
" raise AllExit()\n",
"distributed.client.AllExit\n",
"Task exception was never retrieved\n",
"future: <Task finished name='Task-403476' coro=<Client._gather.<locals>.wait() done, defined at /home/ubuntu/miniforge3/envs/cellxgene-census-dev/lib/python3.11/site-packages/distributed/client.py:2199> exception=AllExit()>\n",
"Traceback (most recent call last):\n",
" File \"/home/ubuntu/miniforge3/envs/cellxgene-census-dev/lib/python3.11/site-packages/distributed/client.py\", line 2208, in wait\n",
" raise AllExit()\n",
"distributed.client.AllExit\n"
]
},
{
"ename": "KeyboardInterrupt",
"evalue": "",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
"File \u001b[0;32m<timed exec>:4\u001b[0m\n",
"File \u001b[0;32m~/miniforge3/envs/cellxgene-census-dev/lib/python3.11/site-packages/scanpy/preprocessing/_pca.py:280\u001b[0m, in \u001b[0;36mpca\u001b[0;34m(***failed resolving arguments***)\u001b[0m\n\u001b[1;32m 276\u001b[0m X \u001b[38;5;241m=\u001b[39m X\u001b[38;5;241m.\u001b[39mtoarray()\n\u001b[1;32m 277\u001b[0m pca_ \u001b[38;5;241m=\u001b[39m PCA(\n\u001b[1;32m 278\u001b[0m n_components\u001b[38;5;241m=\u001b[39mn_comps, svd_solver\u001b[38;5;241m=\u001b[39msvd_solver, random_state\u001b[38;5;241m=\u001b[39mrandom_state\n\u001b[1;32m 279\u001b[0m )\n\u001b[0;32m--> 280\u001b[0m X_pca \u001b[38;5;241m=\u001b[39m \u001b[43mpca_\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfit_transform\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 281\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m issparse(X) \u001b[38;5;129;01mand\u001b[39;00m zero_center:\n\u001b[1;32m 282\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01msklearn\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdecomposition\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m PCA\n",
"File \u001b[0;32m~/miniforge3/envs/cellxgene-census-dev/lib/python3.11/site-packages/sklearn/utils/_set_output.py:313\u001b[0m, in \u001b[0;36m_wrap_method_output.<locals>.wrapped\u001b[0;34m(self, X, *args, **kwargs)\u001b[0m\n\u001b[1;32m 311\u001b[0m \u001b[38;5;129m@wraps\u001b[39m(f)\n\u001b[1;32m 312\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mwrapped\u001b[39m(\u001b[38;5;28mself\u001b[39m, X, \u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[0;32m--> 313\u001b[0m data_to_wrap \u001b[38;5;241m=\u001b[39m \u001b[43mf\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 314\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(data_to_wrap, \u001b[38;5;28mtuple\u001b[39m):\n\u001b[1;32m 315\u001b[0m \u001b[38;5;66;03m# only wrap the first output for cross decomposition\u001b[39;00m\n\u001b[1;32m 316\u001b[0m return_tuple \u001b[38;5;241m=\u001b[39m (\n\u001b[1;32m 317\u001b[0m _wrap_data_with_container(method, data_to_wrap[\u001b[38;5;241m0\u001b[39m], X, \u001b[38;5;28mself\u001b[39m),\n\u001b[1;32m 318\u001b[0m \u001b[38;5;241m*\u001b[39mdata_to_wrap[\u001b[38;5;241m1\u001b[39m:],\n\u001b[1;32m 319\u001b[0m )\n",
"File \u001b[0;32m~/miniforge3/envs/cellxgene-census-dev/lib/python3.11/site-packages/dask_ml/decomposition/pca.py:402\u001b[0m, in \u001b[0;36mPCA.fit_transform\u001b[0;34m(self, X, y)\u001b[0m\n\u001b[1;32m 400\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m dask\u001b[38;5;241m.\u001b[39mis_dask_collection(X):\n\u001b[1;32m 401\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mTypeError\u001b[39;00m(_TYPE_MSG\u001b[38;5;241m.\u001b[39mformat(\u001b[38;5;28mtype\u001b[39m(X)))\n\u001b[0;32m--> 402\u001b[0m U, S, V \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_fit\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 403\u001b[0m U \u001b[38;5;241m=\u001b[39m U[:, : \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mn_components_]\n\u001b[1;32m 405\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mwhiten:\n\u001b[1;32m 406\u001b[0m \u001b[38;5;66;03m# X_new = X * V / S * sqrt(n_samples) = U * sqrt(n_samples)\u001b[39;00m\n",
"File \u001b[0;32m~/miniforge3/envs/cellxgene-census-dev/lib/python3.11/site-packages/dask_ml/decomposition/pca.py:298\u001b[0m, in \u001b[0;36mPCA._fit\u001b[0;34m(self, X)\u001b[0m\n\u001b[1;32m 289\u001b[0m total_variance \u001b[38;5;241m=\u001b[39m np\u001b[38;5;241m.\u001b[39mnan\n\u001b[1;32m 291\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 292\u001b[0m (\n\u001b[1;32m 293\u001b[0m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mn_samples_, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mn_features_in_),\n\u001b[1;32m 294\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mn_components_,\n\u001b[1;32m 295\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcomponents_,\n\u001b[1;32m 296\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msingular_values_,\n\u001b[1;32m 297\u001b[0m total_variance,\n\u001b[0;32m--> 298\u001b[0m ) \u001b[38;5;241m=\u001b[39m \u001b[43mcompute\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 299\u001b[0m \u001b[43m \u001b[49m\u001b[43mdelayed\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43;01mlambda\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mx\u001b[49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mx\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mshape\u001b[49m\u001b[43m)\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 300\u001b[0m \u001b[43m \u001b[49m\u001b[43mn_components\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 301\u001b[0m \u001b[43m \u001b[49m\u001b[43mcomponents\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 302\u001b[0m \u001b[43m \u001b[49m\u001b[43msingular_values\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 303\u001b[0m \u001b[43m \u001b[49m\u001b[43mtotal_variance\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 304\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 305\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m 
\u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m 306\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m np\u001b[38;5;241m.\u001b[39misnan(X\u001b[38;5;241m.\u001b[39mshape)\u001b[38;5;241m.\u001b[39many():\n",
"File \u001b[0;32m~/miniforge3/envs/cellxgene-census-dev/lib/python3.11/site-packages/dask/base.py:662\u001b[0m, in \u001b[0;36mcompute\u001b[0;34m(traverse, optimize_graph, scheduler, get, *args, **kwargs)\u001b[0m\n\u001b[1;32m 659\u001b[0m postcomputes\u001b[38;5;241m.\u001b[39mappend(x\u001b[38;5;241m.\u001b[39m__dask_postcompute__())\n\u001b[1;32m 661\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m shorten_traceback():\n\u001b[0;32m--> 662\u001b[0m results \u001b[38;5;241m=\u001b[39m \u001b[43mschedule\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdsk\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkeys\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 664\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m repack([f(r, \u001b[38;5;241m*\u001b[39ma) \u001b[38;5;28;01mfor\u001b[39;00m r, (f, a) \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mzip\u001b[39m(results, postcomputes)])\n",
"File \u001b[0;32m~/miniforge3/envs/cellxgene-census-dev/lib/python3.11/threading.py:629\u001b[0m, in \u001b[0;36mEvent.wait\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 627\u001b[0m signaled \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_flag\n\u001b[1;32m 628\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m signaled:\n\u001b[0;32m--> 629\u001b[0m signaled \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_cond\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mwait\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtimeout\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 630\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m signaled\n",
"File \u001b[0;32m~/miniforge3/envs/cellxgene-census-dev/lib/python3.11/threading.py:331\u001b[0m, in \u001b[0;36mCondition.wait\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 329\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 330\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m timeout \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m0\u001b[39m:\n\u001b[0;32m--> 331\u001b[0m gotit \u001b[38;5;241m=\u001b[39m \u001b[43mwaiter\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43macquire\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtimeout\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 332\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 333\u001b[0m gotit \u001b[38;5;241m=\u001b[39m waiter\u001b[38;5;241m.\u001b[39macquire(\u001b[38;5;28;01mFalse\u001b[39;00m)\n",
"\u001b[0;31mKeyboardInterrupt\u001b[0m: "
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"2024-07-02 22:49:32,100 - distributed.worker.memory - WARNING - gc.collect() took 1.022s. This is usually a sign that some tasks handle too many Python objects at the same time. Rechunking the work into smaller tasks might help.\n",
"2024-07-02 22:49:32,101 - distributed.worker.memory - WARNING - Worker is at 82% memory usage. Pausing worker. Process memory: 33.97 GiB -- Worker memory limit: 41.28 GiB\n",
"2024-07-02 22:49:41,430 - distributed.worker.memory - WARNING - Unmanaged memory use is high. This may indicate a memory leak or the memory may not be released to the OS; see https://distributed.dask.org/en/latest/worker-memory.html#memory-not-released-back-to-the-os for more information. -- Unmanaged memory: 33.97 GiB -- Worker memory limit: 41.28 GiB\n",
"2024-07-02 22:49:43,591 - distributed.worker.memory - WARNING - Worker is at 1% memory usage. Resuming worker. Process memory: 757.55 MiB -- Worker memory limit: 41.28 GiB\n"
]
}
],
"source": [
"%%time\n",
"# Rechunk X to DENSE_CHUNK_SIZE rows x all columns, call .toarray() on every\n",
"# block, store the result as the \"dense\" layer, then run PCA on that layer.\n",
"adata.layers[\"dense\"] = da.map_blocks(\n",
"    lambda block: block.toarray(),\n",
"    adata.X.rechunk((DENSE_CHUNK_SIZE, -1)),\n",
"    dtype=adata.X.dtype,\n",
"    meta=np.array([]),\n",
")\n",
"sc.pp.pca(adata, layer=\"dense\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "cellxgene-census-dev",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.9"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment