Intron7/multi_gpustart.ipynb

## multi_gpustart.ipynb
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "37588552-b9d5-4113-9f53-ecc63d3da815",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import scanpy as sc\n",
    "import anndata\n",
    "\n",
    "import dask\n",
    "import time\n",
    "\n",
    "import os, wget\n",
    "\n",
    "from dask_cuda import initialize, LocalCUDACluster\n",
    "from dask.distributed import Client, default_client\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "2f159c97-e435-40c5-ac7b-aa37a6812ced",
   "metadata": {},
   "outputs": [],
   "source": [
    "import rmm\n",
    "import cupy as cp\n",
    "\n",
    "from rmm.allocators.cupy import rmm_cupy_allocator\n",
    "\n",
    "def set_mem():\n",
    "    \"\"\"Switch the calling process to RMM managed memory for CuPy allocations.\n",
    "\n",
    "    Re-initializes RMM with ``managed_memory=True`` and then registers\n",
    "    ``rmm_cupy_allocator`` as CuPy's allocator. Takes no arguments and\n",
    "    returns ``None``. The notebook runs it once in the client process and\n",
    "    once on every Dask worker.\n",
    "    \"\"\"\n",
    "    # Reconfigure RMM first, then hand its allocator to CuPy.\n",
    "    rmm.reinitialize(managed_memory=True)\n",
    "    cp.cuda.set_allocator(rmm_cupy_allocator)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "49a27e3c-5473-42eb-a0e3-73a22a1a9e16",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Comma-separated GPU ids passed to LocalCUDACluster as CUDA_VISIBLE_DEVICES.\n",
    "preprocessing_gpus = \"0, 1, 2, 3, 4, 5, 6, 7\"\n",
    "# Single-GPU alternative (uncomment to use only device 0):\n",
    "# preprocessing_gpus = \"0\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "41eb36fe-1a49-4f16-b8a5-e01cdf39a7c2",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 934 ms, sys: 875 ms, total: 1.81 s\n",
      "Wall time: 33 s\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "    <div style=\"width: 24px; height: 24px; background-color: #e1e1e1; border: 3px solid #9D9D9D; border-radius: 5px; position: absolute;\"> </div>\n",
       "    <div style=\"margin-left: 48px;\">\n",
       "        <h3 style=\"margin-bottom: 0px;\">Client</h3>\n",
       "        <p style=\"color: #9D9D9D; margin-bottom: 0px;\">Client-0a46410e-d004-11ee-8919-043f72ce7f82</p>\n",
       "        <table style=\"width: 100%; text-align: left;\">\n",
       "\n",
       "        <tr>\n",
       "        \n",
       "            <td style=\"text-align: left;\"><strong>Connection method:</strong> Cluster object</td>\n",
       "            <td style=\"text-align: left;\"><strong>Cluster type:</strong> dask_cuda.LocalCUDACluster</td>\n",
       "        \n",
       "        </tr>\n",
       "\n",
       "        \n",
       "            <tr>\n",
       "                <td style=\"text-align: left;\">\n",
       "                    <strong>Dashboard: </strong> <a href=\"http://127.0.0.1:8787/status\" target=\"_blank\">http://127.0.0.1:8787/status</a>\n",
       "                </td>\n",
       "                <td style=\"text-align: left;\"></td>\n",
       "            </tr>\n",
       "        \n",
       "\n",
       "        </table>\n",
       "\n",
       "        \n",
       "            <button style=\"margin-bottom: 12px;\" data-commandlinker-command=\"dask:populate-and-launch-layout\" data-commandlinker-args='{\"url\": \"http://127.0.0.1:8787/status\" }'>\n",
       "                Launch dashboard in JupyterLab\n",
       "            </button>\n",
       "        \n",
       "\n",
       "        \n",
       "            <details>\n",
       "            <summary style=\"margin-bottom: 20px;\"><h3 style=\"display: inline;\">Cluster Info</h3></summary>\n",
       "            <div class=\"jp-RenderedHTMLCommon jp-RenderedHTML jp-mod-trusted jp-OutputArea-output\">\n",
       "    <div style=\"width: 24px; height: 24px; background-color: #e1e1e1; border: 3px solid #9D9D9D; border-radius: 5px; position: absolute;\">\n",
       "    </div>\n",
       "    <div style=\"margin-left: 48px;\">\n",
       "        <h3 style=\"margin-bottom: 0px; margin-top: 0px;\">LocalCUDACluster</h3>\n",
       "        <p style=\"color: #9D9D9D; margin-bottom: 0px;\">27a5eec8</p>\n",
       "        <table style=\"width: 100%; text-align: left;\">\n",
       "            <tr>\n",
       "                <td style=\"text-align: left;\">\n",
       "                    <strong>Dashboard:</strong> <a href=\"http://127.0.0.1:8787/status\" target=\"_blank\">http://127.0.0.1:8787/status</a>\n",
       "                </td>\n",
       "                <td style=\"text-align: left;\">\n",
       "                    <strong>Workers:</strong> 8\n",
       "                </td>\n",
       "            </tr>\n",
       "            <tr>\n",
       "                <td style=\"text-align: left;\">\n",
       "                    <strong>Total threads:</strong> 8\n",
       "                </td>\n",
       "                <td style=\"text-align: left;\">\n",
       "                    <strong>Total memory:</strong> 1.86 TiB\n",
       "                </td>\n",
       "            </tr>\n",
       "            \n",
       "            <tr>\n",
       "    <td style=\"text-align: left;\"><strong>Status:</strong> running</td>\n",
       "    <td style=\"text-align: left;\"><strong>Using processes:</strong> True</td>\n",
       "</tr>\n",
       "\n",
       "            \n",
       "        </table>\n",
       "\n",
       "        <details>\n",
       "            <summary style=\"margin-bottom: 20px;\">\n",
       "                <h3 style=\"display: inline;\">Scheduler Info</h3>\n",
       "            </summary>\n",
       "\n",
       "            <div style=\"\">\n",
       "    <div>\n",
       "        <div style=\"width: 24px; height: 24px; background-color: #FFF7E5; border: 3px solid #FF6132; border-radius: 5px; position: absolute;\"> </div>\n",
       "        <div style=\"margin-left: 48px;\">\n",
       "            <h3 style=\"margin-bottom: 0px;\">Scheduler</h3>\n",
       "            <p style=\"color: #9D9D9D; margin-bottom: 0px;\">Scheduler-f5323a92-56fa-4fc7-93ba-8c32d627e723</p>\n",
       "            <table style=\"width: 100%; text-align: left;\">\n",
       "                <tr>\n",
       "                    <td style=\"text-align: left;\">\n",
       "                        <strong>Comm:</strong> tcp://127.0.0.1:42393\n",
       "                    </td>\n",
       "                    <td style=\"text-align: left;\">\n",
       "                        <strong>Workers:</strong> 8\n",
       "                    </td>\n",
       "                </tr>\n",
       "                <tr>\n",
       "                    <td style=\"text-align: left;\">\n",
       "                        <strong>Dashboard:</strong> <a href=\"http://127.0.0.1:8787/status\" target=\"_blank\">http://127.0.0.1:8787/status</a>\n",
       "                    </td>\n",
       "                    <td style=\"text-align: left;\">\n",
       "                        <strong>Total threads:</strong> 8\n",
       "                    </td>\n",
       "                </tr>\n",
       "                <tr>\n",
       "                    <td style=\"text-align: left;\">\n",
       "                        <strong>Started:</strong> Just now\n",
       "                    </td>\n",
       "                    <td style=\"text-align: left;\">\n",
       "                        <strong>Total memory:</strong> 1.86 TiB\n",
       "                    </td>\n",
       "                </tr>\n",
       "            </table>\n",
       "        </div>\n",
       "    </div>\n",
       "\n",
       "    <details style=\"margin-left: 48px;\">\n",
       "        <summary style=\"margin-bottom: 20px;\">\n",
       "            <h3 style=\"display: inline;\">Workers</h3>\n",
       "        </summary>\n",
       "\n",
       "        \n",
       "        <div style=\"margin-bottom: 20px;\">\n",
       "            <div style=\"width: 24px; height: 24px; background-color: #DBF5FF; border: 3px solid #4CC9FF; border-radius: 5px; position: absolute;\"> </div>\n",
       "            <div style=\"margin-left: 48px;\">\n",
       "            <details>\n",
       "                <summary>\n",
       "                    <h4 style=\"margin-bottom: 0px; display: inline;\">Worker: 0</h4>\n",
       "                </summary>\n",
       "                <table style=\"width: 100%; text-align: left;\">\n",
       "                    <tr>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Comm: </strong> tcp://127.0.0.1:41181\n",
       "                        </td>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Total threads: </strong> 1\n",
       "                        </td>\n",
       "                    </tr>\n",
       "                    <tr>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Dashboard: </strong> <a href=\"http://127.0.0.1:33463/status\" target=\"_blank\">http://127.0.0.1:33463/status</a>\n",
       "                        </td>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Memory: </strong> 237.50 GiB\n",
       "                        </td>\n",
       "                    </tr>\n",
       "                    <tr>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Nanny: </strong> tcp://127.0.0.1:34937\n",
       "                        </td>\n",
       "                        <td style=\"text-align: left;\"></td>\n",
       "                    </tr>\n",
       "                    <tr>\n",
       "                        <td colspan=\"2\" style=\"text-align: left;\">\n",
       "                            <strong>Local directory: </strong> /tmp/dask-scratch-space/worker-x_85b63u\n",
       "                        </td>\n",
       "                    </tr>\n",
       "\n",
       "                    \n",
       "                    <tr>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>GPU: </strong>NVIDIA A100-SXM4-80GB\n",
       "                        </td>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>GPU memory: </strong> 80.00 GiB\n",
       "                        </td>\n",
       "                    </tr>\n",
       "                    \n",
       "\n",
       "                    \n",
       "\n",
       "                </table>\n",
       "            </details>\n",
       "            </div>\n",
       "        </div>\n",
       "        \n",
       "        <div style=\"margin-bottom: 20px;\">\n",
       "            <div style=\"width: 24px; height: 24px; background-color: #DBF5FF; border: 3px solid #4CC9FF; border-radius: 5px; position: absolute;\"> </div>\n",
       "            <div style=\"margin-left: 48px;\">\n",
       "            <details>\n",
       "                <summary>\n",
       "                    <h4 style=\"margin-bottom: 0px; display: inline;\">Worker: 1</h4>\n",
       "                </summary>\n",
       "                <table style=\"width: 100%; text-align: left;\">\n",
       "                    <tr>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Comm: </strong> tcp://127.0.0.1:39227\n",
       "                        </td>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Total threads: </strong> 1\n",
       "                        </td>\n",
       "                    </tr>\n",
       "                    <tr>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Dashboard: </strong> <a href=\"http://127.0.0.1:41193/status\" target=\"_blank\">http://127.0.0.1:41193/status</a>\n",
       "                        </td>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Memory: </strong> 237.50 GiB\n",
       "                        </td>\n",
       "                    </tr>\n",
       "                    <tr>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Nanny: </strong> tcp://127.0.0.1:35679\n",
       "                        </td>\n",
       "                        <td style=\"text-align: left;\"></td>\n",
       "                    </tr>\n",
       "                    <tr>\n",
       "                        <td colspan=\"2\" style=\"text-align: left;\">\n",
       "                            <strong>Local directory: </strong> /tmp/dask-scratch-space/worker-1w4snx39\n",
       "                        </td>\n",
       "                    </tr>\n",
       "\n",
       "                    \n",
       "                    <tr>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>GPU: </strong>NVIDIA A100-SXM4-80GB\n",
       "                        </td>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>GPU memory: </strong> 80.00 GiB\n",
       "                        </td>\n",
       "                    </tr>\n",
       "                    \n",
       "\n",
       "                    \n",
       "\n",
       "                </table>\n",
       "            </details>\n",
       "            </div>\n",
       "        </div>\n",
       "        \n",
       "        <div style=\"margin-bottom: 20px;\">\n",
       "            <div style=\"width: 24px; height: 24px; background-color: #DBF5FF; border: 3px solid #4CC9FF; border-radius: 5px; position: absolute;\"> </div>\n",
       "            <div style=\"margin-left: 48px;\">\n",
       "            <details>\n",
       "                <summary>\n",
       "                    <h4 style=\"margin-bottom: 0px; display: inline;\">Worker: 2</h4>\n",
       "                </summary>\n",
       "                <table style=\"width: 100%; text-align: left;\">\n",
       "                    <tr>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Comm: </strong> tcp://127.0.0.1:42895\n",
       "                        </td>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Total threads: </strong> 1\n",
       "                        </td>\n",
       "                    </tr>\n",
       "                    <tr>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Dashboard: </strong> <a href=\"http://127.0.0.1:38303/status\" target=\"_blank\">http://127.0.0.1:38303/status</a>\n",
       "                        </td>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Memory: </strong> 237.50 GiB\n",
       "                        </td>\n",
       "                    </tr>\n",
       "                    <tr>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Nanny: </strong> tcp://127.0.0.1:39305\n",
       "                        </td>\n",
       "                        <td style=\"text-align: left;\"></td>\n",
       "                    </tr>\n",
       "                    <tr>\n",
       "                        <td colspan=\"2\" style=\"text-align: left;\">\n",
       "                            <strong>Local directory: </strong> /tmp/dask-scratch-space/worker-frdj5cke\n",
       "                        </td>\n",
       "                    </tr>\n",
       "\n",
       "                    \n",
       "                    <tr>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>GPU: </strong>NVIDIA A100-SXM4-80GB\n",
       "                        </td>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>GPU memory: </strong> 80.00 GiB\n",
       "                        </td>\n",
       "                    </tr>\n",
       "                    \n",
       "\n",
       "                    \n",
       "\n",
       "                </table>\n",
       "            </details>\n",
       "            </div>\n",
       "        </div>\n",
       "        \n",
       "        <div style=\"margin-bottom: 20px;\">\n",
       "            <div style=\"width: 24px; height: 24px; background-color: #DBF5FF; border: 3px solid #4CC9FF; border-radius: 5px; position: absolute;\"> </div>\n",
       "            <div style=\"margin-left: 48px;\">\n",
       "            <details>\n",
       "                <summary>\n",
       "                    <h4 style=\"margin-bottom: 0px; display: inline;\">Worker: 3</h4>\n",
       "                </summary>\n",
       "                <table style=\"width: 100%; text-align: left;\">\n",
       "                    <tr>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Comm: </strong> tcp://127.0.0.1:46437\n",
       "                        </td>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Total threads: </strong> 1\n",
       "                        </td>\n",
       "                    </tr>\n",
       "                    <tr>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Dashboard: </strong> <a href=\"http://127.0.0.1:34027/status\" target=\"_blank\">http://127.0.0.1:34027/status</a>\n",
       "                        </td>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Memory: </strong> 237.50 GiB\n",
       "                        </td>\n",
       "                    </tr>\n",
       "                    <tr>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Nanny: </strong> tcp://127.0.0.1:36439\n",
       "                        </td>\n",
       "                        <td style=\"text-align: left;\"></td>\n",
       "                    </tr>\n",
       "                    <tr>\n",
       "                        <td colspan=\"2\" style=\"text-align: left;\">\n",
       "                            <strong>Local directory: </strong> /tmp/dask-scratch-space/worker-v01hcrk_\n",
       "                        </td>\n",
       "                    </tr>\n",
       "\n",
       "                    \n",
       "                    <tr>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>GPU: </strong>NVIDIA A100-SXM4-80GB\n",
       "                        </td>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>GPU memory: </strong> 80.00 GiB\n",
       "                        </td>\n",
       "                    </tr>\n",
       "                    \n",
       "\n",
       "                    \n",
       "\n",
       "                </table>\n",
       "            </details>\n",
       "            </div>\n",
       "        </div>\n",
       "        \n",
       "        <div style=\"margin-bottom: 20px;\">\n",
       "            <div style=\"width: 24px; height: 24px; background-color: #DBF5FF; border: 3px solid #4CC9FF; border-radius: 5px; position: absolute;\"> </div>\n",
       "            <div style=\"margin-left: 48px;\">\n",
       "            <details>\n",
       "                <summary>\n",
       "                    <h4 style=\"margin-bottom: 0px; display: inline;\">Worker: 4</h4>\n",
       "                </summary>\n",
       "                <table style=\"width: 100%; text-align: left;\">\n",
       "                    <tr>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Comm: </strong> tcp://127.0.0.1:42063\n",
       "                        </td>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Total threads: </strong> 1\n",
       "                        </td>\n",
       "                    </tr>\n",
       "                    <tr>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Dashboard: </strong> <a href=\"http://127.0.0.1:41045/status\" target=\"_blank\">http://127.0.0.1:41045/status</a>\n",
       "                        </td>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Memory: </strong> 237.50 GiB\n",
       "                        </td>\n",
       "                    </tr>\n",
       "                    <tr>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Nanny: </strong> tcp://127.0.0.1:45531\n",
       "                        </td>\n",
       "                        <td style=\"text-align: left;\"></td>\n",
       "                    </tr>\n",
       "                    <tr>\n",
       "                        <td colspan=\"2\" style=\"text-align: left;\">\n",
       "                            <strong>Local directory: </strong> /tmp/dask-scratch-space/worker-lp1fcy5q\n",
       "                        </td>\n",
       "                    </tr>\n",
       "\n",
       "                    \n",
       "                    <tr>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>GPU: </strong>NVIDIA A100-SXM4-80GB\n",
       "                        </td>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>GPU memory: </strong> 80.00 GiB\n",
       "                        </td>\n",
       "                    </tr>\n",
       "                    \n",
       "\n",
       "                    \n",
       "\n",
       "                </table>\n",
       "            </details>\n",
       "            </div>\n",
       "        </div>\n",
       "        \n",
       "        <div style=\"margin-bottom: 20px;\">\n",
       "            <div style=\"width: 24px; height: 24px; background-color: #DBF5FF; border: 3px solid #4CC9FF; border-radius: 5px; position: absolute;\"> </div>\n",
       "            <div style=\"margin-left: 48px;\">\n",
       "            <details>\n",
       "                <summary>\n",
       "                    <h4 style=\"margin-bottom: 0px; display: inline;\">Worker: 5</h4>\n",
       "                </summary>\n",
       "                <table style=\"width: 100%; text-align: left;\">\n",
       "                    <tr>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Comm: </strong> tcp://127.0.0.1:37545\n",
       "                        </td>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Total threads: </strong> 1\n",
       "                        </td>\n",
       "                    </tr>\n",
       "                    <tr>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Dashboard: </strong> <a href=\"http://127.0.0.1:46027/status\" target=\"_blank\">http://127.0.0.1:46027/status</a>\n",
       "                        </td>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Memory: </strong> 237.50 GiB\n",
       "                        </td>\n",
       "                    </tr>\n",
       "                    <tr>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Nanny: </strong> tcp://127.0.0.1:43059\n",
       "                        </td>\n",
       "                        <td style=\"text-align: left;\"></td>\n",
       "                    </tr>\n",
       "                    <tr>\n",
       "                        <td colspan=\"2\" style=\"text-align: left;\">\n",
       "                            <strong>Local directory: </strong> /tmp/dask-scratch-space/worker-c2mqyb7l\n",
       "                        </td>\n",
       "                    </tr>\n",
       "\n",
       "                    \n",
       "                    <tr>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>GPU: </strong>NVIDIA A100-SXM4-80GB\n",
       "                        </td>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>GPU memory: </strong> 80.00 GiB\n",
       "                        </td>\n",
       "                    </tr>\n",
       "                    \n",
       "\n",
       "                    \n",
       "\n",
       "                </table>\n",
       "            </details>\n",
       "            </div>\n",
       "        </div>\n",
       "        \n",
       "        <div style=\"margin-bottom: 20px;\">\n",
       "            <div style=\"width: 24px; height: 24px; background-color: #DBF5FF; border: 3px solid #4CC9FF; border-radius: 5px; position: absolute;\"> </div>\n",
       "            <div style=\"margin-left: 48px;\">\n",
       "            <details>\n",
       "                <summary>\n",
       "                    <h4 style=\"margin-bottom: 0px; display: inline;\">Worker: 6</h4>\n",
       "                </summary>\n",
       "                <table style=\"width: 100%; text-align: left;\">\n",
       "                    <tr>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Comm: </strong> tcp://127.0.0.1:43897\n",
       "                        </td>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Total threads: </strong> 1\n",
       "                        </td>\n",
       "                    </tr>\n",
       "                    <tr>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Dashboard: </strong> <a href=\"http://127.0.0.1:41089/status\" target=\"_blank\">http://127.0.0.1:41089/status</a>\n",
       "                        </td>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Memory: </strong> 237.50 GiB\n",
       "                        </td>\n",
       "                    </tr>\n",
       "                    <tr>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Nanny: </strong> tcp://127.0.0.1:39879\n",
       "                        </td>\n",
       "                        <td style=\"text-align: left;\"></td>\n",
       "                    </tr>\n",
       "                    <tr>\n",
       "                        <td colspan=\"2\" style=\"text-align: left;\">\n",
       "                            <strong>Local directory: </strong> /tmp/dask-scratch-space/worker-_0f_tcs6\n",
       "                        </td>\n",
       "                    </tr>\n",
       "\n",
       "                    \n",
       "                    <tr>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>GPU: </strong>NVIDIA A100-SXM4-80GB\n",
       "                        </td>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>GPU memory: </strong> 80.00 GiB\n",
       "                        </td>\n",
       "                    </tr>\n",
       "                    \n",
       "\n",
       "                    \n",
       "\n",
       "                </table>\n",
       "            </details>\n",
       "            </div>\n",
       "        </div>\n",
       "        \n",
       "        <div style=\"margin-bottom: 20px;\">\n",
       "            <div style=\"width: 24px; height: 24px; background-color: #DBF5FF; border: 3px solid #4CC9FF; border-radius: 5px; position: absolute;\"> </div>\n",
       "            <div style=\"margin-left: 48px;\">\n",
       "            <details>\n",
       "                <summary>\n",
       "                    <h4 style=\"margin-bottom: 0px; display: inline;\">Worker: 7</h4>\n",
       "                </summary>\n",
       "                <table style=\"width: 100%; text-align: left;\">\n",
       "                    <tr>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Comm: </strong> tcp://127.0.0.1:39193\n",
       "                        </td>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Total threads: </strong> 1\n",
       "                        </td>\n",
       "                    </tr>\n",
       "                    <tr>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Dashboard: </strong> <a href=\"http://127.0.0.1:37869/status\" target=\"_blank\">http://127.0.0.1:37869/status</a>\n",
       "                        </td>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Memory: </strong> 237.50 GiB\n",
       "                        </td>\n",
       "                    </tr>\n",
       "                    <tr>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Nanny: </strong> tcp://127.0.0.1:44741\n",
       "                        </td>\n",
       "                        <td style=\"text-align: left;\"></td>\n",
       "                    </tr>\n",
       "                    <tr>\n",
       "                        <td colspan=\"2\" style=\"text-align: left;\">\n",
       "                            <strong>Local directory: </strong> /tmp/dask-scratch-space/worker-j7bszc0g\n",
       "                        </td>\n",
       "                    </tr>\n",
       "\n",
       "                    \n",
       "                    <tr>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>GPU: </strong>NVIDIA A100-SXM4-80GB\n",
       "                        </td>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>GPU memory: </strong> 80.00 GiB\n",
       "                        </td>\n",
       "                    </tr>\n",
       "                    \n",
       "\n",
       "                    \n",
       "\n",
       "                </table>\n",
       "            </details>\n",
       "            </div>\n",
       "        </div>\n",
       "        \n",
       "\n",
       "    </details>\n",
       "</div>\n",
       "\n",
       "        </details>\n",
       "    </div>\n",
       "</div>\n",
       "            </details>\n",
       "        \n",
       "\n",
       "    </div>\n",
       "</div>"
      ],
      "text/plain": [
       "<Client: 'tcp://127.0.0.1:42393' processes=8 threads=8, memory=1.86 TiB>"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%time\n",
    "# Start a local Dask-CUDA cluster on the selected GPUs and connect a client to it.\n",
    "cluster = LocalCUDACluster(CUDA_VISIBLE_DEVICES=preprocessing_gpus)\n",
    "client = Client(cluster)\n",
    "\n",
    "# Apply the RMM/CuPy memory setup in this process, then on every worker.\n",
    "set_mem()\n",
    "client.run(set_mem)\n",
    "\n",
    "# Final expression: render the client summary as the cell output.\n",
    "client"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "b4ac3268-f54b-40c6-9f56-6f0d942c6753",
   "metadata": {},
   "outputs": [],
   "source": [
    "import cudf\n",
    "import cuml\n",
    "import cupy as cp\n",
    "from cuml.dask.common.part_utils import _extract_partitions\n",
    "import math\n",
    "from cuml.internals.memory_utils import with_cupy_rmm\n",
    "import h5py\n",
    "import rapids_singlecell as rsc"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "c8541cf5-9a40-4b1a-baea-309bfdc8d15a",
   "metadata": {},
   "outputs": [],
   "source": [
    "from cupyx.scipy import sparse"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "87cb9613-bed9-4b7c-accc-e88b3d4bc415",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "3fc9a948-80ea-457f-8ded-151505a23418",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<KeysViewHDF5 ['Biotype', 'Chromosome', 'End', 'Gene', 'Start', 'ensembl_ids', 'feature_biotype', 'feature_is_filtered', 'feature_length', 'feature_name', 'feature_reference']>\n",
      "CPU times: user 6.52 s, sys: 3.24 s, total: 9.76 s\n",
      "Wall time: 1min 28s\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<table>\n",
       "    <tr>\n",
       "        <td>\n",
       "            <table style=\"border-collapse: collapse;\">\n",
       "                <thead>\n",
       "                    <tr>\n",
       "                        <td> </td>\n",
       "                        <th> Array </th>\n",
       "                        <th> Chunk </th>\n",
       "                    </tr>\n",
       "                </thead>\n",
       "                <tbody>\n",
       "                    \n",
       "                    <tr>\n",
       "                        <th> Bytes </th>\n",
       "                        <td> 548.59 GiB </td>\n",
       "                        <td> 11.06 GiB </td>\n",
       "                    </tr>\n",
       "                    \n",
       "                    <tr>\n",
       "                        <th> Shape </th>\n",
       "                        <td> (2480956, 59357) </td>\n",
       "                        <td> (50000, 59357) </td>\n",
       "                    </tr>\n",
       "                    <tr>\n",
       "                        <th> Dask graph </th>\n",
       "                        <td colspan=\"2\"> 50 chunks in 1 graph layer </td>\n",
       "                    </tr>\n",
       "                    <tr>\n",
       "                        <th> Data type </th>\n",
       "                        <td colspan=\"2\"> float32 numpy.ndarray </td>\n",
       "                    </tr>\n",
       "                </tbody>\n",
       "            </table>\n",
       "        </td>\n",
       "        <td>\n",
       "        <svg width=\"80\" height=\"170\" style=\"stroke:rgb(0,0,0);stroke-width:1\" >\n",
       "\n",
       "  <!-- Horizontal lines -->\n",
       "  <line x1=\"0\" y1=\"0\" x2=\"30\" y2=\"0\" style=\"stroke-width:2\" />\n",
       "  <line x1=\"0\" y1=\"4\" x2=\"30\" y2=\"4\" />\n",
       "  <line x1=\"0\" y1=\"12\" x2=\"30\" y2=\"12\" />\n",
       "  <line x1=\"0\" y1=\"16\" x2=\"30\" y2=\"16\" />\n",
       "  <line x1=\"0\" y1=\"24\" x2=\"30\" y2=\"24\" />\n",
       "  <line x1=\"0\" y1=\"31\" x2=\"30\" y2=\"31\" />\n",
       "  <line x1=\"0\" y1=\"36\" x2=\"30\" y2=\"36\" />\n",
       "  <line x1=\"0\" y1=\"43\" x2=\"30\" y2=\"43\" />\n",
       "  <line x1=\"0\" y1=\"50\" x2=\"30\" y2=\"50\" />\n",
       "  <line x1=\"0\" y1=\"55\" x2=\"30\" y2=\"55\" />\n",
       "  <line x1=\"0\" y1=\"62\" x2=\"30\" y2=\"62\" />\n",
       "  <line x1=\"0\" y1=\"67\" x2=\"30\" y2=\"67\" />\n",
       "  <line x1=\"0\" y1=\"74\" x2=\"30\" y2=\"74\" />\n",
       "  <line x1=\"0\" y1=\"82\" x2=\"30\" y2=\"82\" />\n",
       "  <line x1=\"0\" y1=\"87\" x2=\"30\" y2=\"87\" />\n",
       "  <line x1=\"0\" y1=\"94\" x2=\"30\" y2=\"94\" />\n",
       "  <line x1=\"0\" y1=\"101\" x2=\"30\" y2=\"101\" />\n",
       "  <line x1=\"0\" y1=\"106\" x2=\"30\" y2=\"106\" />\n",
       "  <line x1=\"0\" y1=\"113\" x2=\"30\" y2=\"113\" />\n",
       "  <line x1=\"0\" y1=\"120\" x2=\"30\" y2=\"120\" style=\"stroke-width:2\" />\n",
       "\n",
       "  <!-- Vertical lines -->\n",
       "  <line x1=\"0\" y1=\"0\" x2=\"0\" y2=\"120\" style=\"stroke-width:2\" />\n",
       "  <line x1=\"30\" y1=\"0\" x2=\"30\" y2=\"120\" style=\"stroke-width:2\" />\n",
       "\n",
       "  <!-- Colored Rectangle -->\n",
       "  <polygon points=\"0.0,0.0 30.05538364053684,0.0 30.05538364053684,120.0 0.0,120.0\" style=\"fill:#8B4903A0;stroke-width:0\"/>\n",
       "\n",
       "  <!-- Text -->\n",
       "  <text x=\"15.027692\" y=\"140.000000\" font-size=\"1.0rem\" font-weight=\"100\" text-anchor=\"middle\" >59357</text>\n",
       "  <text x=\"50.055384\" y=\"60.000000\" font-size=\"1.0rem\" font-weight=\"100\" text-anchor=\"middle\" transform=\"rotate(-90,50.055384,60.000000)\">2480956</text>\n",
       "</svg>\n",
       "        </td>\n",
       "    </tr>\n",
       "</table>"
      ],
      "text/plain": [
       "dask.array<concatenate, shape=(2480956, 59357), dtype=float32, chunksize=(50000, 59357), chunktype=numpy.ndarray>"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%time\n",
    "def read_with_filter(client,\n",
    "                     sample_file, batch_size = 50000):\n",
    "    \"\"\"\n",
    "    Reads an h5ad file and applies cell and geans count filter. Dask Array is\n",
    "    used allow partitioning the input file. This function supports multi-GPUs.\n",
    "    \"\"\"\n",
    "\n",
    "    # Path in h5 file\n",
    "    _data = '/X/data'\n",
    "    _index = '/X/indices'\n",
    "    _indprt = '/X/indptr'\n",
    "    _genes = '/var/ensembl_ids'\n",
    "    #_genes = '/var/ensembl_id'\n",
    "    #_genes = '/var/_index'\n",
    "    #_genes = '/var/feature_id'\n",
    "    _barcodes = '/obs/_index'\n",
    "\n",
    "    @dask.delayed\n",
    "    def _read_partition_to_sparse_matrix(sample_file,\n",
    "                                         total_cols, batch_start, batch_end,\n",
    "                                         ):\n",
    "        with h5py.File(sample_file, 'r') as h5f:\n",
    "            indptrs = h5f[_indprt]\n",
    "            start_ptr = indptrs[batch_start]\n",
    "            end_ptr = indptrs[batch_end]\n",
    "\n",
    "            # Read all things data and index\n",
    "            sub_data = cp.array(h5f[_data][start_ptr:end_ptr])\n",
    "            sub_indices = cp.array(h5f[_index][start_ptr:end_ptr])\n",
    "\n",
    "            # recompute the row pointer for the partial dataset\n",
    "            sub_indptrs  = cp.array(indptrs[batch_start:(batch_end + 1)])\n",
    "            sub_indptrs = sub_indptrs - sub_indptrs[0]\n",
    "\n",
    "        # Reconstruct partial sparse array\n",
    "        partial_sparse_array = cp.sparse.csr_matrix(\n",
    "            (sub_data, sub_indices, sub_indptrs),\n",
    "            shape=(batch_end - batch_start, total_cols))\n",
    "            \n",
    "        return partial_sparse_array\n",
    "\n",
    "\n",
    "    with h5py.File(sample_file, 'r') as h5f:\n",
    "        # Compute the number of cells to read\n",
    "        indptr = h5f[_indprt]\n",
    "        vars= h5f[\"/var/\"]\n",
    "        print(vars.keys())\n",
    "        genes = cudf.Series(h5f[_genes], dtype=cp.dtype('object'))\n",
    "\n",
    "        total_cols = genes.shape[0]\n",
    "        max_cells = indptr.shape[0] - 1\n",
    "\n",
    "    dls = []\n",
    "    for batch_start in range(0, max_cells, batch_size):\n",
    "        actual_batch_size = min(batch_size, max_cells - batch_start)\n",
    "        dls.append(dask.array.from_delayed(\n",
    "                   (_read_partition_to_sparse_matrix)\n",
    "                   (sample_file,\n",
    "                    total_cols,\n",
    "                    batch_start,\n",
    "                    batch_start + actual_batch_size),\n",
    "                   dtype=cp.float32,\n",
    "                   shape=(actual_batch_size, total_cols)))\n",
    "\n",
    "    dask_sparse_arr =  dask.array.concatenate(dls)\n",
    "    dask_sparse_arr = dask_sparse_arr.persist()\n",
    "    return dask_sparse_arr\n",
    "\n",
    "dask_sparse_arr = read_with_filter(client, \"h5/human_brain.h5ad\", batch_size=50000)\n",
    "dask_sparse_arr = dask_sparse_arr.persist()\n",
    "\n",
    "dask_sparse_arr.compute_chunk_sizes()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "a7976fd6-24d0-4dc9-a2d9-ab2bf0fb05c8",
   "metadata": {},
   "outputs": [],
   "source": [
    "@with_cupy_rmm\n",
    "def calc_qc_dask(client, csr_matrix, axis=0):\n",
    "    '''\n",
    "    Implements sum operation for dask array when the backend is cupy sparse csr matrix\n",
    "    '''\n",
    "    from rapids_singlecell.preprocessing._kernels._qc_kernels import _sparse_qc_csr\n",
    "    sparse_qc_csr = _sparse_qc_csr(csr_matrix.dtype)\n",
    "    sparse_qc_csr.compile()\n",
    "\n",
    "    def __qc_calc(X):\n",
    "        sums_cells = cp.zeros(X.shape[0], dtype=X.dtype)\n",
    "        sums_genes = cp.zeros((X.shape[1],1), dtype=X.dtype)\n",
    "        cell_ex = cp.zeros(X.shape[0], dtype=cp.int32)\n",
    "        gene_ex = cp.zeros((X.shape[1],1), dtype=cp.int32)\n",
    "        block = (32,)\n",
    "        grid = (int(math.ceil(X.shape[0] / block[0])),)\n",
    "        sparse_qc_csr(\n",
    "        grid,\n",
    "        block,\n",
    "        (\n",
    "            X.indptr,\n",
    "            X.indices,\n",
    "            X.data,\n",
    "            sums_cells,\n",
    "            sums_genes,\n",
    "            cell_ex,\n",
    "            gene_ex,\n",
    "            X.shape[0],\n",
    "            ),\n",
    "        )\n",
    "        return sums_cells,sums_genes,cell_ex,gene_ex\n",
    "    parts = client.sync(_extract_partitions, csr_matrix)\n",
    "    futures = [client.submit(__qc_calc, part, workers=[w]) for w, part in parts]\n",
    "    # Gather results from futures\n",
    "    results = client.gather(futures)\n",
    "\n",
    "    # Initialize lists to hold the Dask arrays\n",
    "    sums_cells_objs = []\n",
    "    sums_genes_objs = []\n",
    "    cell_ex_objs = []\n",
    "    gene_ex_objs = []\n",
    "\n",
    "    # Process each result\n",
    "    for sums_cells, sums_genes, cell_ex, gene_ex in results:\n",
    "        # Append the arrays to their respective lists as Dask arrays\n",
    "        sums_cells_objs.append(dask.array.from_array(sums_cells, chunks=sums_cells.shape))\n",
    "        sums_genes_objs.append(dask.array.from_array(sums_genes, chunks=sums_genes.shape))\n",
    "        cell_ex_objs.append(dask.array.from_array(cell_ex, chunks=cell_ex.shape))\n",
    "        gene_ex_objs.append(dask.array.from_array(gene_ex, chunks=gene_ex.shape))\n",
    "    sums_cells = dask.array.concatenate(sums_cells_objs).compute().ravel()\n",
    "    sums_genes = dask.array.concatenate(sums_genes_objs,axis=1).compute().sum(axis=1).ravel()\n",
    "    cell_ex = dask.array.concatenate(cell_ex_objs).compute().ravel()\n",
    "    gene_ex = dask.array.concatenate(gene_ex_objs,axis=1).compute().sum(axis=1).ravel()\n",
    "    return sums_cells, sums_genes, cell_ex, gene_ex\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "2b9a4f2c-7d79-4316-b3b5-3b447322b1af",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/sedi10/conda/envs/rapids-23.12/lib/python3.10/site-packages/distributed/client.py:3163: UserWarning: Sending large graph of size 11.34 MiB.\n",
      "This may cause some slowdown.\n",
      "Consider scattering data ahead of time and using futures.\n",
      "  warnings.warn(\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 825 ms, sys: 214 ms, total: 1.04 s\n",
      "Wall time: 1.29 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "sums_cells, sums_genes, cell_ex, gene_ex = calc_qc_dask(client, dask_sparse_arr)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "3250452d-fa10-47bf-9f00-9ef6e52fdf56",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 4.36 ms, sys: 2.5 ms, total: 6.86 ms\n",
      "Wall time: 18.4 ms\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "qc_cells = (cell_ex <= 10000) & (200 <= cell_ex)\n",
    "qc_genes = (10 <= gene_ex)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "7772b50c-2e75-42e0-be7b-39db3c5bd8c6",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<timed exec>:1: PerformanceWarning: Slicing is producing a large chunk. To accept the large\n",
      "chunk and silence this warning, set the option\n",
      "    >>> with dask.config.set(**{'array.slicing.split_large_chunks': False}):\n",
      "    ...     array[indexer]\n",
      "\n",
      "To avoid creating the large chunks, set the option\n",
      "    >>> with dask.config.set(**{'array.slicing.split_large_chunks': True}):\n",
      "    ...     array[indexer]\n",
      "/home/sedi10/conda/envs/rapids-23.12/lib/python3.10/site-packages/distributed/client.py:3163: UserWarning: Sending large graph of size 18.56 MiB.\n",
      "This may cause some slowdown.\n",
      "Consider scattering data ahead of time and using futures.\n",
      "  warnings.warn(\n",
      "<timed exec>:2: PerformanceWarning: Slicing is producing a large chunk. To accept the large\n",
      "chunk and silence this warning, set the option\n",
      "    >>> with dask.config.set(**{'array.slicing.split_large_chunks': False}):\n",
      "    ...     array[indexer]\n",
      "\n",
      "To avoid creating the large chunks, set the option\n",
      "    >>> with dask.config.set(**{'array.slicing.split_large_chunks': True}):\n",
      "    ...     array[indexer]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 824 ms, sys: 626 ms, total: 1.45 s\n",
      "Wall time: 15.5 s\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<table>\n",
       "    <tr>\n",
       "        <td>\n",
       "            <table style=\"border-collapse: collapse;\">\n",
       "                <thead>\n",
       "                    <tr>\n",
       "                        <td> </td>\n",
       "                        <th> Array </th>\n",
       "                        <th> Chunk </th>\n",
       "                    </tr>\n",
       "                </thead>\n",
       "                <tbody>\n",
       "                    \n",
       "                    <tr>\n",
       "                        <th> Bytes </th>\n",
       "                        <td> 426.35 GiB </td>\n",
       "                        <td> 8.77 GiB </td>\n",
       "                    </tr>\n",
       "                    \n",
       "                    <tr>\n",
       "                        <th> Shape </th>\n",
       "                        <td> (2432024, 47059) </td>\n",
       "                        <td> (50000, 47059) </td>\n",
       "                    </tr>\n",
       "                    <tr>\n",
       "                        <th> Dask graph </th>\n",
       "                        <td colspan=\"2\"> 50 chunks in 1 graph layer </td>\n",
       "                    </tr>\n",
       "                    <tr>\n",
       "                        <th> Data type </th>\n",
       "                        <td colspan=\"2\"> float32 numpy.ndarray </td>\n",
       "                    </tr>\n",
       "                </tbody>\n",
       "            </table>\n",
       "        </td>\n",
       "        <td>\n",
       "        <svg width=\"78\" height=\"170\" style=\"stroke:rgb(0,0,0);stroke-width:1\" >\n",
       "\n",
       "  <!-- Horizontal lines -->\n",
       "  <line x1=\"0\" y1=\"0\" x2=\"28\" y2=\"0\" style=\"stroke-width:2\" />\n",
       "  <line x1=\"0\" y1=\"4\" x2=\"28\" y2=\"4\" />\n",
       "  <line x1=\"0\" y1=\"12\" x2=\"28\" y2=\"12\" />\n",
       "  <line x1=\"0\" y1=\"17\" x2=\"28\" y2=\"17\" />\n",
       "  <line x1=\"0\" y1=\"24\" x2=\"28\" y2=\"24\" />\n",
       "  <line x1=\"0\" y1=\"31\" x2=\"28\" y2=\"31\" />\n",
       "  <line x1=\"0\" y1=\"36\" x2=\"28\" y2=\"36\" />\n",
       "  <line x1=\"0\" y1=\"43\" x2=\"28\" y2=\"43\" />\n",
       "  <line x1=\"0\" y1=\"51\" x2=\"28\" y2=\"51\" />\n",
       "  <line x1=\"0\" y1=\"55\" x2=\"28\" y2=\"55\" />\n",
       "  <line x1=\"0\" y1=\"63\" x2=\"28\" y2=\"63\" />\n",
       "  <line x1=\"0\" y1=\"68\" x2=\"28\" y2=\"68\" />\n",
       "  <line x1=\"0\" y1=\"75\" x2=\"28\" y2=\"75\" />\n",
       "  <line x1=\"0\" y1=\"82\" x2=\"28\" y2=\"82\" />\n",
       "  <line x1=\"0\" y1=\"87\" x2=\"28\" y2=\"87\" />\n",
       "  <line x1=\"0\" y1=\"94\" x2=\"28\" y2=\"94\" />\n",
       "  <line x1=\"0\" y1=\"101\" x2=\"28\" y2=\"101\" />\n",
       "  <line x1=\"0\" y1=\"106\" x2=\"28\" y2=\"106\" />\n",
       "  <line x1=\"0\" y1=\"113\" x2=\"28\" y2=\"113\" />\n",
       "  <line x1=\"0\" y1=\"120\" x2=\"28\" y2=\"120\" style=\"stroke-width:2\" />\n",
       "\n",
       "  <!-- Vertical lines -->\n",
       "  <line x1=\"0\" y1=\"0\" x2=\"0\" y2=\"120\" style=\"stroke-width:2\" />\n",
       "  <line x1=\"28\" y1=\"0\" x2=\"28\" y2=\"120\" style=\"stroke-width:2\" />\n",
       "\n",
       "  <!-- Colored Rectangle -->\n",
       "  <polygon points=\"0.0,0.0 28.845202251346922,0.0 28.845202251346922,120.0 0.0,120.0\" style=\"fill:#8B4903A0;stroke-width:0\"/>\n",
       "\n",
       "  <!-- Text -->\n",
       "  <text x=\"14.422601\" y=\"140.000000\" font-size=\"1.0rem\" font-weight=\"100\" text-anchor=\"middle\" >47059</text>\n",
       "  <text x=\"48.845202\" y=\"60.000000\" font-size=\"1.0rem\" font-weight=\"100\" text-anchor=\"middle\" transform=\"rotate(-90,48.845202,60.000000)\">2432024</text>\n",
       "</svg>\n",
       "        </td>\n",
       "    </tr>\n",
       "</table>"
      ],
      "text/plain": [
       "dask.array<getitem, shape=(2432024, 47059), dtype=float32, chunksize=(50000, 47059), chunktype=numpy.ndarray>"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%time\n",
    "dask_sparse_arr = dask_sparse_arr[qc_cells,:].persist()\n",
    "dask_sparse_arr = dask_sparse_arr[:,qc_genes].persist()\n",
    "dask_sparse_arr.compute_chunk_sizes()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "fcaa4d7c-743c-449a-b895-63af2cc5c29b",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 69 ms, sys: 4.14 ms, total: 73.2 ms\n",
      "Wall time: 84.6 ms\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<table>\n",
       "    <tr>\n",
       "        <td>\n",
       "            <table style=\"border-collapse: collapse;\">\n",
       "                <thead>\n",
       "                    <tr>\n",
       "                        <td> </td>\n",
       "                        <th> Array </th>\n",
       "                        <th> Chunk </th>\n",
       "                    </tr>\n",
       "                </thead>\n",
       "                <tbody>\n",
       "                    \n",
       "                    <tr>\n",
       "                        <th> Bytes </th>\n",
       "                        <td> 426.35 GiB </td>\n",
       "                        <td> 8.77 GiB </td>\n",
       "                    </tr>\n",
       "                    \n",
       "                    <tr>\n",
       "                        <th> Shape </th>\n",
       "                        <td> (2432024, 47059) </td>\n",
       "                        <td> (50000, 47059) </td>\n",
       "                    </tr>\n",
       "                    <tr>\n",
       "                        <th> Dask graph </th>\n",
       "                        <td colspan=\"2\"> 50 chunks in 1 graph layer </td>\n",
       "                    </tr>\n",
       "                    <tr>\n",
       "                        <th> Data type </th>\n",
       "                        <td colspan=\"2\"> float32 cupy.ndarray </td>\n",
       "                    </tr>\n",
       "                </tbody>\n",
       "            </table>\n",
       "        </td>\n",
       "        <td>\n",
       "        <svg width=\"78\" height=\"170\" style=\"stroke:rgb(0,0,0);stroke-width:1\" >\n",
       "\n",
       "  <!-- Horizontal lines -->\n",
       "  <line x1=\"0\" y1=\"0\" x2=\"28\" y2=\"0\" style=\"stroke-width:2\" />\n",
       "  <line x1=\"0\" y1=\"4\" x2=\"28\" y2=\"4\" />\n",
       "  <line x1=\"0\" y1=\"12\" x2=\"28\" y2=\"12\" />\n",
       "  <line x1=\"0\" y1=\"17\" x2=\"28\" y2=\"17\" />\n",
       "  <line x1=\"0\" y1=\"24\" x2=\"28\" y2=\"24\" />\n",
       "  <line x1=\"0\" y1=\"31\" x2=\"28\" y2=\"31\" />\n",
       "  <line x1=\"0\" y1=\"36\" x2=\"28\" y2=\"36\" />\n",
       "  <line x1=\"0\" y1=\"43\" x2=\"28\" y2=\"43\" />\n",
       "  <line x1=\"0\" y1=\"51\" x2=\"28\" y2=\"51\" />\n",
       "  <line x1=\"0\" y1=\"55\" x2=\"28\" y2=\"55\" />\n",
       "  <line x1=\"0\" y1=\"63\" x2=\"28\" y2=\"63\" />\n",
       "  <line x1=\"0\" y1=\"68\" x2=\"28\" y2=\"68\" />\n",
       "  <line x1=\"0\" y1=\"75\" x2=\"28\" y2=\"75\" />\n",
       "  <line x1=\"0\" y1=\"82\" x2=\"28\" y2=\"82\" />\n",
       "  <line x1=\"0\" y1=\"87\" x2=\"28\" y2=\"87\" />\n",
       "  <line x1=\"0\" y1=\"94\" x2=\"28\" y2=\"94\" />\n",
       "  <line x1=\"0\" y1=\"101\" x2=\"28\" y2=\"101\" />\n",
       "  <line x1=\"0\" y1=\"106\" x2=\"28\" y2=\"106\" />\n",
       "  <line x1=\"0\" y1=\"113\" x2=\"28\" y2=\"113\" />\n",
       "  <line x1=\"0\" y1=\"120\" x2=\"28\" y2=\"120\" style=\"stroke-width:2\" />\n",
       "\n",
       "  <!-- Vertical lines -->\n",
       "  <line x1=\"0\" y1=\"0\" x2=\"0\" y2=\"120\" style=\"stroke-width:2\" />\n",
       "  <line x1=\"28\" y1=\"0\" x2=\"28\" y2=\"120\" style=\"stroke-width:2\" />\n",
       "\n",
       "  <!-- Colored Rectangle -->\n",
       "  <polygon points=\"0.0,0.0 28.845202251346922,0.0 28.845202251346922,120.0 0.0,120.0\" style=\"fill:#8B4903A0;stroke-width:0\"/>\n",
       "\n",
       "  <!-- Text -->\n",
       "  <text x=\"14.422601\" y=\"140.000000\" font-size=\"1.0rem\" font-weight=\"100\" text-anchor=\"middle\" >47059</text>\n",
       "  <text x=\"48.845202\" y=\"60.000000\" font-size=\"1.0rem\" font-weight=\"100\" text-anchor=\"middle\" transform=\"rotate(-90,48.845202,60.000000)\">2432024</text>\n",
       "</svg>\n",
       "        </td>\n",
       "    </tr>\n",
       "</table>"
      ],
      "text/plain": [
       "dask.array<lambda, shape=(2432024, 47059), dtype=float32, chunksize=(50000, 47059), chunktype=cupy.ndarray>"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%time\n",
    "from rapids_singlecell.preprocessing._kernels._norm_kernel import _mul_csr\n",
    "\n",
    "mul_kernel = _mul_csr(dask_sparse_arr.dtype)\n",
    "mul_kernel.compile()\n",
    "def norm(X, target_sum = 10000):\n",
    "    mul_kernel(\n",
    "        (math.ceil(X.shape[0] / 128),),\n",
    "        (128,),\n",
    "        (X.indptr, X.data, X.shape[0], int(target_sum)),\n",
    "    )\n",
    "    return X\n",
    "dask_sparse_arr = dask_sparse_arr.map_blocks(lambda X: norm(X),dtype=cp.float32,meta=cp.array((0,),dtype=dask_sparse_arr.dtype))\n",
    "dask_sparse_arr = dask_sparse_arr.persist()\n",
    "dask_sparse_arr.compute_chunk_sizes()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "28107a4a",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 74.7 ms, sys: 10.7 ms, total: 85.4 ms\n",
      "Wall time: 194 ms\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<table>\n",
       "    <tr>\n",
       "        <td>\n",
       "            <table style=\"border-collapse: collapse;\">\n",
       "                <thead>\n",
       "                    <tr>\n",
       "                        <td> </td>\n",
       "                        <th> Array </th>\n",
       "                        <th> Chunk </th>\n",
       "                    </tr>\n",
       "                </thead>\n",
       "                <tbody>\n",
       "                    \n",
       "                    <tr>\n",
       "                        <th> Bytes </th>\n",
       "                        <td> 426.35 GiB </td>\n",
       "                        <td> 8.77 GiB </td>\n",
       "                    </tr>\n",
       "                    \n",
       "                    <tr>\n",
       "                        <th> Shape </th>\n",
       "                        <td> (2432024, 47059) </td>\n",
       "                        <td> (50000, 47059) </td>\n",
       "                    </tr>\n",
       "                    <tr>\n",
       "                        <th> Dask graph </th>\n",
       "                        <td colspan=\"2\"> 50 chunks in 1 graph layer </td>\n",
       "                    </tr>\n",
       "                    <tr>\n",
       "                        <th> Data type </th>\n",
       "                        <td colspan=\"2\"> float32 cupy.ndarray </td>\n",
       "                    </tr>\n",
       "                </tbody>\n",
       "            </table>\n",
       "        </td>\n",
       "        <td>\n",
       "        <svg width=\"78\" height=\"170\" style=\"stroke:rgb(0,0,0);stroke-width:1\" >\n",
       "\n",
       "  <!-- Horizontal lines -->\n",
       "  <line x1=\"0\" y1=\"0\" x2=\"28\" y2=\"0\" style=\"stroke-width:2\" />\n",
       "  <line x1=\"0\" y1=\"4\" x2=\"28\" y2=\"4\" />\n",
       "  <line x1=\"0\" y1=\"12\" x2=\"28\" y2=\"12\" />\n",
       "  <line x1=\"0\" y1=\"17\" x2=\"28\" y2=\"17\" />\n",
       "  <line x1=\"0\" y1=\"24\" x2=\"28\" y2=\"24\" />\n",
       "  <line x1=\"0\" y1=\"31\" x2=\"28\" y2=\"31\" />\n",
       "  <line x1=\"0\" y1=\"36\" x2=\"28\" y2=\"36\" />\n",
       "  <line x1=\"0\" y1=\"43\" x2=\"28\" y2=\"43\" />\n",
       "  <line x1=\"0\" y1=\"51\" x2=\"28\" y2=\"51\" />\n",
       "  <line x1=\"0\" y1=\"55\" x2=\"28\" y2=\"55\" />\n",
       "  <line x1=\"0\" y1=\"63\" x2=\"28\" y2=\"63\" />\n",
       "  <line x1=\"0\" y1=\"68\" x2=\"28\" y2=\"68\" />\n",
       "  <line x1=\"0\" y1=\"75\" x2=\"28\" y2=\"75\" />\n",
       "  <line x1=\"0\" y1=\"82\" x2=\"28\" y2=\"82\" />\n",
       "  <line x1=\"0\" y1=\"87\" x2=\"28\" y2=\"87\" />\n",
       "  <line x1=\"0\" y1=\"94\" x2=\"28\" y2=\"94\" />\n",
       "  <line x1=\"0\" y1=\"101\" x2=\"28\" y2=\"101\" />\n",
       "  <line x1=\"0\" y1=\"106\" x2=\"28\" y2=\"106\" />\n",
       "  <line x1=\"0\" y1=\"113\" x2=\"28\" y2=\"113\" />\n",
       "  <line x1=\"0\" y1=\"120\" x2=\"28\" y2=\"120\" style=\"stroke-width:2\" />\n",
       "\n",
       "  <!-- Vertical lines -->\n",
       "  <line x1=\"0\" y1=\"0\" x2=\"0\" y2=\"120\" style=\"stroke-width:2\" />\n",
       "  <line x1=\"28\" y1=\"0\" x2=\"28\" y2=\"120\" style=\"stroke-width:2\" />\n",
       "\n",
       "  <!-- Colored Rectangle -->\n",
       "  <polygon points=\"0.0,0.0 28.845202251346922,0.0 28.845202251346922,120.0 0.0,120.0\" style=\"fill:#8B4903A0;stroke-width:0\"/>\n",
       "\n",
       "  <!-- Text -->\n",
       "  <text x=\"14.422601\" y=\"140.000000\" font-size=\"1.0rem\" font-weight=\"100\" text-anchor=\"middle\" >47059</text>\n",
       "  <text x=\"48.845202\" y=\"60.000000\" font-size=\"1.0rem\" font-weight=\"100\" text-anchor=\"middle\" transform=\"rotate(-90,48.845202,60.000000)\">2432024</text>\n",
       "</svg>\n",
       "        </td>\n",
       "    </tr>\n",
       "</table>"
      ],
      "text/plain": [
       "dask.array<lambda, shape=(2432024, 47059), dtype=float32, chunksize=(50000, 47059), chunktype=cupy.ndarray>"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%time\n",
    "dask_sparse_arr = dask_sparse_arr.map_blocks(lambda X: X.log1p(),dtype=cp.float32, meta=cp.array((0,),dtype=dask_sparse_arr.dtype))\n",
    "dask_sparse_arr = dask_sparse_arr.persist()\n",
    "dask_sparse_arr.compute_chunk_sizes()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "cf816ba6-55d2-45ae-aad9-5f3b904aae01",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import warnings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "39316898-d771-45a1-a675-e60279a96ab5",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "@with_cupy_rmm\n",
    "def get_mean_var_dask(client, csr_matrix):\n",
    "    '''\n",
    "    Implements sum operation for dask array when the backend is cupy sparse csr matrix\n",
    "    '''\n",
    "    from rapids_singlecell.preprocessing._kernels._mean_var_kernel import _get_mean_var_minor\n",
    "    get_mean_var_minor = _get_mean_var_minor(csr_matrix.dtype)\n",
    "    get_mean_var_minor.compile()\n",
    "\n",
    "    def __mean_var(X, minor, major):\n",
    "        mean = cp.zeros((minor,1), dtype=cp.float64)\n",
    "        var = cp.zeros((minor,1), dtype=cp.float64)\n",
    "        block = (32,)\n",
    "        grid = (int(math.ceil(X.nnz / block[0])),)\n",
    "        get_mean_var_minor(grid, block, (X.indices, X.data, mean, var, major, X.nnz))\n",
    "        return mean,var\n",
    "    major = csr_matrix.shape[0]\n",
    "    minor = csr_matrix.shape[1]\n",
    "    parts = client.sync(_extract_partitions, csr_matrix)\n",
    "    futures = [client.submit(__mean_var, part,minor, major, workers=[w]) for w, part in parts]\n",
    "    # Gather results from futures\n",
    "    results = client.gather(futures)\n",
    "\n",
    "    # Initialize lists to hold the Dask arrays\n",
    "    means_objs = []\n",
    "    var_objs = []\n",
    "\n",
    "    # Process each result\n",
    "    for means, vars in results:\n",
    "        # Append the arrays to their respective lists as Dask arrays\n",
    "        means_objs.append(dask.array.from_array(means, chunks=means.shape))\n",
    "        var_objs.append(dask.array.from_array(vars, chunks=vars.shape))\n",
    "    mean = dask.array.concatenate(means_objs,axis=1).compute().sum(axis=1).ravel()\n",
    "    var = dask.array.concatenate(var_objs,axis=1).compute().sum(axis=1).ravel()\n",
    "\n",
    "    var = (var - mean**2) * (major / (major - 1))\n",
    "    return mean, var\n",
    "\n",
    "def highly_variable_genes_filter(client,\n",
    "                                 data_mat,\n",
    "                                 n_top_genes=None):\n",
    "    \"\"\"\n",
    "    Build a boolean mask selecting the genes with the highest normalized dispersion.\n",
    "\n",
    "    Per-gene mean and variance are obtained from `get_mean_var_dask`, and the\n",
    "    dispersion is variance / mean. Genes are grouped into bins by mean (bin\n",
    "    edges at the 10th, 15th, ..., 100th percentile of the means), and every\n",
    "    dispersion is normalized by the median and `statsmodels.robust.mad` of its bin.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    client\n",
    "        Forwarded unchanged to `get_mean_var_dask`.\n",
    "    data_mat\n",
    "        Matrix forwarded to `get_mean_var_dask`; `data_mat.shape[1]` is taken\n",
    "        as the number of genes.\n",
    "    n_top_genes : int\n",
    "        Number of genes to select. Required despite the `None` default. It is\n",
    "        capped at the number of genes whose normalized dispersion is not NaN.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    numpy.ndarray of bool, one entry per gene\n",
    "        True where the normalized dispersion (NaN counted as 0) reaches the\n",
    "        cut-off, i.e. the `n_top_genes`-th largest non-NaN value.\n",
    "\n",
    "    Raises\n",
    "    ------\n",
    "    TypeError\n",
    "        If `n_top_genes` is None.\n",
    "    ValueError\n",
    "        If `n_top_genes` is smaller than 1.\n",
    "    \"\"\"\n",
    "    # Validate before the expensive distributed mean/variance pass.\n",
    "    if n_top_genes is None:\n",
    "        raise TypeError('n_top_genes must be an integer, got None')\n",
    "    if n_top_genes < 1:\n",
    "        raise ValueError('n_top_genes must be at least 1')\n",
    "\n",
    "    mean, variance = get_mean_var_dask(client, data_mat)\n",
    "    dispersion = variance / mean\n",
    "\n",
    "    df = pd.DataFrame()\n",
    "    df['genes'] = np.arange(data_mat.shape[1])\n",
    "    df['means'] = mean.tolist()\n",
    "    df['dispersions'] = dispersion.tolist()\n",
    "    df['mean_bin'] = pd.cut(\n",
    "        df['means'],\n",
    "        np.r_[-np.inf, np.percentile(df['means'], np.arange(10, 105, 5)), np.inf],\n",
    "    )\n",
    "\n",
    "    disp_grouped = df.groupby('mean_bin')['dispersions']\n",
    "    disp_median_bin = disp_grouped.median()\n",
    "\n",
    "    with warnings.catch_warnings():\n",
    "        from statsmodels import robust\n",
    "        warnings.simplefilter('ignore')\n",
    "        disp_mad_bin = disp_grouped.apply(robust.mad)\n",
    "        df['dispersions_norm'] = (\n",
    "            df['dispersions'].values - disp_median_bin[df['mean_bin'].values].values\n",
    "        ) / disp_mad_bin[df['mean_bin'].values].values\n",
    "\n",
    "    dispersion_norm = df['dispersions_norm'].values\n",
    "\n",
    "    # Rank only the genes with a defined normalized dispersion, largest first.\n",
    "    ranked = np.sort(dispersion_norm[~np.isnan(dispersion_norm)])[::-1]\n",
    "\n",
    "    if ranked.size == 0:\n",
    "        # Nothing can be ranked, so there is no cut-off to compare against.\n",
    "        warnings.warn('All normalized dispersions are NaN; no genes were selected.')\n",
    "        return np.zeros(df.shape[0], dtype=bool)\n",
    "\n",
    "    # NaN dispersions can leave fewer rankable genes than requested; cap the\n",
    "    # request so the cut-off lookup stays in bounds.\n",
    "    n_top_genes = min(n_top_genes, ranked.size)\n",
    "\n",
    "    disp_cut_off = ranked[n_top_genes - 1]\n",
    "    variable_genes = np.nan_to_num(dispersion_norm) >= disp_cut_off\n",
    "\n",
    "    return variable_genes\n",
    "    \n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "bb84f792-2b06-4bfa-8cca-3b4eaec042ea",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/sedi10/conda/envs/rapids-23.12/lib/python3.10/site-packages/distributed/client.py:3163: UserWarning: Sending large graph of size 17.97 MiB.\n",
      "This may cause some slowdown.\n",
      "Consider scattering data ahead of time and using futures.\n",
      "  warnings.warn(\n",
      "<timed exec>:2: PerformanceWarning: Slicing is producing a large chunk. To accept the large\n",
      "chunk and silence this warning, set the option\n",
      "    >>> with dask.config.set(**{'array.slicing.split_large_chunks': False}):\n",
      "    ...     array[indexer]\n",
      "\n",
      "To avoid creating the large chunks, set the option\n",
      "    >>> with dask.config.set(**{'array.slicing.split_large_chunks': True}):\n",
      "    ...     array[indexer]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 1.6 s, sys: 503 ms, total: 2.11 s\n",
      "Wall time: 10.3 s\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<table>\n",
       "    <tr>\n",
       "        <td>\n",
       "            <table style=\"border-collapse: collapse;\">\n",
       "                <thead>\n",
       "                    <tr>\n",
       "                        <td> </td>\n",
       "                        <th> Array </th>\n",
       "                        <th> Chunk </th>\n",
       "                    </tr>\n",
       "                </thead>\n",
       "                <tbody>\n",
       "                    \n",
       "                    <tr>\n",
       "                        <th> Bytes </th>\n",
       "                        <td> 45.30 GiB </td>\n",
       "                        <td> 0.93 GiB </td>\n",
       "                    </tr>\n",
       "                    \n",
       "                    <tr>\n",
       "                        <th> Shape </th>\n",
       "                        <td> (2432024, 5000) </td>\n",
       "                        <td> (50000, 5000) </td>\n",
       "                    </tr>\n",
       "                    <tr>\n",
       "                        <th> Dask graph </th>\n",
       "                        <td colspan=\"2\"> 50 chunks in 1 graph layer </td>\n",
       "                    </tr>\n",
       "                    <tr>\n",
       "                        <th> Data type </th>\n",
       "                        <td colspan=\"2\"> float32 cupy.ndarray </td>\n",
       "                    </tr>\n",
       "                </tbody>\n",
       "            </table>\n",
       "        </td>\n",
       "        <td>\n",
       "        <svg width=\"75\" height=\"170\" style=\"stroke:rgb(0,0,0);stroke-width:1\" >\n",
       "\n",
       "  <!-- Horizontal lines -->\n",
       "  <line x1=\"0\" y1=\"0\" x2=\"25\" y2=\"0\" style=\"stroke-width:2\" />\n",
       "  <line x1=\"0\" y1=\"4\" x2=\"25\" y2=\"4\" />\n",
       "  <line x1=\"0\" y1=\"12\" x2=\"25\" y2=\"12\" />\n",
       "  <line x1=\"0\" y1=\"17\" x2=\"25\" y2=\"17\" />\n",
       "  <line x1=\"0\" y1=\"24\" x2=\"25\" y2=\"24\" />\n",
       "  <line x1=\"0\" y1=\"31\" x2=\"25\" y2=\"31\" />\n",
       "  <line x1=\"0\" y1=\"36\" x2=\"25\" y2=\"36\" />\n",
       "  <line x1=\"0\" y1=\"43\" x2=\"25\" y2=\"43\" />\n",
       "  <line x1=\"0\" y1=\"51\" x2=\"25\" y2=\"51\" />\n",
       "  <line x1=\"0\" y1=\"55\" x2=\"25\" y2=\"55\" />\n",
       "  <line x1=\"0\" y1=\"63\" x2=\"25\" y2=\"63\" />\n",
       "  <line x1=\"0\" y1=\"68\" x2=\"25\" y2=\"68\" />\n",
       "  <line x1=\"0\" y1=\"75\" x2=\"25\" y2=\"75\" />\n",
       "  <line x1=\"0\" y1=\"82\" x2=\"25\" y2=\"82\" />\n",
       "  <line x1=\"0\" y1=\"87\" x2=\"25\" y2=\"87\" />\n",
       "  <line x1=\"0\" y1=\"94\" x2=\"25\" y2=\"94\" />\n",
       "  <line x1=\"0\" y1=\"101\" x2=\"25\" y2=\"101\" />\n",
       "  <line x1=\"0\" y1=\"106\" x2=\"25\" y2=\"106\" />\n",
       "  <line x1=\"0\" y1=\"113\" x2=\"25\" y2=\"113\" />\n",
       "  <line x1=\"0\" y1=\"120\" x2=\"25\" y2=\"120\" style=\"stroke-width:2\" />\n",
       "\n",
       "  <!-- Vertical lines -->\n",
       "  <line x1=\"0\" y1=\"0\" x2=\"0\" y2=\"120\" style=\"stroke-width:2\" />\n",
       "  <line x1=\"25\" y1=\"0\" x2=\"25\" y2=\"120\" style=\"stroke-width:2\" />\n",
       "\n",
       "  <!-- Colored Rectangle -->\n",
       "  <polygon points=\"0.0,0.0 25.412616514582485,0.0 25.412616514582485,120.0 0.0,120.0\" style=\"fill:#8B4903A0;stroke-width:0\"/>\n",
       "\n",
       "  <!-- Text -->\n",
       "  <text x=\"12.706308\" y=\"140.000000\" font-size=\"1.0rem\" font-weight=\"100\" text-anchor=\"middle\" >5000</text>\n",
       "  <text x=\"45.412617\" y=\"60.000000\" font-size=\"1.0rem\" font-weight=\"100\" text-anchor=\"middle\" transform=\"rotate(-90,45.412617,60.000000)\">2432024</text>\n",
       "</svg>\n",
       "        </td>\n",
       "    </tr>\n",
       "</table>"
      ],
      "text/plain": [
       "dask.array<getitem, shape=(2432024, 5000), dtype=float32, chunksize=(50000, 5000), chunktype=cupy.ndarray>"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%time\n",
    "# Request the 5000 most variable genes as a boolean column mask, keep only those\n",
    "# columns, and persist the filtered array before resolving its chunk sizes.\n",
    "genes = highly_variable_genes_filter(client, dask_sparse_arr, n_top_genes=5000)\n",
    "dask_sparse_arr = dask_sparse_arr[:,genes].persist()\n",
    "dask_sparse_arr.compute_chunk_sizes()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "75761f4c-c178-49b8-a716-aefc5f31b276",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 1.18 s, sys: 1.07 s, total: 2.26 s\n",
      "Wall time: 28.2 s\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<table>\n",
       "    <tr>\n",
       "        <td>\n",
       "            <table style=\"border-collapse: collapse;\">\n",
       "                <thead>\n",
       "                    <tr>\n",
       "                        <td> </td>\n",
       "                        <th> Array </th>\n",
       "                        <th> Chunk </th>\n",
       "                    </tr>\n",
       "                </thead>\n",
       "                <tbody>\n",
       "                    \n",
       "                    <tr>\n",
       "                        <th> Bytes </th>\n",
       "                        <td> 90.60 GiB </td>\n",
       "                        <td> 11.32 GiB </td>\n",
       "                    </tr>\n",
       "                    \n",
       "                    <tr>\n",
       "                        <th> Shape </th>\n",
       "                        <td> (2432024, 5000) </td>\n",
       "                        <td> (304003, 5000) </td>\n",
       "                    </tr>\n",
       "                    <tr>\n",
       "                        <th> Dask graph </th>\n",
       "                        <td colspan=\"2\"> 8 chunks in 1 graph layer </td>\n",
       "                    </tr>\n",
       "                    <tr>\n",
       "                        <th> Data type </th>\n",
       "                        <td colspan=\"2\"> float64 cupy.ndarray </td>\n",
       "                    </tr>\n",
       "                </tbody>\n",
       "            </table>\n",
       "        </td>\n",
       "        <td>\n",
       "        <svg width=\"75\" height=\"170\" style=\"stroke:rgb(0,0,0);stroke-width:1\" >\n",
       "\n",
       "  <!-- Horizontal lines -->\n",
       "  <line x1=\"0\" y1=\"0\" x2=\"25\" y2=\"0\" style=\"stroke-width:2\" />\n",
       "  <line x1=\"0\" y1=\"15\" x2=\"25\" y2=\"15\" />\n",
       "  <line x1=\"0\" y1=\"30\" x2=\"25\" y2=\"30\" />\n",
       "  <line x1=\"0\" y1=\"45\" x2=\"25\" y2=\"45\" />\n",
       "  <line x1=\"0\" y1=\"60\" x2=\"25\" y2=\"60\" />\n",
       "  <line x1=\"0\" y1=\"75\" x2=\"25\" y2=\"75\" />\n",
       "  <line x1=\"0\" y1=\"90\" x2=\"25\" y2=\"90\" />\n",
       "  <line x1=\"0\" y1=\"105\" x2=\"25\" y2=\"105\" />\n",
       "  <line x1=\"0\" y1=\"120\" x2=\"25\" y2=\"120\" style=\"stroke-width:2\" />\n",
       "\n",
       "  <!-- Vertical lines -->\n",
       "  <line x1=\"0\" y1=\"0\" x2=\"0\" y2=\"120\" style=\"stroke-width:2\" />\n",
       "  <line x1=\"25\" y1=\"0\" x2=\"25\" y2=\"120\" style=\"stroke-width:2\" />\n",
       "\n",
       "  <!-- Colored Rectangle -->\n",
       "  <polygon points=\"0.0,0.0 25.412616514582485,0.0 25.412616514582485,120.0 0.0,120.0\" style=\"fill:#ECB172A0;stroke-width:0\"/>\n",
       "\n",
       "  <!-- Text -->\n",
       "  <text x=\"12.706308\" y=\"140.000000\" font-size=\"1.0rem\" font-weight=\"100\" text-anchor=\"middle\" >5000</text>\n",
       "  <text x=\"45.412617\" y=\"60.000000\" font-size=\"1.0rem\" font-weight=\"100\" text-anchor=\"middle\" transform=\"rotate(-90,45.412617,60.000000)\">2432024</text>\n",
       "</svg>\n",
       "        </td>\n",
       "    </tr>\n",
       "</table>"
      ],
      "text/plain": [
       "dask.array<rechunk-merge, shape=(2432024, 5000), dtype=float64, chunksize=(304003, 5000), chunktype=cupy.ndarray>"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%time\n",
    "# Densify every sparse block and cast it to float64.\n",
    "dense_array = dask_sparse_arr.map_blocks(lambda x: x.todense().astype(cp.float64), dtype=cp.float64,meta=cp.array((0,),dtype=cp.float64))\n",
    "\n",
    "n_rows = dense_array.shape[0]\n",
    "n_cols = dense_array.shape[1]\n",
    "# NOTE(review): despite its name, cols_per_worker is a row count (n_rows / 8);\n",
    "# the 8 presumably matches the number of GPU workers - confirm.\n",
    "cols_per_worker = int(n_rows / 8)\n",
    "# Rechunk into row blocks that span all columns, then persist the result.\n",
    "dense_array = dense_array.rechunk((cols_per_worker, n_cols)).persist()\n",
    "\n",
    "dense_array.compute_chunk_sizes()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "ae91f9cd-bbaf-42f9-a75b-d2a363f457a2",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 1.17 s, sys: 1.12 s, total: 2.29 s\n",
      "Wall time: 36.4 s\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<table>\n",
       "    <tr>\n",
       "        <td>\n",
       "            <table style=\"border-collapse: collapse;\">\n",
       "                <thead>\n",
       "                    <tr>\n",
       "                        <td> </td>\n",
       "                        <th> Array </th>\n",
       "                        <th> Chunk </th>\n",
       "                    </tr>\n",
       "                </thead>\n",
       "                <tbody>\n",
       "                    \n",
       "                    <tr>\n",
       "                        <th> Bytes </th>\n",
       "                        <td> 1.81 GiB </td>\n",
       "                        <td> 231.94 MiB </td>\n",
       "                    </tr>\n",
       "                    \n",
       "                    <tr>\n",
       "                        <th> Shape </th>\n",
       "                        <td> (2432024, 100) </td>\n",
       "                        <td> (304003, 100) </td>\n",
       "                    </tr>\n",
       "                    <tr>\n",
       "                        <th> Dask graph </th>\n",
       "                        <td colspan=\"2\"> 8 chunks in 19 graph layers </td>\n",
       "                    </tr>\n",
       "                    <tr>\n",
       "                        <th> Data type </th>\n",
       "                        <td colspan=\"2\"> float64 cupy.ndarray </td>\n",
       "                    </tr>\n",
       "                </tbody>\n",
       "            </table>\n",
       "        </td>\n",
       "        <td>\n",
       "        <svg width=\"75\" height=\"170\" style=\"stroke:rgb(0,0,0);stroke-width:1\" >\n",
       "\n",
       "  <!-- Horizontal lines -->\n",
       "  <line x1=\"0\" y1=\"0\" x2=\"25\" y2=\"0\" style=\"stroke-width:2\" />\n",
       "  <line x1=\"0\" y1=\"15\" x2=\"25\" y2=\"15\" />\n",
       "  <line x1=\"0\" y1=\"30\" x2=\"25\" y2=\"30\" />\n",
       "  <line x1=\"0\" y1=\"45\" x2=\"25\" y2=\"45\" />\n",
       "  <line x1=\"0\" y1=\"60\" x2=\"25\" y2=\"60\" />\n",
       "  <line x1=\"0\" y1=\"75\" x2=\"25\" y2=\"75\" />\n",
       "  <line x1=\"0\" y1=\"90\" x2=\"25\" y2=\"90\" />\n",
       "  <line x1=\"0\" y1=\"105\" x2=\"25\" y2=\"105\" />\n",
       "  <line x1=\"0\" y1=\"120\" x2=\"25\" y2=\"120\" style=\"stroke-width:2\" />\n",
       "\n",
       "  <!-- Vertical lines -->\n",
       "  <line x1=\"0\" y1=\"0\" x2=\"0\" y2=\"120\" style=\"stroke-width:2\" />\n",
       "  <line x1=\"25\" y1=\"0\" x2=\"25\" y2=\"120\" style=\"stroke-width:2\" />\n",
       "\n",
       "  <!-- Colored Rectangle -->\n",
       "  <polygon points=\"0.0,0.0 25.41261651458249,0.0 25.41261651458249,120.0 0.0,120.0\" style=\"fill:#ECB172A0;stroke-width:0\"/>\n",
       "\n",
       "  <!-- Text -->\n",
       "  <text x=\"12.706308\" y=\"140.000000\" font-size=\"1.0rem\" font-weight=\"100\" text-anchor=\"middle\" >100</text>\n",
       "  <text x=\"45.412617\" y=\"60.000000\" font-size=\"1.0rem\" font-weight=\"100\" text-anchor=\"middle\" transform=\"rotate(-90,45.412617,60.000000)\">2432024</text>\n",
       "</svg>\n",
       "        </td>\n",
       "    </tr>\n",
       "</table>"
      ],
      "text/plain": [
       "dask.array<concatenate, shape=(2432024, 100), dtype=float64, chunksize=(304003, 100), chunktype=cupy.ndarray>"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%time\n",
    "# Fit cuML's dask PCA on the dense array and project it onto 100 components.\n",
    "from cuml.dask.decomposition import PCA\n",
    "pca_func = PCA(n_components=100)\n",
    "pca_data_d = pca_func.fit_transform(dense_array)\n",
    "pca_data_d.compute_chunk_sizes()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "b8ff7564-0b43-40e8-b920-d22ef9943729",
   "metadata": {},
   "outputs": [],
   "source": [
    "import time"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9f0ece51-4022-4ffc-9b38-ff5ba2480f2a",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "776d53f2-8a77-4f0e-addb-444068718821",
   "metadata": {},
   "outputs": [],
   "source": [
    "from cupyx import cusparse\n",
    "class PCA_sparse_dask:\n",
    "    \"\"\"\n",
    "    PCA for a row-chunked dask array whose blocks are sparse CSR matrices.\n",
    "\n",
    "    `fit` obtains the covariance matrix from `_cov_sparse_dask` and\n",
    "    eigendecomposes it with `cp.linalg.eigh`; `transform` and\n",
    "    `inverse_transform` are applied block by block through `map_blocks`.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    n_components : int or None\n",
    "        Number of components to keep. None keeps min(n_rows, n_cols).\n",
    "    client\n",
    "        Handed to `_cov_sparse_dask` during `fit`.\n",
    "    whiten : bool\n",
    "        When True, `transform` scales the components by\n",
    "        sqrt(n_samples - 1) / singular_values and `inverse_transform`\n",
    "        applies the inverse scaling.\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self, n_components, client, whiten = False) -> None:\n",
    "        self.n_components = n_components\n",
    "        self.client  = client\n",
    "        self.whiten = whiten\n",
    "\n",
    "    def fit(self, x):\n",
    "        \"\"\"\n",
    "        Estimate the principal components of `x`.\n",
    "\n",
    "        Sets `n_components_`, `n_samples_`, `n_features_in_`, `dtype`,\n",
    "        `mean_`, `components_`, `explained_variance_`,\n",
    "        `explained_variance_ratio_`, `noise_variance_` and\n",
    "        `singular_values_`, then returns `self`.\n",
    "        \"\"\"\n",
    "        if self.n_components is None:\n",
    "            self.n_components_ = min(x.shape[0], x.shape[1])\n",
    "        else:\n",
    "            self.n_components_ = self.n_components\n",
    "\n",
    "        self.n_samples_ = x.shape[0]\n",
    "        self.n_features_in_ = x.shape[1] if x.ndim == 2 else 1\n",
    "        self.dtype = x.dtype\n",
    "        covariance, self.mean_, _ = _cov_sparse_dask(self.client, x=x, return_mean=True)\n",
    "        eigenvalues, eigenvectors = cp.linalg.eigh(covariance, UPLO=\"U\")\n",
    "\n",
    "        # eigh returns eigenpairs in ascending order; reverse both so the\n",
    "        # largest-variance component comes first.\n",
    "        eigenvalues = eigenvalues[::-1]\n",
    "        eigenvectors = cp.flip(eigenvectors, axis=1)\n",
    "\n",
    "        # One component per row, truncated to the requested number.\n",
    "        self.components_ = eigenvectors.T[: self.n_components_, :]\n",
    "\n",
    "        # The ratio is taken against the full spectrum before truncation.\n",
    "        variance_ratio = eigenvalues / cp.sum(eigenvalues)\n",
    "\n",
    "        if self.n_components_ < min(self.n_samples_, self.n_features_in_):\n",
    "            self.noise_variance_ = eigenvalues[self.n_components_:].mean()\n",
    "        else:\n",
    "            self.noise_variance_ = cp.array([0.0])\n",
    "\n",
    "        self.explained_variance_ = eigenvalues[: self.n_components_]\n",
    "        self.explained_variance_ratio_ = variance_ratio[: self.n_components_]\n",
    "\n",
    "        # Truncating negative explained variance values to 0\n",
    "        non_negative_variance = cp.where(\n",
    "            self.explained_variance_ < 0, 0, self.explained_variance_\n",
    "        )\n",
    "        self.singular_values_ = cp.sqrt(\n",
    "            non_negative_variance * (self.n_samples_ - 1)\n",
    "        )\n",
    "        return self\n",
    "\n",
    "    def transform(self, X):\n",
    "        \"\"\"\n",
    "        Project the sparse blocks of `X` onto the fitted components.\n",
    "\n",
    "        Each block is densified, centered with `mean_` and multiplied by the\n",
    "        transposed components. Returns the persisted dask array.\n",
    "        \"\"\"\n",
    "        components = self.components_\n",
    "        if self.whiten:\n",
    "            # Build a scaled copy instead of scaling self.components_ in place:\n",
    "            # map_blocks is lazy and keeps a reference to the array it is given,\n",
    "            # so an in-place scale that is reverted before persist() would not\n",
    "            # reach the workers.\n",
    "            components = components * cp.sqrt(self.n_samples_ - 1)\n",
    "            components = components / self.singular_values_.reshape((-1, 1))\n",
    "\n",
    "        def _transform(X_part, mean_, components_):\n",
    "            dense = cusparse.csr2dense(X_part)\n",
    "            dense = dense - mean_\n",
    "            return dense.dot(components_.T)\n",
    "\n",
    "        # The output keeps the row chunking of X but has one column per\n",
    "        # component, so the chunk structure is stated explicitly.\n",
    "        X_pca = X.map_blocks(_transform,\n",
    "                             mean_=self.mean_,\n",
    "                             components_=components,\n",
    "                             chunks=(X.chunks[0], components.shape[0]),\n",
    "                             dtype=X.dtype,\n",
    "                             meta=cp.array((0,),dtype=X.dtype))\n",
    "        return X_pca.persist()\n",
    "\n",
    "    def fit_transform(self, X, y=None):\n",
    "        \"\"\"Fit on `X` and return its projection; `y` is ignored.\"\"\"\n",
    "        return self.fit(X).transform(X)\n",
    "\n",
    "    def inverse_transform(self, X, return_sparse=False,\n",
    "                                  sparse_tol=1e-10):\n",
    "        \"\"\"\n",
    "        Map projected blocks back to feature space.\n",
    "\n",
    "        Each block is multiplied by the components and `mean_` is added.\n",
    "        With `return_sparse=True`, entries below `sparse_tol` are set to 0 and\n",
    "        each block is converted to a `cupyx.scipy.sparse.csr_matrix`.\n",
    "        Returns the persisted dask array.\n",
    "        \"\"\"\n",
    "        components = self.components_\n",
    "        if self.whiten:\n",
    "            # Scaled copy; self.components_ is left untouched.\n",
    "            components = components * (1 / cp.sqrt(self.n_samples_ - 1))\n",
    "            components = components * self.singular_values_.reshape((-1, 1))\n",
    "\n",
    "        def _inv_transform(X_part, mean_, components_):\n",
    "            # Use only the arguments so the task does not close over `self`.\n",
    "            X_inv = cp.dot(X_part, components_)\n",
    "            cp.add(X_inv, mean_, out=X_inv)\n",
    "            return X_inv\n",
    "\n",
    "        X_inv = X.map_blocks(_inv_transform,\n",
    "                             mean_=self.mean_,\n",
    "                             components_=components,\n",
    "                             chunks=(X.chunks[0], components.shape[1]),\n",
    "                             dtype=X.dtype,\n",
    "                             meta=cp.array((0,),dtype=X.dtype))\n",
    "\n",
    "        if return_sparse:\n",
    "            def _ret_sparse(X_part, sparse_tol):\n",
    "                # Imported here so the task does not depend on a module-level\n",
    "                # `cupyx` name being bound.\n",
    "                from cupyx.scipy.sparse import csr_matrix\n",
    "                X_part = cp.where(X_part < sparse_tol, 0, X_part)\n",
    "                return csr_matrix(X_part)\n",
    "\n",
    "            X_inv = X_inv.map_blocks(_ret_sparse, sparse_tol=sparse_tol, dtype=X_inv.dtype)\n",
    "\n",
    "        return X_inv.persist()\n",
    "\n",
    "\n",
    "@with_cupy_rmm\n",
    "def _cov_sparse_dask(client, x, return_gram=False, return_mean=False):\n",
    "    \"\"\"\n",
    "    Computes the mean and the covariance of matrix X of\n",
    "    the form Cov(X, X) = E(XX) - E(X)E(X)\n",
    "\n",
    "    This is a temporary fix for\n",
    "    cuml issue #5475 and cupy issue #7699,\n",
    "    where the operation `x.T.dot(x)` did not work for\n",
    "    larger sparse matrices.\n",
    "\n",
    "    One partial Gram matrix is computed per partition of `x` on the worker\n",
    "    that holds it; the partials are summed on the client, divided by the\n",
    "    number of rows and combined with the column means from\n",
    "    `get_mean_var_dask`.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "\n",
    "    client\n",
    "        Used for `client.sync` and `client.submit`, and forwarded to\n",
    "        `get_mean_var_dask`.\n",
    "    x : dask array of size (m, n)\n",
    "        Its partitions must expose `indptr`, `indices` and `data`\n",
    "        (CSR layout).\n",
    "    return_gram : boolean (default = False)\n",
    "        If True, the gram matrix is returned as well and the covariance\n",
    "        is written to a separate array.\n",
    "        If False, the gram matrix buffer is reused for the covariance.\n",
    "    return_mean: boolean (default = False)\n",
    "        If True, the column means of X are returned twice, keeping the\n",
    "        two-input (x, y) return shape.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "\n",
    "    result : cov(X, X) when return_gram and return_mean are False\n",
    "            cov(X, X), gram(X, X) when return_gram is True,\n",
    "            return_mean is False\n",
    "            cov(X, X), mean(X), mean(X) when return_gram is False,\n",
    "            return_mean is True\n",
    "            cov(X, X), gram(X, X), mean(X), mean(X)\n",
    "            when return_gram is True and return_mean is True\n",
    "    \"\"\"\n",
    "\n",
    "    from rapids_singlecell.preprocessing._kernels._pca_sparse_kernel import (\n",
    "        _copy_kernel,\n",
    "        _cov_kernel,\n",
    "        _gramm_kernel_csr,\n",
    "    )\n",
    "\n",
    "    # Read these once so the worker function below closes over plain values\n",
    "    # rather than over the dask array itself.\n",
    "    n_rows = x.shape[0]\n",
    "    n_cols = x.shape[1]\n",
    "    dtype = x.dtype\n",
    "\n",
    "    compute_mean_cov = _gramm_kernel_csr(dtype)\n",
    "    compute_mean_cov.compile()\n",
    "\n",
    "    def __gram_block(x_part, n_cols):\n",
    "        # Accumulate this partition's contribution into a fresh n_cols x n_cols\n",
    "        # buffer; the kernel is launched with one block per row of the partition.\n",
    "        gram_matrix = cp.zeros((n_cols, n_cols), dtype=dtype)\n",
    "\n",
    "        block = (128,)\n",
    "        grid = (x_part.shape[0],)\n",
    "        compute_mean_cov(\n",
    "            grid,\n",
    "            block,\n",
    "            (\n",
    "                x_part.indptr,\n",
    "                x_part.indices,\n",
    "                x_part.data,\n",
    "                x_part.shape[0],\n",
    "                n_cols,\n",
    "                gram_matrix,\n",
    "            ),\n",
    "        )\n",
    "        return gram_matrix\n",
    "\n",
    "    parts = client.sync(_extract_partitions, x)\n",
    "    futures = [client.submit(__gram_block, part, n_cols, workers=[w]) for w, part in parts]\n",
    "\n",
    "    # Wrap every partial result as a dask array and sum them into one matrix.\n",
    "    objs = [\n",
    "        dask.array.from_delayed(future, shape=(n_cols, n_cols), dtype=dtype)\n",
    "        for future in futures\n",
    "    ]\n",
    "    gram_matrix = dask.array.stack(objs).sum(axis=0).compute()\n",
    "\n",
    "    mean_x, _ = get_mean_var_dask(client, x)\n",
    "    mean_x = mean_x.astype(dtype)\n",
    "\n",
    "    # NOTE(review): presumably mirrors one triangle of the gram matrix onto\n",
    "    # the other - confirm against _copy_kernel.\n",
    "    copy_gram = _copy_kernel(dtype)\n",
    "    block = (32, 32)\n",
    "    grid = (math.ceil(n_cols / block[0]), math.ceil(n_cols / block[1]))\n",
    "    copy_gram(\n",
    "        grid,\n",
    "        block,\n",
    "        (gram_matrix, n_cols),\n",
    "    )\n",
    "\n",
    "    gram_matrix *= 1 / n_rows\n",
    "\n",
    "    if return_gram:\n",
    "        cov_result = cp.zeros(\n",
    "            (gram_matrix.shape[0], gram_matrix.shape[0]),\n",
    "            dtype=gram_matrix.dtype,\n",
    "        )\n",
    "    else:\n",
    "        # The gram matrix is not needed afterwards, so reuse its buffer.\n",
    "        cov_result = gram_matrix\n",
    "\n",
    "    compute_cov = _cov_kernel(gram_matrix.dtype)\n",
    "\n",
    "    # NOTE(review): the grid is sized for 8-wide tiles while the block is\n",
    "    # 32 x 32, so more threads are launched than there are entries; presumably\n",
    "    # harmless if the kernel bounds-checks - confirm.\n",
    "    block_size = (32, 32)\n",
    "    grid_size = (math.ceil(gram_matrix.shape[0] / 8),) * 2\n",
    "    compute_cov(\n",
    "        grid_size,\n",
    "        block_size,\n",
    "        (cov_result, gram_matrix, mean_x, mean_x, gram_matrix.shape[0]),\n",
    "    )\n",
    "\n",
    "    if return_gram and return_mean:\n",
    "        return cov_result, gram_matrix, mean_x, mean_x\n",
    "    if return_gram:\n",
    "        return cov_result, gram_matrix\n",
    "    if return_mean:\n",
    "        return cov_result, mean_x, mean_x\n",
    "    return cov_result\n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "c5065da6-356d-406d-ba6f-991b42f8dca5",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<table>\n",
       "    <tr>\n",
       "        <td>\n",
       "            <table style=\"border-collapse: collapse;\">\n",
       "                <thead>\n",
       "                    <tr>\n",
       "                        <td> </td>\n",
       "                        <th> Array </th>\n",
       "                        <th> Chunk </th>\n",
       "                    </tr>\n",
       "                </thead>\n",
       "                <tbody>\n",
       "                    \n",
       "                    <tr>\n",
       "                        <th> Bytes </th>\n",
       "                        <td> 90.60 GiB </td>\n",
       "                        <td> 5.66 GiB </td>\n",
       "                    </tr>\n",
       "                    \n",
       "                    <tr>\n",
       "                        <th> Shape </th>\n",
       "                        <td> (2432024, 5000) </td>\n",
       "                        <td> (152001, 5000) </td>\n",
       "                    </tr>\n",
       "                    <tr>\n",
       "                        <th> Dask graph </th>\n",
       "                        <td colspan=\"2\"> 17 chunks in 1 graph layer </td>\n",
       "                    </tr>\n",
       "                    <tr>\n",
       "                        <th> Data type </th>\n",
       "                        <td colspan=\"2\"> float64 cupy.ndarray </td>\n",
       "                    </tr>\n",
       "                </tbody>\n",
       "            </table>\n",
       "        </td>\n",
       "        <td>\n",
       "        <svg width=\"75\" height=\"170\" style=\"stroke:rgb(0,0,0);stroke-width:1\" >\n",
       "\n",
       "  <!-- Horizontal lines -->\n",
       "  <line x1=\"0\" y1=\"0\" x2=\"25\" y2=\"0\" style=\"stroke-width:2\" />\n",
       "  <line x1=\"0\" y1=\"7\" x2=\"25\" y2=\"7\" />\n",
       "  <line x1=\"0\" y1=\"14\" x2=\"25\" y2=\"14\" />\n",
       "  <line x1=\"0\" y1=\"22\" x2=\"25\" y2=\"22\" />\n",
       "  <line x1=\"0\" y1=\"29\" x2=\"25\" y2=\"29\" />\n",
       "  <line x1=\"0\" y1=\"37\" x2=\"25\" y2=\"37\" />\n",
       "  <line x1=\"0\" y1=\"44\" x2=\"25\" y2=\"44\" />\n",
       "  <line x1=\"0\" y1=\"52\" x2=\"25\" y2=\"52\" />\n",
       "  <line x1=\"0\" y1=\"59\" x2=\"25\" y2=\"59\" />\n",
       "  <line x1=\"0\" y1=\"67\" x2=\"25\" y2=\"67\" />\n",
       "  <line x1=\"0\" y1=\"74\" x2=\"25\" y2=\"74\" />\n",
       "  <line x1=\"0\" y1=\"82\" x2=\"25\" y2=\"82\" />\n",
       "  <line x1=\"0\" y1=\"89\" x2=\"25\" y2=\"89\" />\n",
       "  <line x1=\"0\" y1=\"97\" x2=\"25\" y2=\"97\" />\n",
       "  <line x1=\"0\" y1=\"104\" x2=\"25\" y2=\"104\" />\n",
       "  <line x1=\"0\" y1=\"112\" x2=\"25\" y2=\"112\" />\n",
       "  <line x1=\"0\" y1=\"119\" x2=\"25\" y2=\"119\" />\n",
       "  <line x1=\"0\" y1=\"120\" x2=\"25\" y2=\"120\" style=\"stroke-width:2\" />\n",
       "\n",
       "  <!-- Vertical lines -->\n",
       "  <line x1=\"0\" y1=\"0\" x2=\"0\" y2=\"120\" style=\"stroke-width:2\" />\n",
       "  <line x1=\"25\" y1=\"0\" x2=\"25\" y2=\"120\" style=\"stroke-width:2\" />\n",
       "\n",
       "  <!-- Colored Rectangle -->\n",
       "  <polygon points=\"0.0,0.0 25.412616514582485,0.0 25.412616514582485,120.0 0.0,120.0\" style=\"fill:#ECB172A0;stroke-width:0\"/>\n",
       "\n",
       "  <!-- Text -->\n",
       "  <text x=\"12.706308\" y=\"140.000000\" font-size=\"1.0rem\" font-weight=\"100\" text-anchor=\"middle\" >5000</text>\n",
       "  <text x=\"45.412617\" y=\"60.000000\" font-size=\"1.0rem\" font-weight=\"100\" text-anchor=\"middle\" transform=\"rotate(-90,45.412617,60.000000)\">2432024</text>\n",
       "</svg>\n",
       "        </td>\n",
       "    </tr>\n",
       "</table>"
      ],
      "text/plain": [
       "dask.array<rechunk-merge, shape=(2432024, 5000), dtype=float64, chunksize=(152001, 5000), chunktype=cupy.ndarray>"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Cast the sparse array to float64 and rechunk it into row blocks that span\n",
    "# all columns.\n",
    "dask_sparse_arr = dask_sparse_arr.astype(cp.float64)\n",
    "n_rows = dask_sparse_arr.shape[0]\n",
    "n_cols = dask_sparse_arr.shape[1]\n",
    "# NOTE(review): despite its name, cols_per_worker is a row count. int(n_rows / 16)\n",
    "# rounds down, so when n_rows is not a multiple of 16 the leftover rows form an\n",
    "# extra small chunk (17 chunks in the recorded output) - confirm this is intended.\n",
    "cols_per_worker = int(n_rows / 16)\n",
    "dask_sparse_arr = dask_sparse_arr.rechunk((cols_per_worker, n_cols)).persist()\n",
    "\n",
    "dask_sparse_arr.compute_chunk_sizes()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "b525a19c-6ed6-4e2c-a249-c4fc536862bb",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 1.11 s, sys: 608 ms, total: 1.72 s\n",
      "Wall time: 10.7 s\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<table>\n",
       "    <tr>\n",
       "        <td>\n",
       "            <table style=\"border-collapse: collapse;\">\n",
       "                <thead>\n",
       "                    <tr>\n",
       "                        <td> </td>\n",
       "                        <th> Array </th>\n",
       "                        <th> Chunk </th>\n",
       "                    </tr>\n",
       "                </thead>\n",
       "                <tbody>\n",
       "                    \n",
       "                    <tr>\n",
       "                        <th> Bytes </th>\n",
       "                        <td> 1.81 GiB </td>\n",
       "                        <td> 115.97 MiB </td>\n",
       "                    </tr>\n",
       "                    \n",
       "                    <tr>\n",
       "                        <th> Shape </th>\n",
       "                        <td> (2432024, 100) </td>\n",
       "                        <td> (152001, 100) </td>\n",
       "                    </tr>\n",
       "                    <tr>\n",
       "                        <th> Dask graph </th>\n",
       "                        <td colspan=\"2\"> 17 chunks in 1 graph layer </td>\n",
       "                    </tr>\n",
       "                    <tr>\n",
       "                        <th> Data type </th>\n",
       "                        <td colspan=\"2\"> float64 cupy.ndarray </td>\n",
       "                    </tr>\n",
       "                </tbody>\n",
       "            </table>\n",
       "        </td>\n",
       "        <td>\n",
       "        <svg width=\"75\" height=\"170\" style=\"stroke:rgb(0,0,0);stroke-width:1\" >\n",
       "\n",
       "  <!-- Horizontal lines -->\n",
       "  <line x1=\"0\" y1=\"0\" x2=\"25\" y2=\"0\" style=\"stroke-width:2\" />\n",
       "  <line x1=\"0\" y1=\"7\" x2=\"25\" y2=\"7\" />\n",
       "  <line x1=\"0\" y1=\"14\" x2=\"25\" y2=\"14\" />\n",
       "  <line x1=\"0\" y1=\"22\" x2=\"25\" y2=\"22\" />\n",
       "  <line x1=\"0\" y1=\"29\" x2=\"25\" y2=\"29\" />\n",
       "  <line x1=\"0\" y1=\"37\" x2=\"25\" y2=\"37\" />\n",
       "  <line x1=\"0\" y1=\"44\" x2=\"25\" y2=\"44\" />\n",
       "  <line x1=\"0\" y1=\"52\" x2=\"25\" y2=\"52\" />\n",
       "  <line x1=\"0\" y1=\"59\" x2=\"25\" y2=\"59\" />\n",
       "  <line x1=\"0\" y1=\"67\" x2=\"25\" y2=\"67\" />\n",
       "  <line x1=\"0\" y1=\"74\" x2=\"25\" y2=\"74\" />\n",
       "  <line x1=\"0\" y1=\"82\" x2=\"25\" y2=\"82\" />\n",
       "  <line x1=\"0\" y1=\"89\" x2=\"25\" y2=\"89\" />\n",
       "  <line x1=\"0\" y1=\"97\" x2=\"25\" y2=\"97\" />\n",
       "  <line x1=\"0\" y1=\"104\" x2=\"25\" y2=\"104\" />\n",
       "  <line x1=\"0\" y1=\"112\" x2=\"25\" y2=\"112\" />\n",
       "  <line x1=\"0\" y1=\"119\" x2=\"25\" y2=\"119\" />\n",
       "  <line x1=\"0\" y1=\"120\" x2=\"25\" y2=\"120\" style=\"stroke-width:2\" />\n",
       "\n",
       "  <!-- Vertical lines -->\n",
       "  <line x1=\"0\" y1=\"0\" x2=\"0\" y2=\"120\" style=\"stroke-width:2\" />\n",
       "  <line x1=\"25\" y1=\"0\" x2=\"25\" y2=\"120\" style=\"stroke-width:2\" />\n",
       "\n",
       "  <!-- Colored Rectangle -->\n",
       "  <polygon points=\"0.0,0.0 25.41261651458249,0.0 25.41261651458249,120.0 0.0,120.0\" style=\"fill:#ECB172A0;stroke-width:0\"/>\n",
       "\n",
       "  <!-- Text -->\n",
       "  <text x=\"12.706308\" y=\"140.000000\" font-size=\"1.0rem\" font-weight=\"100\" text-anchor=\"middle\" >100</text>\n",
       "  <text x=\"45.412617\" y=\"60.000000\" font-size=\"1.0rem\" font-weight=\"100\" text-anchor=\"middle\" transform=\"rotate(-90,45.412617,60.000000)\">2432024</text>\n",
       "</svg>\n",
       "        </td>\n",
       "    </tr>\n",
       "</table>"
      ],
      "text/plain": [
       "dask.array<_transform, shape=(2432024, 100), dtype=float64, chunksize=(152001, 100), chunktype=cupy.ndarray>"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%time\n",
    "dask_pca =PCA_sparse_dask(n_components=100, client=client, whiten=False)\n",
    "dask_pca.fit(dask_sparse_arr)\n",
    "dask_pca_data = dask_pca.transform(dask_sparse_arr)\n",
    "dask_pca_data.compute_chunk_sizes()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "2e9c53db-f805-4d9a-8db9-94d8e188059a",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 1.11 s, sys: 1.47 s, total: 2.58 s\n",
      "Wall time: 2.75 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "pca = dask_pca_data.compute()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "4c67de58-c3ac-4138-9c9a-5218acfd6a76",
   "metadata": {},
   "outputs": [],
   "source": [
    "dask_sparse_arr_np = dask_sparse_arr.map_blocks(lambda x: x.get(), dtype=cp.float64,meta = np.array((0,)))\n",
    "sparse_scipy_matrix = dask_sparse_arr_np.compute()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "6f7b0471-5fd7-4d08-b372-182d6bc35b72",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<2432024x5000 sparse matrix of type '<class 'numpy.float64'>'\n",
       "\twith 1608641402 stored elements in Compressed Sparse Row format>"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sparse_scipy_matrix"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d09450b7-8773-401a-ba36-782de9d320c1",
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "adata = anndata.AnnData(sparse_scipy_matrix)\n",
    "sc.pp.pca(adata, n_comps=100)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "daf960e9-0cc2-475f-8a6b-8c12bfc852f6",
   "metadata": {},
   "outputs": [],
   "source": [
    "cp.testing.assert_allclose(cp.abs(pca),np.abs(adata.obsm[\"X_pca\"]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e3a9c399-e712-4c1e-ab3b-728c4679ab4b",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "rapids-23.12",
   "language": "python",
   "name": "rapids-23.12"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}