Created
January 24, 2023 09:39
-
-
Save giovp/f0ba33c79b9bcadaa07cfef11610f00b to your computer and use it in GitHub Desktop.
xarray v. pandas table to represent points
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"id": "538d3b44-0110-4ce6-bb8c-d758ca5437d3", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import anndata as ad\n", | |
"from pathlib import Path\n", | |
"import numpy as np\n", | |
"import pandas as pd\n", | |
"import zarr\n", | |
"import os\n", | |
"import time\n", | |
"import xarray as xr\n", | |
"from dask.array import from_array\n", | |
"from pandas import read_parquet\n", | |
"\n", | |
"# sc.set_figure_params()\n", | |
"%load_ext autoreload\n", | |
"%autoreload 2\n", | |
"%load_ext lab_black" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"id": "8e84ba60-8392-4b60-b190-21777b26f57f", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"path = Path(\n", | |
" \"/Users/giovanni.palla/Projects/spatialdata-sandbox/nanostring_cosmx/data/data_lung5_rep2\"\n", | |
")\n", | |
"table = pd.read_csv(path / \"Lung5_Rep2_tx_file.csv\", nrows=10_000_000)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"id": "04490b60-2938-4187-8e77-607c67bc36b1", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>fov</th>\n", | |
" <th>cell_ID</th>\n", | |
" <th>x_global_px</th>\n", | |
" <th>y_global_px</th>\n", | |
" <th>x_local_px</th>\n", | |
" <th>y_local_px</th>\n", | |
" <th>z</th>\n", | |
" <th>target</th>\n", | |
" <th>CellComp</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>9565.928500</td>\n", | |
" <td>161687.193822</td>\n", | |
" <td>2215.928500</td>\n", | |
" <td>2514.971600</td>\n", | |
" <td>0</td>\n", | |
" <td>NLRP1</td>\n", | |
" <td>Membrane</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>10786.266600</td>\n", | |
" <td>160186.444422</td>\n", | |
" <td>3436.266600</td>\n", | |
" <td>1014.222200</td>\n", | |
" <td>0</td>\n", | |
" <td>MYH11</td>\n", | |
" <td>Membrane</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>8710.160000</td>\n", | |
" <td>162591.832212</td>\n", | |
" <td>1360.160000</td>\n", | |
" <td>3419.609990</td>\n", | |
" <td>0</td>\n", | |
" <td>IGHA1</td>\n", | |
" <td>Membrane</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>7403.842857</td>\n", | |
" <td>159807.007822</td>\n", | |
" <td>53.842857</td>\n", | |
" <td>634.785600</td>\n", | |
" <td>0</td>\n", | |
" <td>IL7R</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>10331.200200</td>\n", | |
" <td>159339.305522</td>\n", | |
" <td>2981.200200</td>\n", | |
" <td>167.083300</td>\n", | |
" <td>0</td>\n", | |
" <td>SEC61G</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>...</th>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>9999995</th>\n", | |
" <td>8</td>\n", | |
" <td>47</td>\n", | |
" <td>20752.615831</td>\n", | |
" <td>166351.865083</td>\n", | |
" <td>2458.171387</td>\n", | |
" <td>3529.642860</td>\n", | |
" <td>2</td>\n", | |
" <td>DUSP5</td>\n", | |
" <td>Nuclear</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>9999996</th>\n", | |
" <td>8</td>\n", | |
" <td>47</td>\n", | |
" <td>20763.644396</td>\n", | |
" <td>166342.650796</td>\n", | |
" <td>2469.199951</td>\n", | |
" <td>3520.428574</td>\n", | |
" <td>2</td>\n", | |
" <td>DDC</td>\n", | |
" <td>Nuclear</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>9999997</th>\n", | |
" <td>8</td>\n", | |
" <td>47</td>\n", | |
" <td>20732.624376</td>\n", | |
" <td>166346.122224</td>\n", | |
" <td>2438.179932</td>\n", | |
" <td>3523.900002</td>\n", | |
" <td>2</td>\n", | |
" <td>KRT13</td>\n", | |
" <td>Cytoplasm</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>9999998</th>\n", | |
" <td>8</td>\n", | |
" <td>47</td>\n", | |
" <td>20787.944444</td>\n", | |
" <td>166403.922219</td>\n", | |
" <td>2493.500000</td>\n", | |
" <td>3581.699997</td>\n", | |
" <td>2</td>\n", | |
" <td>ADGRB3</td>\n", | |
" <td>Membrane</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>9999999</th>\n", | |
" <td>8</td>\n", | |
" <td>47</td>\n", | |
" <td>20791.644396</td>\n", | |
" <td>166377.147218</td>\n", | |
" <td>2497.199951</td>\n", | |
" <td>3554.924995</td>\n", | |
" <td>2</td>\n", | |
" <td>MALAT1</td>\n", | |
" <td>Nuclear</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"<p>10000000 rows × 9 columns</p>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" fov cell_ID x_global_px y_global_px x_local_px y_local_px \\\n", | |
"0 1 0 9565.928500 161687.193822 2215.928500 2514.971600 \n", | |
"1 1 0 10786.266600 160186.444422 3436.266600 1014.222200 \n", | |
"2 1 0 8710.160000 162591.832212 1360.160000 3419.609990 \n", | |
"3 1 0 7403.842857 159807.007822 53.842857 634.785600 \n", | |
"4 1 0 10331.200200 159339.305522 2981.200200 167.083300 \n", | |
"... ... ... ... ... ... ... \n", | |
"9999995 8 47 20752.615831 166351.865083 2458.171387 3529.642860 \n", | |
"9999996 8 47 20763.644396 166342.650796 2469.199951 3520.428574 \n", | |
"9999997 8 47 20732.624376 166346.122224 2438.179932 3523.900002 \n", | |
"9999998 8 47 20787.944444 166403.922219 2493.500000 3581.699997 \n", | |
"9999999 8 47 20791.644396 166377.147218 2497.199951 3554.924995 \n", | |
"\n", | |
" z target CellComp \n", | |
"0 0 NLRP1 Membrane \n", | |
"1 0 MYH11 Membrane \n", | |
"2 0 IGHA1 Membrane \n", | |
"3 0 IL7R 0 \n", | |
"4 0 SEC61G 0 \n", | |
"... .. ... ... \n", | |
"9999995 2 DUSP5 Nuclear \n", | |
"9999996 2 DDC Nuclear \n", | |
"9999997 2 KRT13 Cytoplasm \n", | |
"9999998 2 ADGRB3 Membrane \n", | |
"9999999 2 MALAT1 Nuclear \n", | |
"\n", | |
"[10000000 rows x 9 columns]" | |
] | |
}, | |
"execution_count": 3, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"table" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"id": "728f11f7-6676-4741-835b-05536c4da5fd", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# use categorical when possible\n", | |
"table[\"CellComp\"] = pd.Categorical(table[\"CellComp\"].astype(str))\n", | |
"table[\"target\"] = pd.Categorical(table[\"target\"].astype(str))\n", | |
"table[\"fov\"] = pd.Categorical(table[\"fov\"].astype(str))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"id": "edf41825-0c52-4c96-a728-fe8d8d58256d", | |
"metadata": { | |
"tags": [] | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"0.5189270973205566\n" | |
] | |
} | |
], | |
"source": [ | |
"# initialize xarray dataset\n", | |
"start = time.time()\n", | |
"ds = xr.Dataset(\n", | |
" coords={\n", | |
" k: v\n", | |
" for k, v in zip(\n", | |
" (\"x\", \"y\", \"z\"),\n", | |
" table[[\"x_local_px\", \"y_local_px\", \"z\"]].to_numpy(),\n", | |
" )\n", | |
" },\n", | |
" data_vars={\n", | |
" \"target\": xr.Variable(dims=\"index\", data=table[\"target\"]),\n", | |
" \"fov\": xr.Variable(dims=\"index\", data=table[\"fov\"]),\n", | |
" \"CellComp\": xr.Variable(dims=\"index\", data=table[\"CellComp\"]),\n", | |
" \"cell_ID\": xr.Variable(dims=\"index\", data=table[\"cell_ID\"]),\n", | |
" },\n", | |
")\n", | |
"end = time.time()\n", | |
"print(end - start)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"id": "b3883a46-4601-453b-95e3-b5b6c3455344", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"2.762892007827759\n" | |
] | |
} | |
], | |
"source": [ | |
"# write to disk w/ zarr\n", | |
"start = time.time()\n", | |
"ds.to_zarr(\"./transcripts.zarr\")\n", | |
"end = time.time()\n", | |
"print(end - start)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"id": "e5c060ae-4926-45d3-bd0c-ec5cb8c7b1a2", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"2.2495009899139404\n" | |
] | |
} | |
], | |
"source": [ | |
"# read from disk w/ zarr\n", | |
"start = time.time()\n", | |
"ds_new = xr.open_zarr(\"./transcripts.zarr\")\n", | |
"end = time.time()\n", | |
"print(end - start)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"id": "bdd22a50-8059-4ca2-b989-e4e24eff164c", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# copy relevant columns\n", | |
"table_pq = table[\n", | |
" [\"fov\", \"cell_ID\", \"x_local_px\", \"y_local_px\", \"z\", \"target\", \"CellComp\"]\n", | |
"].copy()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"id": "21e20a80-916d-49b5-a2ee-eaeead44fbf7", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"1.457585096359253\n" | |
] | |
} | |
], | |
"source": [ | |
"# write to disk w/ parquet\n", | |
"start = time.time()\n", | |
"table_pq.to_parquet(\"./transcripts.parquet\")\n", | |
"end = time.time()\n", | |
"print(end - start)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"id": "32b4c761-7e22-4235-8388-d7033dfcc672", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"0.5914890766143799\n" | |
] | |
} | |
], | |
"source": [ | |
"# read from disk w/ parquet\n", | |
"start = time.time()\n", | |
"table_new = read_parquet(\"./transcripts.parquet\")\n", | |
"end = time.time()\n", | |
"print(end - start)" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3 (ipykernel)", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.10.8" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 5 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment