Skip to content

Instantly share code, notes, and snippets.

@giovp
Created January 24, 2023 09:39
Show Gist options
  • Save giovp/f0ba33c79b9bcadaa07cfef11610f00b to your computer and use it in GitHub Desktop.
Save giovp/f0ba33c79b9bcadaa07cfef11610f00b to your computer and use it in GitHub Desktop.
xarray v. pandas table to represent points
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "538d3b44-0110-4ce6-bb8c-d758ca5437d3",
"metadata": {},
"outputs": [],
"source": [
"import anndata as ad\n",
"from pathlib import Path\n",
"import numpy as np\n",
"import pandas as pd\n",
"import zarr\n",
"import os\n",
"import time\n",
"import xarray as xr\n",
"from dask.array import from_array\n",
"from pandas import read_parquet\n",
"\n",
"# sc.set_figure_params()\n",
"%load_ext autoreload\n",
"%autoreload 2\n",
"%load_ext lab_black"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "8e84ba60-8392-4b60-b190-21777b26f57f",
"metadata": {},
"outputs": [],
"source": [
"path = Path(\n",
" \"/Users/giovanni.palla/Projects/spatialdata-sandbox/nanostring_cosmx/data/data_lung5_rep2\"\n",
")\n",
"table = pd.read_csv(path / \"Lung5_Rep2_tx_file.csv\", nrows=10_000_000)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "04490b60-2938-4187-8e77-607c67bc36b1",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>fov</th>\n",
" <th>cell_ID</th>\n",
" <th>x_global_px</th>\n",
" <th>y_global_px</th>\n",
" <th>x_local_px</th>\n",
" <th>y_local_px</th>\n",
" <th>z</th>\n",
" <th>target</th>\n",
" <th>CellComp</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>9565.928500</td>\n",
" <td>161687.193822</td>\n",
" <td>2215.928500</td>\n",
" <td>2514.971600</td>\n",
" <td>0</td>\n",
" <td>NLRP1</td>\n",
" <td>Membrane</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>10786.266600</td>\n",
" <td>160186.444422</td>\n",
" <td>3436.266600</td>\n",
" <td>1014.222200</td>\n",
" <td>0</td>\n",
" <td>MYH11</td>\n",
" <td>Membrane</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>8710.160000</td>\n",
" <td>162591.832212</td>\n",
" <td>1360.160000</td>\n",
" <td>3419.609990</td>\n",
" <td>0</td>\n",
" <td>IGHA1</td>\n",
" <td>Membrane</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>7403.842857</td>\n",
" <td>159807.007822</td>\n",
" <td>53.842857</td>\n",
" <td>634.785600</td>\n",
" <td>0</td>\n",
" <td>IL7R</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>10331.200200</td>\n",
" <td>159339.305522</td>\n",
" <td>2981.200200</td>\n",
" <td>167.083300</td>\n",
" <td>0</td>\n",
" <td>SEC61G</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9999995</th>\n",
" <td>8</td>\n",
" <td>47</td>\n",
" <td>20752.615831</td>\n",
" <td>166351.865083</td>\n",
" <td>2458.171387</td>\n",
" <td>3529.642860</td>\n",
" <td>2</td>\n",
" <td>DUSP5</td>\n",
" <td>Nuclear</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9999996</th>\n",
" <td>8</td>\n",
" <td>47</td>\n",
" <td>20763.644396</td>\n",
" <td>166342.650796</td>\n",
" <td>2469.199951</td>\n",
" <td>3520.428574</td>\n",
" <td>2</td>\n",
" <td>DDC</td>\n",
" <td>Nuclear</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9999997</th>\n",
" <td>8</td>\n",
" <td>47</td>\n",
" <td>20732.624376</td>\n",
" <td>166346.122224</td>\n",
" <td>2438.179932</td>\n",
" <td>3523.900002</td>\n",
" <td>2</td>\n",
" <td>KRT13</td>\n",
" <td>Cytoplasm</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9999998</th>\n",
" <td>8</td>\n",
" <td>47</td>\n",
" <td>20787.944444</td>\n",
" <td>166403.922219</td>\n",
" <td>2493.500000</td>\n",
" <td>3581.699997</td>\n",
" <td>2</td>\n",
" <td>ADGRB3</td>\n",
" <td>Membrane</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9999999</th>\n",
" <td>8</td>\n",
" <td>47</td>\n",
" <td>20791.644396</td>\n",
" <td>166377.147218</td>\n",
" <td>2497.199951</td>\n",
" <td>3554.924995</td>\n",
" <td>2</td>\n",
" <td>MALAT1</td>\n",
" <td>Nuclear</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>10000000 rows × 9 columns</p>\n",
"</div>"
],
"text/plain": [
" fov cell_ID x_global_px y_global_px x_local_px y_local_px \\\n",
"0 1 0 9565.928500 161687.193822 2215.928500 2514.971600 \n",
"1 1 0 10786.266600 160186.444422 3436.266600 1014.222200 \n",
"2 1 0 8710.160000 162591.832212 1360.160000 3419.609990 \n",
"3 1 0 7403.842857 159807.007822 53.842857 634.785600 \n",
"4 1 0 10331.200200 159339.305522 2981.200200 167.083300 \n",
"... ... ... ... ... ... ... \n",
"9999995 8 47 20752.615831 166351.865083 2458.171387 3529.642860 \n",
"9999996 8 47 20763.644396 166342.650796 2469.199951 3520.428574 \n",
"9999997 8 47 20732.624376 166346.122224 2438.179932 3523.900002 \n",
"9999998 8 47 20787.944444 166403.922219 2493.500000 3581.699997 \n",
"9999999 8 47 20791.644396 166377.147218 2497.199951 3554.924995 \n",
"\n",
" z target CellComp \n",
"0 0 NLRP1 Membrane \n",
"1 0 MYH11 Membrane \n",
"2 0 IGHA1 Membrane \n",
"3 0 IL7R 0 \n",
"4 0 SEC61G 0 \n",
"... .. ... ... \n",
"9999995 2 DUSP5 Nuclear \n",
"9999996 2 DDC Nuclear \n",
"9999997 2 KRT13 Cytoplasm \n",
"9999998 2 ADGRB3 Membrane \n",
"9999999 2 MALAT1 Nuclear \n",
"\n",
"[10000000 rows x 9 columns]"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"table"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "728f11f7-6676-4741-835b-05536c4da5fd",
"metadata": {},
"outputs": [],
"source": [
"# use categorical when possible\n",
"table[\"CellComp\"] = pd.Categorical(table[\"CellComp\"].astype(str))\n",
"table[\"target\"] = pd.Categorical(table[\"target\"].astype(str))\n",
"table[\"fov\"] = pd.Categorical(table[\"fov\"].astype(str))"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "edf41825-0c52-4c96-a728-fe8d8d58256d",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.5189270973205566\n"
]
}
],
"source": [
"# initialize xarray dataset\n",
"start = time.time()\n",
"ds = xr.Dataset(\n",
" coords={\n",
" k: v\n",
" for k, v in zip(\n",
" (\"x\", \"y\", \"z\"),\n",
" table[[\"x_local_px\", \"y_local_px\", \"z\"]].to_numpy(),\n",
" )\n",
" },\n",
" data_vars={\n",
" \"target\": xr.Variable(dims=\"index\", data=table[\"target\"]),\n",
" \"fov\": xr.Variable(dims=\"index\", data=table[\"fov\"]),\n",
" \"CellComp\": xr.Variable(dims=\"index\", data=table[\"CellComp\"]),\n",
" \"cell_ID\": xr.Variable(dims=\"index\", data=table[\"cell_ID\"]),\n",
" },\n",
")\n",
"end = time.time()\n",
"print(end - start)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "b3883a46-4601-453b-95e3-b5b6c3455344",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2.762892007827759\n"
]
}
],
"source": [
"# write to disk w/ zarr\n",
"start = time.time()\n",
"ds.to_zarr(\"./transcripts.zarr\")\n",
"end = time.time()\n",
"print(end - start)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "e5c060ae-4926-45d3-bd0c-ec5cb8c7b1a2",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2.2495009899139404\n"
]
}
],
"source": [
"# read from disk w/ zarr\n",
"start = time.time()\n",
"ds_new = xr.open_zarr(\"./transcripts.zarr\")\n",
"end = time.time()\n",
"print(end - start)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "bdd22a50-8059-4ca2-b989-e4e24eff164c",
"metadata": {},
"outputs": [],
"source": [
"# copy relevant columns\n",
"table_pq = table[\n",
" [\"fov\", \"cell_ID\", \"x_local_px\", \"y_local_px\", \"z\", \"target\", \"CellComp\"]\n",
"].copy()"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "21e20a80-916d-49b5-a2ee-eaeead44fbf7",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1.457585096359253\n"
]
}
],
"source": [
"# write to disk w/ parquet\n",
"start = time.time()\n",
"table_pq.to_parquet(\"./transcripts.parquet\")\n",
"end = time.time()\n",
"print(end - start)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "32b4c761-7e22-4235-8388-d7033dfcc672",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.5914890766143799\n"
]
}
],
"source": [
"# read from disk w/ parquet\n",
"start = time.time()\n",
"table_new = read_parquet(\"./transcripts.parquet\")\n",
"end = time.time()\n",
"print(end - start)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.8"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment