Skip to content

Instantly share code, notes, and snippets.

@ivirshup
Last active October 5, 2023 15:35
Show Gist options
  • Save ivirshup/f1a1603db69de3888eacb4bdb6a9317a to your computer and use it in GitHub Desktop.
Save ivirshup/f1a1603db69de3888eacb4bdb6a9317a to your computer and use it in GitHub Desktop.
Downloading an h5ad file from cellxgene API
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"id": "59acb317",
"metadata": {},
"source": [
"It would be nice to have programmatic access to the cellxgene data portal, similar to Bioconductor's `cellxgenedp` ([repo](https://github.com/mtmorgan/cellxgenedp), [bioconductor page](https://bioconductor.org/packages/release/bioc/html/cellxgenedp.html)).\n",
"\n",
"This would be very useful for collecting many AnnData files with standardized metadata.\n",
"\n",
"I would build this with [`pooch`](https://www.fatiando.org/pooch/latest/), using the scverse cookiecutter template.\n",
"\n",
"# API info\n",
"\n",
"* openapi schema: `https://api.cellxgene.cziscience.com/dp/openapi.json`\n",
" * viewer: https://editor.swagger.io\n",
"\n",
"# Working example"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "139ae6e1",
"metadata": {},
"outputs": [],
"source": [
"import requests\n",
"import json\n",
"from jsonpath_ng import jsonpath, parse\n",
"import pandas as pd\n",
"import pooch\n",
"\n",
"import anndata as ad\n",
"\n",
"CELLXGENE_PRODUCTION_HOST = \"api.cellxgene.cziscience.com\"\n",
"CELLXGENE_PRODUCTION_ENDPOINT = f\"https://{CELLXGENE_PRODUCTION_HOST}\"\n",
"DATASETS = f\"{CELLXGENE_PRODUCTION_ENDPOINT}/dp/v1/datasets/\"\n",
"COLLECTIONS = f\"{CELLXGENE_PRODUCTION_ENDPOINT}/dp/v1/collections/\"\n",
"CELLXGENE_EXPLORER = \"https://cellxgene.cziscience.com/e/\""
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "f46c0fd7",
"metadata": {},
"outputs": [],
"source": [
"def presign_url(url, timeout=30):\n",
"    \"\"\"Request a pre-signed download URL for a dataset asset.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    url\n",
"        Asset endpoint to send the POST request to.\n",
"    timeout\n",
"        Seconds to wait on the server before giving up (passed to ``requests.post``).\n",
"\n",
"    Returns\n",
"    -------\n",
"    The value of the ``presigned_url`` field of the JSON response.\n",
"\n",
"    Raises\n",
"    ------\n",
"    requests.HTTPError\n",
"        If the server answers with a 4xx/5xx status.\n",
"    \"\"\"\n",
"    resp = requests.post(url, timeout=timeout)\n",
"    # Fail loudly on HTTP errors instead of hitting a confusing KeyError or\n",
"    # JSON decoding error on the error payload below.\n",
"    resp.raise_for_status()\n",
"    return resp.json()[\"presigned_url\"]"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "267e3ab5",
"metadata": {},
"outputs": [],
"source": [
"def download_file(\n",
"    dataset_id,\n",
"    file_id,\n",
"    file_type,\n",
"    base_url=DATASETS,\n",
"):\n",
"    \"\"\"Download one asset of a dataset through ``pooch``.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    dataset_id\n",
"        ID of the dataset the asset belongs to.\n",
"    file_id\n",
"        ID of the asset to download.\n",
"    file_type\n",
"        File type of the asset (e.g. ``\"H5AD\"``). It is lower-cased and used as\n",
"        the extension of the downloaded file.\n",
"    base_url\n",
"        Base URL of the datasets endpoint.\n",
"\n",
"    Returns\n",
"    -------\n",
"    Whatever ``pooch.retrieve`` returns for the downloaded file (its local path).\n",
"    \"\"\"\n",
"    url = f\"{base_url}{dataset_id}/asset/{file_id}\"\n",
"    download_url = presign_url(url)\n",
"    # Name the file after its actual type. With a hard-coded \".h5ad\" suffix a\n",
"    # non-H5AD asset would get a misleading extension and share its file name\n",
"    # with the H5AD asset of the same dataset.\n",
"    pth = pooch.retrieve(\n",
"        url=download_url,\n",
"        known_hash=None,\n",
"        fname=f\"{dataset_id}.{file_type.lower()}\",\n",
"    )\n",
"    return pth"
]
},
{
"cell_type": "markdown",
"id": "a840620c",
"metadata": {},
"source": [
"# Get collections"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "b24a684f",
"metadata": {},
"outputs": [],
"source": [
"collections_resp = requests.get(COLLECTIONS)\n",
"# Stop here on an HTTP error rather than parsing an error payload as the listing.\n",
"collections_resp.raise_for_status()\n",
"collections_json = collections_resp.json()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "43536282",
"metadata": {},
"outputs": [],
"source": [
"db_tbl = pd.DataFrame.from_records(collections_json[\"collections\"])"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "c257d65e",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>created_at</th>\n",
" <th>id</th>\n",
" <th>visibility</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1.648031e+09</td>\n",
" <td>03f821b4-87be-4ff4-b65a-b5fc00061da7</td>\n",
" <td>PUBLIC</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1.620067e+09</td>\n",
" <td>6e8c5415-302c-492a-a5f9-f29c57ff18fb</td>\n",
" <td>PUBLIC</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1.648714e+09</td>\n",
" <td>3472f32d-4a33-48e2-aad5-666d4631bf4c</td>\n",
" <td>PUBLIC</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1.636740e+09</td>\n",
" <td>83ed3be8-4cb9-43e6-9aaa-3fbbf5d1bd3a</td>\n",
" <td>PUBLIC</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1.643074e+09</td>\n",
" <td>92fde064-2fb4-41f8-b85c-c6904000b859</td>\n",
" <td>PUBLIC</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>96</th>\n",
" <td>1.615241e+09</td>\n",
" <td>f70ebd97-b3bc-44fe-849d-c18e08fe773d</td>\n",
" <td>PUBLIC</td>\n",
" </tr>\n",
" <tr>\n",
" <th>97</th>\n",
" <td>1.629127e+09</td>\n",
" <td>4f586cb6-972b-4ef7-a4ef-3c3800a3c004</td>\n",
" <td>PUBLIC</td>\n",
" </tr>\n",
" <tr>\n",
" <th>98</th>\n",
" <td>1.626806e+09</td>\n",
" <td>fcb3d1c1-03d2-41ac-8229-458e072b7a1c</td>\n",
" <td>PUBLIC</td>\n",
" </tr>\n",
" <tr>\n",
" <th>99</th>\n",
" <td>1.619570e+09</td>\n",
" <td>20a1dadf-a3a7-4783-b311-fcff3c457763</td>\n",
" <td>PUBLIC</td>\n",
" </tr>\n",
" <tr>\n",
" <th>100</th>\n",
" <td>1.624983e+09</td>\n",
" <td>e2a4a67f-6a18-431a-ab9c-6e77dd31cc80</td>\n",
" <td>PUBLIC</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>101 rows × 3 columns</p>\n",
"</div>"
],
"text/plain": [
" created_at id visibility\n",
"0 1.648031e+09 03f821b4-87be-4ff4-b65a-b5fc00061da7 PUBLIC\n",
"1 1.620067e+09 6e8c5415-302c-492a-a5f9-f29c57ff18fb PUBLIC\n",
"2 1.648714e+09 3472f32d-4a33-48e2-aad5-666d4631bf4c PUBLIC\n",
"3 1.636740e+09 83ed3be8-4cb9-43e6-9aaa-3fbbf5d1bd3a PUBLIC\n",
"4 1.643074e+09 92fde064-2fb4-41f8-b85c-c6904000b859 PUBLIC\n",
".. ... ... ...\n",
"96 1.615241e+09 f70ebd97-b3bc-44fe-849d-c18e08fe773d PUBLIC\n",
"97 1.629127e+09 4f586cb6-972b-4ef7-a4ef-3c3800a3c004 PUBLIC\n",
"98 1.626806e+09 fcb3d1c1-03d2-41ac-8229-458e072b7a1c PUBLIC\n",
"99 1.619570e+09 20a1dadf-a3a7-4783-b311-fcff3c457763 PUBLIC\n",
"100 1.624983e+09 e2a4a67f-6a18-431a-ab9c-6e77dd31cc80 PUBLIC\n",
"\n",
"[101 rows x 3 columns]"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"db_tbl"
]
},
{
"cell_type": "markdown",
"id": "2139fdd8",
"metadata": {},
"source": [
"# Try a dataset"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "6cdf575a",
"metadata": {},
"outputs": [],
"source": [
"# Fetch the full record of one collection from the listing above.\n",
"rec_resp = requests.get(COLLECTIONS + \"6e8c5415-302c-492a-a5f9-f29c57ff18fb\")\n",
"# Surface HTTP errors here, before the response body is parsed in a later cell.\n",
"rec_resp.raise_for_status()"
]
},
{
"cell_type": "markdown",
"id": "ea88b4c6",
"metadata": {},
"source": [
"cellxgenedp is grabbing this json for every item in the collection, then running jsonpath queries against them. "
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "6b6b713c",
"metadata": {},
"outputs": [],
"source": [
"rec = rec_resp.json()"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "a17e2698",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[{'created_at': 1661384287.008212,\n",
" 'dataset_id': 'b07e5164-baf6-43d2-bdba-5a249d0da879',\n",
" 'filename': 'local.h5ad',\n",
" 'filetype': 'H5AD',\n",
" 'id': 'fce7ca81-6d88-45ab-aece-0e6a65942bbc',\n",
" 's3_uri': 's3://corpora-data-prod/e2a38bff-d293-4c9d-90ef-e9018d16775c/local.h5ad',\n",
" 'updated_at': 1661795681.675807,\n",
" 'user_submitted': True}]"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Keep only the H5AD entries among every asset of every dataset in the record.\n",
"collection_assets = [\n",
"    match.value\n",
"    for match in parse(\"datasets[*].dataset_assets[*]\").find(rec)\n",
"    if match.value[\"filetype\"] == \"H5AD\"\n",
"]\n",
"collection_assets"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "14752dd5",
"metadata": {},
"outputs": [],
"source": [
"asset = collection_assets[0]\n",
"path = download_file(\n",
" dataset_id=asset[\"dataset_id\"],\n",
" file_id=asset[\"id\"],\n",
" file_type=asset[\"filetype\"]\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "8fa1e9ef",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"AnnData object with n_obs × n_vars = 2126 × 15670\n",
" obs: 'cellular_classification', 'sex_ontology_term_id', 'tissue_ontology_term_id', 'assay_ontology_term_id', 'disease_ontology_term_id', 'cell_type_ontology_term_id', 'ethnicity_ontology_term_id', 'development_stage_ontology_term_id', 'organism_ontology_term_id', 'is_primary_data', 'donor_id', 'cell_type', 'assay', 'disease', 'organism', 'sex', 'tissue', 'ethnicity', 'development_stage'\n",
" var: 'feature_is_filtered', 'feature_biotype', 'feature_name', 'feature_reference'\n",
" uns: 'X_normalization', 'layer_descriptions', 'publication_doi', 'schema_version', 'title'\n",
" obsm: 'X_tsne'"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"adata = ad.read_h5ad(path)\n",
"adata"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3.bak"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
@flying-sheep
Copy link

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment