Last active
October 5, 2023 15:35
-
-
Save ivirshup/f1a1603db69de3888eacb4bdb6a9317a to your computer and use it in GitHub Desktop.
Downloading an h5ad file from cellxgene API
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"id": "59acb317", | |
"metadata": {}, | |
"source": [ | |
"It would be nice to have programatic access to the cellxgene data portal, similar to bioconductor's `cellxgenedp` ([repo](https://github.com/mtmorgan/cellxgenedp), [bioconductor page](https://bioconductor.org/packages/release/bioc/html/cellxgenedp.html))\n", | |
"\n", | |
"This would be very useful for collecting many AnnData files with standardized metadata.\n", | |
"\n", | |
"I would build this with [`pooch`](https://www.fatiando.org/pooch/latest/), using the scverse cookie cutter template.\n", | |
"\n", | |
"# API info\n", | |
"\n", | |
"* openapi schema: `https://api.cellxgene.cziscience.com/dp/openapi.json`\n", | |
" * viewer: https://editor.swagger.io\n", | |
"\n", | |
"# Working example" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"id": "139ae6e1", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import requests\n", | |
"import json\n", | |
"from jsonpath_ng import jsonpath, parse\n", | |
"import pandas as pd\n", | |
"import pooch\n", | |
"\n", | |
"import anndata as ad\n", | |
"\n", | |
"CELLXGENE_PRODUCTION_HOST = \"api.cellxgene.cziscience.com\"\n", | |
"CELLXGENE_PRODUCTION_ENDPOINT = f\"https://{CELLXGENE_PRODUCTION_HOST}\"\n", | |
"DATASETS = f\"{CELLXGENE_PRODUCTION_ENDPOINT}/dp/v1/datasets/\"\n", | |
"COLLECTIONS = f\"{CELLXGENE_PRODUCTION_ENDPOINT}/dp/v1/collections/\"\n", | |
"CELLXGENE_EXPLORER = \"https://cellxgene.cziscience.com/e/\"" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"id": "f46c0fd7", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def presign_url(url):\n", | |
" resp = requests.post(url)\n", | |
" return resp.json()[\"presigned_url\"]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"id": "267e3ab5", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def download_file(\n", | |
" dataset_id,\n", | |
" file_id,\n", | |
" file_type,\n", | |
" base_url=DATASETS,\n", | |
"# dry_run=True, cache_path=\"./data\"\n", | |
"):\n", | |
" url = f\"{base_url}{dataset_id}/asset/{file_id}\"\n", | |
" download_url = presign_url(url)\n", | |
" pth = pooch.retrieve(\n", | |
" url=download_url,\n", | |
" known_hash=None,\n", | |
" fname=f\"{dataset_id}.h5ad\"\n", | |
" )\n", | |
" return pth" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "a840620c", | |
"metadata": {}, | |
"source": [ | |
"# Get collections" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"id": "b24a684f", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"collections_json = requests.get(COLLECTIONS).json()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"id": "43536282", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"db_tbl = pd.DataFrame.from_records(collections_json[\"collections\"])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"id": "c257d65e", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>created_at</th>\n", | |
" <th>id</th>\n", | |
" <th>visibility</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>1.648031e+09</td>\n", | |
" <td>03f821b4-87be-4ff4-b65a-b5fc00061da7</td>\n", | |
" <td>PUBLIC</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>1.620067e+09</td>\n", | |
" <td>6e8c5415-302c-492a-a5f9-f29c57ff18fb</td>\n", | |
" <td>PUBLIC</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>1.648714e+09</td>\n", | |
" <td>3472f32d-4a33-48e2-aad5-666d4631bf4c</td>\n", | |
" <td>PUBLIC</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>1.636740e+09</td>\n", | |
" <td>83ed3be8-4cb9-43e6-9aaa-3fbbf5d1bd3a</td>\n", | |
" <td>PUBLIC</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>1.643074e+09</td>\n", | |
" <td>92fde064-2fb4-41f8-b85c-c6904000b859</td>\n", | |
" <td>PUBLIC</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>...</th>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>96</th>\n", | |
" <td>1.615241e+09</td>\n", | |
" <td>f70ebd97-b3bc-44fe-849d-c18e08fe773d</td>\n", | |
" <td>PUBLIC</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>97</th>\n", | |
" <td>1.629127e+09</td>\n", | |
" <td>4f586cb6-972b-4ef7-a4ef-3c3800a3c004</td>\n", | |
" <td>PUBLIC</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>98</th>\n", | |
" <td>1.626806e+09</td>\n", | |
" <td>fcb3d1c1-03d2-41ac-8229-458e072b7a1c</td>\n", | |
" <td>PUBLIC</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>99</th>\n", | |
" <td>1.619570e+09</td>\n", | |
" <td>20a1dadf-a3a7-4783-b311-fcff3c457763</td>\n", | |
" <td>PUBLIC</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>100</th>\n", | |
" <td>1.624983e+09</td>\n", | |
" <td>e2a4a67f-6a18-431a-ab9c-6e77dd31cc80</td>\n", | |
" <td>PUBLIC</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"<p>101 rows × 3 columns</p>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" created_at id visibility\n", | |
"0 1.648031e+09 03f821b4-87be-4ff4-b65a-b5fc00061da7 PUBLIC\n", | |
"1 1.620067e+09 6e8c5415-302c-492a-a5f9-f29c57ff18fb PUBLIC\n", | |
"2 1.648714e+09 3472f32d-4a33-48e2-aad5-666d4631bf4c PUBLIC\n", | |
"3 1.636740e+09 83ed3be8-4cb9-43e6-9aaa-3fbbf5d1bd3a PUBLIC\n", | |
"4 1.643074e+09 92fde064-2fb4-41f8-b85c-c6904000b859 PUBLIC\n", | |
".. ... ... ...\n", | |
"96 1.615241e+09 f70ebd97-b3bc-44fe-849d-c18e08fe773d PUBLIC\n", | |
"97 1.629127e+09 4f586cb6-972b-4ef7-a4ef-3c3800a3c004 PUBLIC\n", | |
"98 1.626806e+09 fcb3d1c1-03d2-41ac-8229-458e072b7a1c PUBLIC\n", | |
"99 1.619570e+09 20a1dadf-a3a7-4783-b311-fcff3c457763 PUBLIC\n", | |
"100 1.624983e+09 e2a4a67f-6a18-431a-ab9c-6e77dd31cc80 PUBLIC\n", | |
"\n", | |
"[101 rows x 3 columns]" | |
] | |
}, | |
"execution_count": 6, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"db_tbl" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "2139fdd8", | |
"metadata": {}, | |
"source": [ | |
"# Try a dataset" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"id": "6cdf575a", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"rec_resp = requests.get(COLLECTIONS + \"6e8c5415-302c-492a-a5f9-f29c57ff18fb\")" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "ea88b4c6", | |
"metadata": {}, | |
"source": [ | |
"cellxgenedp is grabbing this json for every item in the collection, then running jsonpath queries against them. " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"id": "6b6b713c", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"rec = rec_resp.json()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"id": "a17e2698", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[{'created_at': 1661384287.008212,\n", | |
" 'dataset_id': 'b07e5164-baf6-43d2-bdba-5a249d0da879',\n", | |
" 'filename': 'local.h5ad',\n", | |
" 'filetype': 'H5AD',\n", | |
" 'id': 'fce7ca81-6d88-45ab-aece-0e6a65942bbc',\n", | |
" 's3_uri': 's3://corpora-data-prod/e2a38bff-d293-4c9d-90ef-e9018d16775c/local.h5ad',\n", | |
" 'updated_at': 1661795681.675807,\n", | |
" 'user_submitted': True}]" | |
] | |
}, | |
"execution_count": 9, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"collection_assets = [x.value for x in parse(\"datasets[*].dataset_assets[*]\").find(rec) if x.value[\"filetype\"] == \"H5AD\"]\n", | |
"collection_assets" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"id": "14752dd5", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"asset = collection_assets[0]\n", | |
"path = download_file(\n", | |
" dataset_id=asset[\"dataset_id\"],\n", | |
" file_id=asset[\"id\"],\n", | |
" file_type=asset[\"filetype\"]\n", | |
")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"id": "8fa1e9ef", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"AnnData object with n_obs × n_vars = 2126 × 15670\n", | |
" obs: 'cellular_classification', 'sex_ontology_term_id', 'tissue_ontology_term_id', 'assay_ontology_term_id', 'disease_ontology_term_id', 'cell_type_ontology_term_id', 'ethnicity_ontology_term_id', 'development_stage_ontology_term_id', 'organism_ontology_term_id', 'is_primary_data', 'donor_id', 'cell_type', 'assay', 'disease', 'organism', 'sex', 'tissue', 'ethnicity', 'development_stage'\n", | |
" var: 'feature_is_filtered', 'feature_biotype', 'feature_name', 'feature_reference'\n", | |
" uns: 'X_normalization', 'layer_descriptions', 'publication_doi', 'schema_version', 'title'\n", | |
" obsm: 'X_tsne'" | |
] | |
}, | |
"execution_count": 13, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"adata = ad.read_h5ad(path)\n", | |
"adata" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3.bak" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.9.12" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 5 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Swagger UI is now at https://api.cellxgene.cziscience.com/curation/ui/#/