Last active
February 6, 2022 14:56
-
-
Save mr-eyes/92d6172c7a5c7d5bd35fcff6f765d48d to your computer and use it in GitHub Desktop.
Using NCBI's new Datasets API to get the nearest available reference genomes for a given taxon or organism name.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Use NCBI's new Datasets API to get the nearest available reference genomes for a given taxon or organism name.
Input: a taxon ID (TAX_ID) or an organism name
Output:
    1- The nearest organism with available reference genomes
    2- Accessions of the reference genomes
Requirements:
    pip install --upgrade ncbi-datasets-pylib
""" | |
import sys | |
import ncbi.datasets.openapi | |
from ncbi.datasets.openapi.api import taxonomy_api | |
from ncbi.datasets.openapi.model.v1_taxonomy_metadata_request import V1TaxonomyMetadataRequest | |
from ncbi.datasets.openapi.model.rpc_status import RpcStatus | |
from ncbi.datasets.openapi.model.v1_taxonomy_metadata_request_content_type import V1TaxonomyMetadataRequestContentType | |
from ncbi.datasets.openapi.model.v1_taxonomy_metadata_response import V1TaxonomyMetadataResponse | |
from ncbi.datasets.openapi import ApiClient as DatasetsApiClient | |
from ncbi.datasets import GenomeApi | |
import pprint | |
# Shared client configuration: requests are sent to the v1 endpoint of the
# NCBI Datasets REST API.
configuration = ncbi.datasets.openapi.Configuration(host="https://api.ncbi.nlm.nih.gov/datasets/v1")
# Optional API KEY
# configuration.api_key_prefix['ApiKeyAuthHeader'] = ''
def nearest_available_taxon(lineage: list) -> tuple:
    """Find the closest taxon in a lineage that has reference genome assemblies.

    The lineage is searched from its last element towards its first, so the
    last entry is tried first and the first entry is the final fallback.

    Args:
        lineage: Taxon identifiers. Each one is converted to ``str`` before
            being sent to the API. The list is not modified.

    Returns:
        A ``(taxon, accessions)`` tuple: ``taxon`` is the string form of the
        first identifier whose response contains ``total_count``, and
        ``accessions`` is the list of assembly accessions in that response.

    Raises:
        IndexError: If ``lineage`` is empty or none of its taxa yields a
            response containing ``total_count``.
    """
    # One client serves every lookup instead of opening a nested client per
    # recursion level; it shares the module-level configuration with
    # fetch_tax_info.
    with DatasetsApiClient(configuration) as api_client:
        api = GenomeApi(api_client)
        # Iterate over a reversed view rather than pop()-ing, so the caller's
        # list is left intact.
        for candidate in reversed(lineage):
            taxon = str(candidate)
            # Plain synchronous call: the original requested async_req=True
            # only to block on .get() immediately afterwards.
            result = api.assembly_descriptors_by_taxon(
                taxon=taxon,
                filters_reference_only=True,
            ).to_dict()
            # A response without "total_count" is treated as "no reference
            # genomes for this taxon"; move on to the next candidate.
            if "total_count" in result:
                accessions = [
                    assembly["assembly"]["assembly_accession"]
                    for assembly in result["assemblies"]
                ]
                return (taxon, accessions)
    # Same exception type the original raised (pop from an empty list), but
    # with a message that explains what actually went wrong.
    raise IndexError("no taxon in the given lineage has available reference genomes")
def fetch_tax_info(tax_or_name):
    """Request complete taxonomy metadata for a taxon ID or organism name.

    Args:
        tax_or_name: Taxon ID or organism name; it is converted to ``str``
            and sent as the single entry of the request's ``taxons`` list.

    Returns:
        The API response converted to a ``dict``. If the request raises
        ``ApiException``, the error is printed and ``None`` is returned.
    """
    request = V1TaxonomyMetadataRequest(
        taxons=[str(tax_or_name)],
        returned_content=V1TaxonomyMetadataRequestContentType("COMPLETE"),
    )
    with ncbi.datasets.openapi.ApiClient(configuration) as client:
        taxonomy = taxonomy_api.TaxonomyApi(client)
        try:
            # POST the request and hand the reply back as a plain dict.
            reply = taxonomy.taxonomy_metadata_post(request)
            return reply.to_dict()
        except ncbi.datasets.openapi.ApiException as e:
            print(f"Exception when calling TaxonomyApi->taxonomy_metadata_post: {e}")
            return None
if __name__ == "__main__":
    # Query is the first command-line argument (taxon ID or organism name);
    # without one, fall back to the original example query.
    organism_tax_or_name = sys.argv[1] if len(sys.argv) > 1 else "Aleurina"

    query_response = fetch_tax_info(organism_tax_or_name)
    if query_response is None:
        # fetch_tax_info returns None after printing the API error; stop here
        # instead of failing on a None subscript.
        sys.exit(f"could not fetch taxonomy metadata for {organism_tax_or_name!r}")

    info = query_response["taxonomy_nodes"][0]["taxonomy"]
    organism_name = info["organism_name"]
    organism_taxon = info["tax_id"]
    # The query taxon goes last so that it is the first one tried, before
    # falling back to the entries of its lineage.
    full_lineage = info["lineage"] + [organism_taxon]

    try:
        nearest_taxon, accessions = nearest_available_taxon(full_lineage)
    except IndexError:
        # Raised when the whole lineage was searched without a hit.
        sys.exit(f"no reference genomes found for taxon({organism_taxon}) or its lineage")

    nearest_response = fetch_tax_info(nearest_taxon)
    if nearest_response is None:
        sys.exit(f"could not fetch taxonomy metadata for taxon({nearest_taxon})")
    nearest_organism_name = nearest_response["taxonomy_nodes"][0]["taxonomy"]["organism_name"]

    print(f"query: organism({organism_name}) taxon({organism_taxon})")
    print(f"nearest_available: organism({nearest_organism_name}) taxon({nearest_taxon})")
    print(f"found {len(accessions)} accessions.")
    print('-' * 20)
    pprint.pprint(accessions)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment