Skip to content

Instantly share code, notes, and snippets.

What would you like to do?
Get taxonomic distribution of interpro domains
import pandas as pd
from biomart import BiomartServer, BiomartDataset
from Bio import Entrez
def get_tax_id(specie):
"""Get taxon ID for specie."""
specie = specie.replace(" ", "+").strip()
search = Entrez.esearch(term=specie, db="taxonomy", retmode="xml")
record =
if int(record["Count"]) == 0:
return None
if "IdList" in record.keys():
return record['IdList'][0]
def get_tax_data(taxid):
"""Fetch the record of a taxon ID."""
search = Entrez.efetch(id=taxid, db="taxonomy", retmode="xml")
# query interpro domains for the prots
ids = ["P01106", "P17947"] # some examples
server = BiomartServer("")
uniprot = server.datasets['uniprot']
attributes = ['accession', 'ensembl_id', 'entry_type', 'gene_name', 'name', 'interpro_id']
response ={
'filters': {'accession': ids},
'attributes': attributes
df = pd.DataFrame([line.split("\t") for line in list(response.content.strip().split("\n"))],
# Query taxonomies with domains
domains = df['interpro_id']
attributes = ['entry_id', 'entry_type', 'entry_name', 'taxonomy_scientific_name']
interpro = BiomartDataset("", name='entry')
for domain in domains:
response ={
'filters': {'entry_id': domain},
'attributes': attributes
taxons = pd.DataFrame([line.split("\t") for line in list(response.content.strip().split("\n"))],
if domain == domains[0]:
df = taxons
df = pd.concat([df, taxons])
# get full taxonomy lineage of species = "" # enter your email here
df['taxonomy'] = None # non matched species will have None in the end
for specie in unique(df["taxonomy_scientific_name"]):
taxid = get_tax_id(specie)
if taxid is None:
data = get_tax_data(taxid)
if len(data) >= 1:
if "Lineage" in data[0].keys():
# add taxonomy to all rows with this specie name
df['taxonomy'][df['taxonomy_scientific_name'] == specie] = data[0]["Lineage"]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment