"""Get the taxonomic distribution of InterPro domains."""
import pandas as pd
from biomart import BiomartServer, BiomartDataset
from Bio import Entrez
def get_tax_id(specie):
    """Get the NCBI Taxonomy ID for a species name.

    Args:
        specie: Scientific name of the species (e.g. "Homo sapiens").

    Returns:
        The first taxon ID reported by the search, or None when the
        search has no hits or the result carries no ID list.
    """
    # Strip before substituting: stripping afterwards cannot remove outer
    # whitespace that has already been turned into "+".
    term = specie.strip().replace(" ", "+")
    handle = Entrez.esearch(term=term, db="taxonomy", retmode="xml")
    try:
        # Parse the XML reply into a dict-like record.
        record = Entrez.read(handle)
    finally:
        handle.close()

    if int(record["Count"]) == 0:
        return None
    if "IdList" in record and record["IdList"]:
        return record["IdList"][0]
    return None
def get_tax_data(taxid):
    """Fetch and parse the NCBI Taxonomy record of a taxon ID.

    Args:
        taxid: Taxon ID, as returned by ``get_tax_id``.

    Returns:
        The parsed efetch result: a sequence of record dicts (callers
        index it as ``data[0]`` and look up keys such as ``"Lineage"``).
    """
    handle = Entrez.efetch(id=taxid, db="taxonomy", retmode="xml")
    try:
        # Without parsing and returning the record the caller receives None.
        return Entrez.read(handle)
    finally:
        handle.close()
def _response_to_frame(response, columns):
    """Parse a tab-separated BioMart response into a DataFrame.

    Args:
        response: Object returned by ``BiomartDataset.search`` (assumed to be
            a ``requests.Response`` -- confirm against the biomart version
            in use).
        columns: Column names, in the order the attributes were requested.

    Returns:
        A DataFrame with one row per non-empty response line.
    """
    # Use the decoded text: `response.content` is bytes on Python 3 and
    # cannot be split on a str separator. Blank lines are dropped so an
    # empty reply yields an empty frame instead of a malformed one-cell row.
    rows = [line.split("\t") for line in response.text.strip().splitlines() if line]
    return pd.DataFrame(rows, columns=columns)


# Query the InterPro domains of the proteins.
ids = ["P01106", "P17947"]  # some examples

# NOTE: the BioMart server URL is blank in this file; fill it in before running.
server = BiomartServer("")
uniprot = server.datasets['uniprot']
attributes = ['accession', 'ensembl_id', 'entry_type', 'gene_name', 'name', 'interpro_id']
response = uniprot.search({
    'filters': {'accession': ids},
    'attributes': attributes,
})
df = _response_to_frame(response, attributes)

# Query the taxonomies in which each domain occurs.
domains = df['interpro_id']
attributes = ['entry_id', 'entry_type', 'entry_name', 'taxonomy_scientific_name']
# NOTE: same blank BioMart URL as above; fill it in before running.
interpro = BiomartDataset("", name='entry')

# Collect one frame per domain and concatenate once. Re-assigning `df` inside
# the loop duplicated the first domain's rows. Each distinct domain is queried
# a single time, since several proteins may share a domain.
frames = []
for domain in domains.unique():
    if not domain:
        # Protein without an InterPro annotation: nothing to look up.
        continue
    response = interpro.search({
        'filters': {'entry_id': domain},
        'attributes': attributes,
    })
    taxons = _response_to_frame(response, attributes)
    frames.append(taxons)

if frames:
    # A fresh 0..n-1 index keeps the label-based assignment below unambiguous.
    df = pd.concat(frames, ignore_index=True)
else:
    df = pd.DataFrame(columns=attributes)

# Get the full taxonomy lineage of each species.
Entrez.email = ""  # enter your email here
df['taxonomy'] = None  # non matched species will have None in the end

for specie in df["taxonomy_scientific_name"].unique():
    taxid = get_tax_id(specie)
    if taxid is None:
        # No taxon found for this name; leave its rows as None.
        continue
    data = get_tax_data(taxid)
    if len(data) >= 1 and "Lineage" in data[0]:
        # Add the taxonomy to all rows with this species name. `.loc` writes
        # into `df` itself, whereas chained indexing may write to a copy.
        df.loc[df['taxonomy_scientific_name'] == specie, 'taxonomy'] = data[0]["Lineage"]
