Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Get taxonomic distribution of interpro domains
import pandas as pd
from biomart import BiomartServer, BiomartDataset
from Bio import Entrez
def get_tax_id(specie):
"""Get taxon ID for specie."""
specie = specie.replace(" ", "+").strip()
search = Entrez.esearch(term=specie, db="taxonomy", retmode="xml")
record = Entrez.read(search)
if int(record["Count"]) == 0:
return None
if "IdList" in record.keys():
return record['IdList'][0]
def get_tax_data(taxid):
"""Fetch the record of a taxon ID."""
search = Entrez.efetch(id=taxid, db="taxonomy", retmode="xml")
return Entrez.read(search)
# query interpro domains for the prots
ids = ["P01106", "P17947"] # some examples
server = BiomartServer("http://www.biomart.org/biomart")
uniprot = server.datasets['uniprot']
attributes = ['accession', 'ensembl_id', 'entry_type', 'gene_name', 'name', 'interpro_id']
response = uniprot.search({
'filters': {'accession': ids},
'attributes': attributes
})
df = pd.DataFrame([line.split("\t") for line in list(response.content.strip().split("\n"))],
columns=attributes
)
# Query taxonomies with domains
domains = df['interpro_id']
attributes = ['entry_id', 'entry_type', 'entry_name', 'taxonomy_scientific_name']
interpro = BiomartDataset("http://www.biomart.org/biomart", name='entry')
for domain in domains:
response = interpro.search({
'filters': {'entry_id': domain},
'attributes': attributes
})
taxons = pd.DataFrame([line.split("\t") for line in list(response.content.strip().split("\n"))],
columns=attributes
)
if domain == domains[0]:
df = taxons
else:
df = pd.concat([df, taxons])
# get full taxonomy lineage of species
Entrez.email = "" # enter your email here
df['taxonomy'] = None # non matched species will have None in the end
for specie in unique(df["taxonomy_scientific_name"]):
taxid = get_tax_id(specie)
if taxid is None:
continue
data = get_tax_data(taxid)
if len(data) >= 1:
if "Lineage" in data[0].keys():
# add taxonomy to all rows with this specie name
df['taxonomy'][df['taxonomy_scientific_name'] == specie] = data[0]["Lineage"]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment