Skip to content

Instantly share code, notes, and snippets.

@afrendeiro
Last active August 29, 2015 14:15
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save afrendeiro/edc5af41628c886abe0a to your computer and use it in GitHub Desktop.
Save afrendeiro/edc5af41628c886abe0a to your computer and use it in GitHub Desktop.
Get taxonomic distribution of interpro domains
import pandas as pd
from biomart import BiomartServer, BiomartDataset
from Bio import Entrez
def get_tax_id(specie):
"""Get taxon ID for specie."""
specie = specie.replace(" ", "+").strip()
search = Entrez.esearch(term=specie, db="taxonomy", retmode="xml")
record = Entrez.read(search)
if int(record["Count"]) == 0:
return None
if "IdList" in record.keys():
return record['IdList'][0]
def get_tax_data(taxid):
"""Fetch the record of a taxon ID."""
search = Entrez.efetch(id=taxid, db="taxonomy", retmode="xml")
return Entrez.read(search)
# query interpro domains for the prots
ids = ["P01106", "P17947"] # some examples
server = BiomartServer("http://www.biomart.org/biomart")
uniprot = server.datasets['uniprot']
attributes = ['accession', 'ensembl_id', 'entry_type', 'gene_name', 'name', 'interpro_id']
response = uniprot.search({
'filters': {'accession': ids},
'attributes': attributes
})
df = pd.DataFrame([line.split("\t") for line in list(response.content.strip().split("\n"))],
columns=attributes
)
# Query taxonomies with domains
domains = df['interpro_id']
attributes = ['entry_id', 'entry_type', 'entry_name', 'taxonomy_scientific_name']
interpro = BiomartDataset("http://www.biomart.org/biomart", name='entry')
for domain in domains:
response = interpro.search({
'filters': {'entry_id': domain},
'attributes': attributes
})
taxons = pd.DataFrame([line.split("\t") for line in list(response.content.strip().split("\n"))],
columns=attributes
)
if domain == domains[0]:
df = taxons
else:
df = pd.concat([df, taxons])
# get full taxonomy lineage of species
Entrez.email = "" # enter your email here
df['taxonomy'] = None # non matched species will have None in the end
for specie in unique(df["taxonomy_scientific_name"]):
taxid = get_tax_id(specie)
if taxid is None:
continue
data = get_tax_data(taxid)
if len(data) >= 1:
if "Lineage" in data[0].keys():
# add taxonomy to all rows with this specie name
df['taxonomy'][df['taxonomy_scientific_name'] == specie] = data[0]["Lineage"]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment