Last active
August 29, 2015 14:15
-
-
Save afrendeiro/edc5af41628c886abe0a to your computer and use it in GitHub Desktop.
Get taxonomic distribution of interpro domains
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
from biomart import BiomartServer, BiomartDataset | |
from Bio import Entrez | |
def get_tax_id(specie): | |
"""Get taxon ID for specie.""" | |
specie = specie.replace(" ", "+").strip() | |
search = Entrez.esearch(term=specie, db="taxonomy", retmode="xml") | |
record = Entrez.read(search) | |
if int(record["Count"]) == 0: | |
return None | |
if "IdList" in record.keys(): | |
return record['IdList'][0] | |
def get_tax_data(taxid): | |
"""Fetch the record of a taxon ID.""" | |
search = Entrez.efetch(id=taxid, db="taxonomy", retmode="xml") | |
return Entrez.read(search) | |
# query interpro domains for the prots | |
ids = ["P01106", "P17947"] # some examples | |
server = BiomartServer("http://www.biomart.org/biomart") | |
uniprot = server.datasets['uniprot'] | |
attributes = ['accession', 'ensembl_id', 'entry_type', 'gene_name', 'name', 'interpro_id'] | |
response = uniprot.search({ | |
'filters': {'accession': ids}, | |
'attributes': attributes | |
}) | |
df = pd.DataFrame([line.split("\t") for line in list(response.content.strip().split("\n"))], | |
columns=attributes | |
) | |
# Query taxonomies with domains | |
domains = df['interpro_id'] | |
attributes = ['entry_id', 'entry_type', 'entry_name', 'taxonomy_scientific_name'] | |
interpro = BiomartDataset("http://www.biomart.org/biomart", name='entry') | |
for domain in domains: | |
response = interpro.search({ | |
'filters': {'entry_id': domain}, | |
'attributes': attributes | |
}) | |
taxons = pd.DataFrame([line.split("\t") for line in list(response.content.strip().split("\n"))], | |
columns=attributes | |
) | |
if domain == domains[0]: | |
df = taxons | |
else: | |
df = pd.concat([df, taxons]) | |
# get full taxonomy lineage of species | |
Entrez.email = "" # enter your email here | |
df['taxonomy'] = None # non matched species will have None in the end | |
for specie in unique(df["taxonomy_scientific_name"]): | |
taxid = get_tax_id(specie) | |
if taxid is None: | |
continue | |
data = get_tax_data(taxid) | |
if len(data) >= 1: | |
if "Lineage" in data[0].keys(): | |
# add taxonomy to all rows with this specie name | |
df['taxonomy'][df['taxonomy_scientific_name'] == specie] = data[0]["Lineage"] |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment