afrendeiro/taxon_distribution_interpro_domains.py

## taxon_distribution_interpro_domains.py
import pandas as pd
from biomart import BiomartServer, BiomartDataset
from Bio import Entrez


def get_tax_id(specie):
    """Get taxon ID for specie."""
    specie = specie.replace(" ", "+").strip()
    search = Entrez.esearch(term=specie, db="taxonomy", retmode="xml")
    record = Entrez.read(search)
    if int(record["Count"]) == 0:
        return None
    if "IdList" in record.keys():
        return record['IdList'][0]


def get_tax_data(taxid):
    """Fetch the record of a taxon ID."""
    search = Entrez.efetch(id=taxid, db="taxonomy", retmode="xml")
    return Entrez.read(search)


# query interpro domains for the prots
ids = ["P01106", "P17947"] # some examples

server = BiomartServer("http://www.biomart.org/biomart")
uniprot = server.datasets['uniprot']

attributes = ['accession', 'ensembl_id', 'entry_type', 'gene_name', 'name', 'interpro_id']
response = uniprot.search({
    'filters': {'accession': ids},
    'attributes': attributes
})

df = pd.DataFrame([line.split("\t") for line in list(response.content.strip().split("\n"))],
    				  columns=attributes
    				  )

# Query taxonomies with domains
domains = df['interpro_id']
attributes = ['entry_id', 'entry_type', 'entry_name', 'taxonomy_scientific_name']

interpro = BiomartDataset("http://www.biomart.org/biomart", name='entry')

for domain in domains:
    response = interpro.search({
        'filters': {'entry_id': domain},
        'attributes': attributes
    })
    taxons = pd.DataFrame([line.split("\t") for line in list(response.content.strip().split("\n"))],
				          columns=attributes
				    	  )
    if domain == domains[0]:
        df = taxons
    else:
        df = pd.concat([df, taxons])

# get full taxonomy lineage of species
Entrez.email = "" # enter your email here

df['taxonomy'] = None  # non matched species will have None in the end
for specie in unique(df["taxonomy_scientific_name"]):
    taxid = get_tax_id(specie)
    if taxid is None:
        continue
    data = get_tax_data(taxid)
    if len(data) >= 1:
        if "Lineage" in data[0].keys():
        	# add taxonomy to all rows with this specie name
            df['taxonomy'][df['taxonomy_scientific_name'] == specie] = data[0]["Lineage"]
	import pandas as pd
	from biomart import BiomartServer, BiomartDataset
	from Bio import Entrez


	def get_tax_id(specie):
	"""Get taxon ID for specie."""
	specie = specie.replace(" ", "+").strip()
	search = Entrez.esearch(term=specie, db="taxonomy", retmode="xml")
	record = Entrez.read(search)
	if int(record["Count"]) == 0:
	return None
	if "IdList" in record.keys():
	return record['IdList'][0]


	def get_tax_data(taxid):
	"""Fetch the record of a taxon ID."""
	search = Entrez.efetch(id=taxid, db="taxonomy", retmode="xml")
	return Entrez.read(search)


	# query interpro domains for the prots
	ids = ["P01106", "P17947"] # some examples

	server = BiomartServer("http://www.biomart.org/biomart")
	uniprot = server.datasets['uniprot']

	attributes = ['accession', 'ensembl_id', 'entry_type', 'gene_name', 'name', 'interpro_id']
	response = uniprot.search({
	'filters': {'accession': ids},
	'attributes': attributes
	})

	df = pd.DataFrame([line.split("\t") for line in list(response.content.strip().split("\n"))],
	columns=attributes
	)

	# Query taxonomies with domains
	domains = df['interpro_id']
	attributes = ['entry_id', 'entry_type', 'entry_name', 'taxonomy_scientific_name']

	interpro = BiomartDataset("http://www.biomart.org/biomart", name='entry')

	for domain in domains:
	response = interpro.search({
	'filters': {'entry_id': domain},
	'attributes': attributes
	})
	taxons = pd.DataFrame([line.split("\t") for line in list(response.content.strip().split("\n"))],
	columns=attributes
	)
	if domain == domains[0]:
	df = taxons
	else:
	df = pd.concat([df, taxons])

	# get full taxonomy lineage of species
	Entrez.email = "" # enter your email here

	df['taxonomy'] = None # non matched species will have None in the end
	for specie in unique(df["taxonomy_scientific_name"]):
	taxid = get_tax_id(specie)
	if taxid is None:
	continue
	data = get_tax_data(taxid)
	if len(data) >= 1:
	if "Lineage" in data[0].keys():
	# add taxonomy to all rows with this specie name
	df['taxonomy'][df['taxonomy_scientific_name'] == specie] = data[0]["Lineage"]