Skip to content

Instantly share code, notes, and snippets.

@fjossinet fjossinet/
Last active Dec 17, 2015

What would you like to do?
This Python script retrieves the GenBank IDs of all the nucleotide entries linked to a taxon ID. The number of requests is minimized by paging through the results with the retmax and retstart parameters provided by the Entrez Utilities.
#!/usr/bin/env python
import xml.etree.ElementTree as ET
import sys, urllib, urllib2
# Base URL of the NCBI Entrez E-utilities; the endpoint file name
# (esearch.fcgi, esummary.fcgi) is appended directly, so keep the trailing "/".
eutils_base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
def get_ids(taxid):
    """Return the captions of every nucleotide entry linked to a taxon.

    The ids are fetched from ``esearch`` one page at a time (``retstart`` /
    ``retmax``); each page of ids is then passed to ``esummary`` and the text
    of every ``Caption`` item in the returned ``DocSum`` elements is
    collected.

    Args:
        taxid: taxon id, interpolated into the search term
            ``txid<taxid>[Organism:exp]``.

    Returns:
        A list with the text of each ``Caption`` item, in the order the
        pages were processed. If a request or XML parse fails, the error is
        printed and the captions gathered so far are returned.
    """
    accession_numbers = []
    retstart = 0
    iteration_step = 10000  # page size used for both esearch and esummary
    while True:
        try:
            search_root = ET.fromstring(
                esearch(
                    db="nucleotide",
                    term="txid%s[Organism:exp]" % taxid,
                    retstart=retstart,
                    retmax=iteration_step,
                )
            )
            id_list = search_root.find('IdList')
            ids = []
            if id_list is not None:
                ids = [node.text for node in id_list.findall('Id')]
            if not ids:
                # An empty page means the result set is exhausted.
                break
            summary_root = ET.fromstring(
                esummary(db="nucleotide", ids=ids, retmax=iteration_step)
            )
            for docsum in summary_root.findall('DocSum'):
                for item in docsum.findall("Item[@Name='Caption']"):
                    accession_numbers.append(item.text)
        except Exception as e:
            # Without a successful search we cannot tell whether more pages
            # exist; carrying on would loop forever on a persistent failure.
            print(e)
            break
        retstart += iteration_step
    return accession_numbers
def esearch(db, term, retstart = 0, retmax = 20):
    """Run an ESearch query and return the raw response body.

    Args:
        db: name of the Entrez database to search.
        term: search term; it is percent-encoded before being sent.
        retstart: index of the first id to return.
        retmax: maximum number of ids to return.

    Returns:
        The response body as a ``str`` (callers in this file parse it as XML).
    """
    # urlencode escapes the brackets/colon/spaces a search term may contain;
    # a list of pairs keeps the parameter order stable.
    query = urllib.urlencode([
        ("db", db),
        ("term", term),
        ("retstart", retstart),
        ("retmax", retmax),
    ])
    response = urllib.urlopen("%sesearch.fcgi?%s" % (eutils_base_url, query))
    try:
        content = str(response.read())
    finally:
        # Release the connection even if reading fails.
        response.close()
    return content
def esummary(db, ids, retstart = 0, retmax = 20):
    """POST a list of ids to ESummary and return the raw response body.

    The ids are sent in the request body rather than the URL, so a page of
    thousands of ids does not hit URL length limits.

    Args:
        db: name of the Entrez database the ids belong to.
        ids: iterable of id strings; joined with commas into the ``id`` field.
        retstart: index of the first document summary to return. Only sent
            when non-zero, so the service default applies otherwise.
            NOTE(review): the original field list was lost; confirm against
            the E-utilities documentation.
        retmax: maximum number of document summaries to return.

    Returns:
        The response body as a ``str`` (callers in this file parse it as XML).
    """
    data = {
        "db": db,
        "id": ",".join(ids),
        "retmax": retmax,
    }
    if retstart:
        data["retstart"] = retstart
    data = urllib.urlencode(data)
    # Supplying a data payload makes urllib2 issue a POST request.
    req = urllib2.Request("%sesummary.fcgi" % eutils_base_url, data)
    response = urllib2.urlopen(req)
    try:
        content = str(response.read())
    finally:
        # Release the connection even if reading fails.
        response.close()
    return content
if __name__ == '__main__':
    # Command-line entry point: expects "-id <taxid>" among the arguments.
    taxid = None
    if "-id" in sys.argv:
        flag_pos = sys.argv.index("-id")
        # Guard against "-id" being the last argument with no value after it.
        if flag_pos + 1 < len(sys.argv):
            taxid = sys.argv[flag_pos + 1]
    if not taxid:
        print("Usage: -id taxid")
        print("Example: -id 4754")
        # Stop here instead of querying with a missing taxon id.
        sys.exit(1)
    ids = get_ids(taxid)
    print(ids)
    print("%i ids found..." % len(ids))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.