Skip to content

Instantly share code, notes, and snippets.



Last active Dec 17, 2015
What would you like to do?
This Python script retrieves the GenBank IDs for all the nucleotide entries linked to a taxon ID. The number of requests is minimized using the retmax and retstart parameters provided by the Entrez Utilities.
#!/usr/bin/env python
import xml.etree.ElementTree as ET
import sys, urllib, urllib2
# Base URL of the NCBI Entrez Programming Utilities; the endpoint file name
# (esearch.fcgi, esummary.fcgi) is appended directly, so keep the trailing "/".
eutils_base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
def get_ids(taxid):
    """Return the accession captions of all nucleotide records under a taxon.

    The taxon is searched with the term ``txid<taxid>[Organism:exp]`` and the
    matching UIDs are fetched page by page through ``esearch`` (using
    ``retstart``/``retmax``) to keep the number of requests small. Each page
    of UIDs is then passed to ``esummary`` and the text of every ``Caption``
    item found in the returned ``DocSum`` elements is collected.

    Args:
        taxid: NCBI taxonomy identifier (anything ``%s`` can format).

    Returns:
        A list with the text of each ``Caption`` item, in response order.
        If a request or XML parse fails, the error is printed and the
        captions collected so far are returned.
    """
    accession_numbers = []
    retstart = 0
    iteration_step = 10000
    term = "txid%s[Organism:exp]" % taxid
    while True:
        try:
            search_xml = esearch(db="nucleotide", term=term,
                                 retstart=retstart, retmax=iteration_step)
            search_tree = ET.fromstring(search_xml)
            id_list = search_tree.find('IdList')
            ids = []
            if id_list is not None:
                ids = [node.text for node in id_list.findall('Id')]
            if not ids:
                # An empty page means every matching UID has been consumed.
                break
            summary_xml = esummary(db="nucleotide", ids=ids,
                                   retmax=iteration_step)
            summary_tree = ET.fromstring(summary_xml)
            for docsum in summary_tree.findall('DocSum'):
                for item in docsum.findall("Item[@Name='Caption']"):
                    accession_numbers.append(item.text)
        except Exception as e:
            # Best effort: report the failure and stop paging. Retrying the
            # next offset after an error could loop forever on a dead service.
            print(e)
            break
        if len(ids) < iteration_step:
            # A short page is the last page; skip the extra empty request.
            break
        retstart += iteration_step
    return accession_numbers
def esearch(db, term, retstart = 0, retmax = 20):
    """Run an Entrez ESearch query and return the raw response body.

    Args:
        db: Entrez database name (e.g. "nucleotide").
        term: Search term; it is URL-encoded here, so pass it unescaped.
        retstart: Index of the first UID to return.
        retmax: Maximum number of UIDs to return.

    Returns:
        The response body as a string (XML text from the service).
    """
    # urlencode escapes characters such as "[", "]" and ":" that appear in
    # Entrez field qualifiers; a list of pairs keeps the parameter order.
    query = urllib.urlencode([
        ("db", db),
        ("term", term),
        ("retstart", "%i" % retstart),
        ("retmax", "%i" % retmax),
    ])
    response = urllib.urlopen("%sesearch.fcgi?%s" % (eutils_base_url, query))
    try:
        content = str(response.read())
    finally:
        # Release the connection even if reading fails.
        response.close()
    return content
def esummary(db, ids, retstart = 0, retmax = 20):
    """Run an Entrez ESummary request and return the raw response body.

    The request is sent as an HTTP POST so that long UID lists do not have to
    fit in the URL.

    Args:
        db: Entrez database name (e.g. "nucleotide").
        ids: Iterable of UIDs; they are joined with commas for the request.
        retstart: Index of the first document summary to return.
        retmax: Maximum number of document summaries to return.

    Returns:
        The response body as a string (XML text from the service).
    """
    # NOTE(review): field names follow the documented ESummary parameters
    # (db, id, retstart, retmax) -- confirm against the original script.
    data = {
        "db": db,
        "id": ",".join(str(uid) for uid in ids),
        "retstart": retstart,
        "retmax": retmax,
    }
    data = urllib.urlencode(data)
    # Supplying a data payload makes urllib2 issue a POST instead of a GET.
    req = urllib2.Request("%sesummary.fcgi" % eutils_base_url, data)
    response = urllib2.urlopen(req)
    try:
        content = str(response.read())
    finally:
        # Release the connection even if reading fails.
        response.close()
    return content
if __name__ == '__main__':
    # Command-line entry point: expects "-id <taxid>" among the arguments.
    taxid = None
    if "-id" in sys.argv:
        value_index = sys.argv.index("-id") + 1
        # "-id" given as the last argument has no value to read.
        if value_index < len(sys.argv):
            taxid = sys.argv[value_index]
    if not taxid:
        print("Usage: -id taxid")
        print("Example: -id 4754")
        # Stop here: there is no taxon to query without an id.
        sys.exit(1)
    ids = get_ids(taxid)
    print(ids)
    print("%i ids found..." % len(ids))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.