Skip to content

Instantly share code, notes, and snippets.

@fjossinet
Last active December 17, 2015 21:19
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save fjossinet/5673672 to your computer and use it in GitHub Desktop.
Save fjossinet/5673672 to your computer and use it in GitHub Desktop.
This Python script retrieves the GenBank identifiers of all the nucleotide entries linked to a taxon ID. The number of requests is minimized by paging through the results with the retmax and retstart parameters provided by the Entrez Utilities.
#!/usr/bin/env python
import xml.etree.ElementTree as ET
import sys, urllib, urllib2
eutils_base_url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
def get_ids(taxid):
    """Collect the 'Caption' value of every nucleotide entry linked to a taxon.

    The ``nucleotide`` database is searched with the term
    ``txid<taxid>[Organism:exp]``, one page of ``iteration_step`` ids at a
    time. Each page of ids is passed to ``esummary`` and the text of every
    ``Item`` element whose ``Name`` attribute is ``Caption`` is collected.

    Paging stops as soon as a search page contains no ``Id`` element (or no
    ``IdList`` at all), or when a search page cannot be parsed.

    Args:
        taxid: taxon id; it is interpolated into the search term with ``%s``.

    Returns:
        A list with the text of all the ``Caption`` items found, in the
        order the pages were processed.
    """
    accession_numbers = []
    retstart = 0
    iteration_step = 10000
    term = "txid%s[Organism:exp]" % taxid
    while True:
        search_page = esearch(db="nucleotide", term=term,
                              retstart=retstart, retmax=iteration_step)
        try:
            id_list = ET.fromstring(search_page).find('IdList')
        except Exception as e:
            # Without a parsable page there is no way to know whether more
            # ids remain; stop here instead of requesting pages forever.
            print(e)
            break

        if id_list is None:
            break
        ids = [node.text for node in id_list.findall('Id')]
        if not ids:
            # An empty IdList means there is nothing left to fetch.
            break

        try:
            summary = ET.fromstring(
                esummary(db="nucleotide", ids=ids, retmax=iteration_step))
            for docsum in summary.findall('DocSum'):
                for item in docsum.findall("Item[@Name='Caption']"):
                    accession_numbers.append(item.text)
        except Exception as e:
            # Best effort: report the failure for this page and move on to
            # the next one, keeping whatever was collected so far.
            print(e)
        retstart += iteration_step
    return accession_numbers
def esearch(db, term, retstart=0, retmax=20):
    """Run an ``esearch.fcgi`` request and return the response body.

    The request is a GET on ``eutils_base_url`` + ``esearch.fcgi`` with the
    ``db``, ``term``, ``retstart`` and ``retmax`` query parameters.

    Args:
        db: value of the ``db`` query parameter.
        term: value of the ``term`` query parameter; it is URL-encoded here,
            so pass it unescaped.
        retstart: value of the ``retstart`` query parameter.
        retmax: value of the ``retmax`` query parameter.

    Returns:
        The body of the response, as a string.
    """
    # A list of pairs keeps the parameter order stable; urlencode escapes
    # characters such as spaces, '&', '[' and ']' that would otherwise
    # corrupt the query string.
    query = urllib.urlencode([
        ('db', db),
        ('term', term),
        ('retstart', retstart),
        ('retmax', retmax),
    ])
    response = urllib.urlopen("%sesearch.fcgi?%s" % (eutils_base_url, query))
    try:
        return str(response.read())
    finally:
        # Release the connection even when reading the body fails.
        response.close()
def esummary(db, ids, retstart=0, retmax=20):
    """POST a list of ids to ``esummary.fcgi`` and return the response body.

    Args:
        db: value of the ``db`` form field.
        ids: iterable of id strings; they are joined with commas into the
            ``id`` form field.
        retstart: accepted for signature compatibility; not sent with the
            request.
        retmax: accepted for signature compatibility; not sent with the
            request.

    Returns:
        The body of the response, as a string.
    """
    payload = urllib.urlencode({
        'db': db,
        'id': ','.join(ids),
    })
    # Supplying a data argument makes urllib2 issue a POST request.
    request = urllib2.Request("%sesummary.fcgi" % eutils_base_url, payload)
    response = urllib2.urlopen(request)
    try:
        return str(response.read())
    finally:
        # Release the connection even when reading the body fails.
        response.close()
if __name__ == '__main__':
    # Command-line entry point: taxid_2_gbids.py -id <taxid>
    taxid = None
    if "-id" in sys.argv:
        value_index = sys.argv.index("-id") + 1
        # "-id" given as the last argument has no value: fall through to the
        # usage message instead of failing with an IndexError.
        if value_index < len(sys.argv):
            taxid = sys.argv[value_index]
    if not taxid:
        print("Usage: taxid_2_gbids.py -id taxid")
        print("Example: taxid_2_gbids.py -id 4754")
        sys.exit(-1)
    ids = get_ids(taxid)
    print(ids)
    print("%i ids found..." % len(ids))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment