Skip to content

Instantly share code, notes, and snippets.

@fungs
Created December 13, 2018 12:34
Show Gist options
  • Save fungs/17cddb090c4f2b4f50c6a32044635beb to your computer and use it in GitHub Desktop.
Save fungs/17cddb090c4f2b4f50c6a32044635beb to your computer and use it in GitHub Desktop.
Query NCBI taxon IDs online using accession
#!/usr/bin/env python3
# Queries NCBI Entrez for taxon IDs given accessions
# Author: code@fungs.de
# License: free to use for everybody
from xml.etree.ElementTree import fromstring as xmlstringparse, ParseError as XMLParseError
from urllib.request import urlopen
from urllib.error import HTTPError
from time import sleep
from itertools import islice
from sys import stdin, stderr, exit
blocksize = 200 # according to NCBIs entrez GET documentation max is 250 but sometimes URI gets too long then
download_max_tries = 10 # try download again for 10 times
download_error_delay = 30 # wait for 30 sec if something happens, increasing delay with each request
ncbi_efetch_address = "https://www.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
ncbi_efetch_db = "nuccore"
# TODO
# * handling malformed XML by repeating with reduced accession list
# * parse HTML errors and react to errors
accessions = (line.rstrip() for line in stdin)
def inblocks(items, blocksize):
block = list(islice(items, blocksize))
while block:
yield block
block = list(islice(items, blocksize))
for acclist in inblocks(accessions, blocksize):
while True:
failcounter = 0
while True:
try:
xmltext = urlopen("{}?db={}&format=xml&id={}".format(ncbi_efetch_address, ncbi_efetch_db, ",".join(acclist))).read()
break # successful read
except HTTPError as e:
print("Problem downloading data:", e, file=stderr)
if failcounter > download_max_tries:
exit(-1) # too many failures
failcounter += 1
sleep(failcounter*download_error_delay) # wait a little before restarting
try:
xmltree = xmlstringparse(xmltext)
except XMLParseError as e:
print("Problem parsing data:", e, file=stderr)
continue
for gbseq in xmltree:
acc = gbseq.find("GBSeq_primary-accession").text
acc_ver = gbseq.find("GBSeq_accession-version").text
taxid = gbseq.find("GBSeq_feature-table/GBFeature/GBFeature_quals/GBQualifier[GBQualifier_name='db_xref']/GBQualifier_value").text.split(":",2)[1]
print("{}\t{}\t{}".format(acc, acc_ver, taxid))
break
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment