fungs/accession2taxon.py3

## accession2taxon.py3
#!/usr/bin/env python3
# Queries NCBI Entrez for taxon IDs given accessions
# Author: code@fungs.de
# License: free to use for everybody

from xml.etree.ElementTree import fromstring as xmlstringparse, ParseError as XMLParseError
from urllib.request import urlopen
from urllib.error import HTTPError
from time import sleep
from itertools import islice
from sys import stdin, stderr, exit

blocksize = 200            # according to NCBIs entrez GET documentation max is 250 but sometimes URI gets too long then
download_max_tries = 10    # try download again for 10 times
download_error_delay = 30  # wait for 30 sec if something happens, increasing delay with each request

ncbi_efetch_address = "https://www.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
ncbi_efetch_db = "nuccore"

# TODO
# * handling malformed XML by repeating with reduced accession list
# * parse HTML errors and react to errors

accessions = (line.rstrip() for line in stdin)

def inblocks(items, blocksize):
  block = list(islice(items, blocksize))
  while block:
    yield block
    block = list(islice(items, blocksize))

for acclist in inblocks(accessions, blocksize):
  while True:
    failcounter = 0
    while True:
      try:
        xmltext = urlopen("{}?db={}&format=xml&id={}".format(ncbi_efetch_address, ncbi_efetch_db, ",".join(acclist))).read()
        break  # successful read
      except HTTPError as e:
        print("Problem downloading data:", e, file=stderr)
        if failcounter > download_max_tries:
          exit(-1)  # too many failures
        failcounter += 1
        sleep(failcounter*download_error_delay)  # wait a little before restarting
    try:
      xmltree = xmlstringparse(xmltext)
    except XMLParseError as e:
      print("Problem parsing data:", e, file=stderr)
      continue

    for gbseq in xmltree:
      acc = gbseq.find("GBSeq_primary-accession").text
      acc_ver = gbseq.find("GBSeq_accession-version").text
      taxid = gbseq.find("GBSeq_feature-table/GBFeature/GBFeature_quals/GBQualifier[GBQualifier_name='db_xref']/GBQualifier_value").text.split(":",2)[1]
      print("{}\t{}\t{}".format(acc, acc_ver, taxid))
    break
	#!/usr/bin/env python3
	# Queries NCBI Entrez for taxon IDs given accessions
	# Author: code@fungs.de
	# License: free to use for everybody

	from xml.etree.ElementTree import fromstring as xmlstringparse, ParseError as XMLParseError
	from urllib.request import urlopen
	from urllib.error import HTTPError
	from time import sleep
	from itertools import islice
	from sys import stdin, stderr, exit

	blocksize = 200 # according to NCBIs entrez GET documentation max is 250 but sometimes URI gets too long then
	download_max_tries = 10 # try download again for 10 times
	download_error_delay = 30 # wait for 30 sec if something happens, increasing delay with each request

	ncbi_efetch_address = "https://www.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
	ncbi_efetch_db = "nuccore"

	# TODO
	# * handling malformed XML by repeating with reduced accession list
	# * parse HTML errors and react to errors

	accessions = (line.rstrip() for line in stdin)

	def inblocks(items, blocksize):
	block = list(islice(items, blocksize))
	while block:
	yield block
	block = list(islice(items, blocksize))

	for acclist in inblocks(accessions, blocksize):
	while True:
	failcounter = 0
	while True:
	try:
	xmltext = urlopen("{}?db={}&format=xml&id={}".format(ncbi_efetch_address, ncbi_efetch_db, ",".join(acclist))).read()
	break # successful read
	except HTTPError as e:
	print("Problem downloading data:", e, file=stderr)
	if failcounter > download_max_tries:
	exit(-1) # too many failures
	failcounter += 1
	sleep(failcounter*download_error_delay) # wait a little before restarting
	try:
	xmltree = xmlstringparse(xmltext)
	except XMLParseError as e:
	print("Problem parsing data:", e, file=stderr)
	continue

	for gbseq in xmltree:
	acc = gbseq.find("GBSeq_primary-accession").text
	acc_ver = gbseq.find("GBSeq_accession-version").text
	taxid = gbseq.find("GBSeq_feature-table/GBFeature/GBFeature_quals/GBQualifier[GBQualifier_name='db_xref']/GBQualifier_value").text.split(":",2)[1]
	print("{}\t{}\t{}".format(acc, acc_ver, taxid))
	break