Created
December 13, 2018 12:34
-
-
Save fungs/17cddb090c4f2b4f50c6a32044635beb to your computer and use it in GitHub Desktop.
Query NCBI taxon IDs online using accession
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
# Queries NCBI Entrez for taxon IDs given accessions | |
# Author: code@fungs.de | |
# License: free to use for everybody | |
from xml.etree.ElementTree import fromstring as xmlstringparse, ParseError as XMLParseError | |
from urllib.request import urlopen | |
from urllib.error import HTTPError | |
from time import sleep | |
from itertools import islice | |
from sys import stdin, stderr, exit | |
blocksize = 200 # according to NCBIs entrez GET documentation max is 250 but sometimes URI gets too long then | |
download_max_tries = 10 # try download again for 10 times | |
download_error_delay = 30 # wait for 30 sec if something happens, increasing delay with each request | |
ncbi_efetch_address = "https://www.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi" | |
ncbi_efetch_db = "nuccore" | |
# TODO | |
# * handling malformed XML by repeating with reduced accession list | |
# * parse HTML errors and react to errors | |
accessions = (line.rstrip() for line in stdin) | |
def inblocks(items, blocksize): | |
block = list(islice(items, blocksize)) | |
while block: | |
yield block | |
block = list(islice(items, blocksize)) | |
for acclist in inblocks(accessions, blocksize): | |
while True: | |
failcounter = 0 | |
while True: | |
try: | |
xmltext = urlopen("{}?db={}&format=xml&id={}".format(ncbi_efetch_address, ncbi_efetch_db, ",".join(acclist))).read() | |
break # successful read | |
except HTTPError as e: | |
print("Problem downloading data:", e, file=stderr) | |
if failcounter > download_max_tries: | |
exit(-1) # too many failures | |
failcounter += 1 | |
sleep(failcounter*download_error_delay) # wait a little before restarting | |
try: | |
xmltree = xmlstringparse(xmltext) | |
except XMLParseError as e: | |
print("Problem parsing data:", e, file=stderr) | |
continue | |
for gbseq in xmltree: | |
acc = gbseq.find("GBSeq_primary-accession").text | |
acc_ver = gbseq.find("GBSeq_accession-version").text | |
taxid = gbseq.find("GBSeq_feature-table/GBFeature/GBFeature_quals/GBQualifier[GBQualifier_name='db_xref']/GBQualifier_value").text.split(":",2)[1] | |
print("{}\t{}\t{}".format(acc, acc_ver, taxid)) | |
break |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment