ShaiberAlon/summarize_blast_results_of_anvio_HMMs.py

## summarize_blast_results_of_anvio_HMMs.py
#!/usr/bin/env python

# Click 'Download > Multiple-file JSON' from NCBI search results page,
# unzip it, run this script in it, this way:
# python summarize_blast_results_of_anvio_HMMs.py OUTPUT_FILE.txt

import sys
import json
import glob

output_file_name = sys.argv[1]
output_file = open(output_file_name, "w")

# poor man's whatever:
QUERY    = lambda: hits['BlastOutput2']['report']['results']['search']['query_title'].split('___')[0]
QLEN     = lambda: hits['BlastOutput2']['report']['results']['search']['query_len']
HIT      = lambda: hits['BlastOutput2']['report']['results']['search']['hits'][index]
DESC     = lambda: HIT()['description'][0]
TITLE    = lambda: DESC()['title']
SCINAME  = lambda: '_%s_' % DESC()['sciname']
ACC      = lambda: '[%(desc)s](https://www.ncbi.nlm.nih.gov/protein/%(desc)s)' % {'desc': DESC()['accession']}
HSPS     = lambda: HIT()['hsps'][0]
PCTALIGN = lambda: '%.2f%%' % (((HSPS()['align_len']* 100 / QLEN())))
PCTID    = lambda: '%.2f%%' % (HSPS()['identity'] * 100 / HSPS()['align_len'])
CONTIG   = lambda: hits['BlastOutput2']['report']['results']['search']['query_title']
MESSAGE  = lambda: hits['BlastOutput2']['report']['results']['search']['message']


output_file.write('|'.join(['', 'query bin name', 'query contig', 'gene name', 'Best hit on NCBI', 'Percent alignment', 'Percent identity', 'Accession', '']))
output_file.write('\n')
output_file.write('|'.join(['', ':--', ':--', ':--', ':--:', ':--:', ':--:', ':--:', '']))
output_file.write('\n')


# go through every json file in the directory:
for j in glob.glob('*.json'):
    hits = json.load(open(j))

    # skip the poop file
    if 'BlastOutput2' not in hits:
        continue
    # If there are no hits then report "NO SIGNIFICANT HIT"
    if not HITS():
        output_file.write('|'.join(['', QUERY(), 'NO SIGNIFICANT HIT', '', '', '', '']))
        output_file.write('\n')
        continue

    # report the best hit:
    index = 0

    # unless the best hit resolves to a multispecies .. if it does, increment
    # index
    while 1:
        if TITLE().find('MULTISPECIES') == -1:
            break

        index += 1

    # first we get rid of the string added by ncbi blast at the beginning
    query_header = CONTIG().split(' ')[1]
    query_header_dict = dict([(i.split(':')[0], i.split(':')[1]) for i in query_header.split('|')])

    output_file.write('|'.join(['', query_header_dict['bin_id'], query_header_dict['contig'], QUERY(), SCINAME(), PCTALIGN(), PCTID(), ACC(), '']))
    output_file.write('\n')

output_file.close()
	#!/usr/bin/env python

	# Click 'Download > Multiple-file JSON' from NCBI search results page,
	# unzip it, run this script in it, this way:
	# python summarize_blast_results_of_anvio_HMMs.py OUTPUT_FILE.txt

	import sys
	import json
	import glob

	output_file_name = sys.argv[1]
	output_file = open(output_file_name, "w")

	# poor man's whatever:
	QUERY = lambda: hits['BlastOutput2']['report']['results']['search']['query_title'].split('___')[0]
	QLEN = lambda: hits['BlastOutput2']['report']['results']['search']['query_len']
	HIT = lambda: hits['BlastOutput2']['report']['results']['search']['hits'][index]
	DESC = lambda: HIT()['description'][0]
	TITLE = lambda: DESC()['title']
	SCINAME = lambda: '_%s_' % DESC()['sciname']
	ACC = lambda: '[%(desc)s](https://www.ncbi.nlm.nih.gov/protein/%(desc)s)' % {'desc': DESC()['accession']}
	HSPS = lambda: HIT()['hsps'][0]
	PCTALIGN = lambda: '%.2f%%' % (((HSPS()['align_len']* 100 / QLEN())))
	PCTID = lambda: '%.2f%%' % (HSPS()['identity'] * 100 / HSPS()['align_len'])
	CONTIG = lambda: hits['BlastOutput2']['report']['results']['search']['query_title']
	MESSAGE = lambda: hits['BlastOutput2']['report']['results']['search']['message']


	output_file.write('\|'.join(['', 'query bin name', 'query contig', 'gene name', 'Best hit on NCBI', 'Percent alignment', 'Percent identity', 'Accession', '']))
	output_file.write('\n')
	output_file.write('\|'.join(['', ':--', ':--', ':--', ':--:', ':--:', ':--:', ':--:', '']))
	output_file.write('\n')


	# go through every json file in the directory:
	for j in glob.glob('*.json'):
	hits = json.load(open(j))

	# skip the poop file
	if 'BlastOutput2' not in hits:
	continue
	# If there are no hits then report "NO SIGNIFICANT HIT"
	if not HITS():
	output_file.write('\|'.join(['', QUERY(), 'NO SIGNIFICANT HIT', '', '', '', '']))
	output_file.write('\n')
	continue

	# report the best hit:
	index = 0

	# unless the best hit resolves to a multispecies .. if it does, increment
	# index
	while 1:
	if TITLE().find('MULTISPECIES') == -1:
	break

	index += 1

	# first we get rid of the string added by ncbi blast at the beginning
	query_header = CONTIG().split(' ')[1]
	query_header_dict = dict([(i.split(':')[0], i.split(':')[1]) for i in query_header.split('\|')])

	output_file.write('\|'.join(['', query_header_dict['bin_id'], query_header_dict['contig'], QUERY(), SCINAME(), PCTALIGN(), PCTID(), ACC(), '']))
	output_file.write('\n')

	output_file.close()