Skip to content

Instantly share code, notes, and snippets.

@ShaiberAlon
Last active June 19, 2019 18:48
Show Gist options
  • Save ShaiberAlon/f4bd9c45c242d4367a748b34cd2aa243 to your computer and use it in GitHub Desktop.
Save ShaiberAlon/f4bd9c45c242d4367a748b34cd2aa243 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
# Click 'Download > Multiple-file JSON' from NCBI search results page,
# unzip it, run this script in it, this way:
# python summarize_blast_results_of_anvio_HMMs.py OUTPUT_FILE.txt
import sys
import json
import glob
output_file_name = sys.argv[1]
output_file = open(output_file_name, "w")
# poor man's whatever:
QUERY = lambda: hits['BlastOutput2']['report']['results']['search']['query_title'].split('___')[0]
QLEN = lambda: hits['BlastOutput2']['report']['results']['search']['query_len']
HIT = lambda: hits['BlastOutput2']['report']['results']['search']['hits'][index]
DESC = lambda: HIT()['description'][0]
TITLE = lambda: DESC()['title']
SCINAME = lambda: '_%s_' % DESC()['sciname']
ACC = lambda: '[%(desc)s](https://www.ncbi.nlm.nih.gov/protein/%(desc)s)' % {'desc': DESC()['accession']}
HSPS = lambda: HIT()['hsps'][0]
PCTALIGN = lambda: '%.2f%%' % (((HSPS()['align_len']* 100 / QLEN())))
PCTID = lambda: '%.2f%%' % (HSPS()['identity'] * 100 / HSPS()['align_len'])
CONTIG = lambda: hits['BlastOutput2']['report']['results']['search']['query_title']
MESSAGE = lambda: hits['BlastOutput2']['report']['results']['search']['message']
output_file.write('|'.join(['', 'query bin name', 'query contig', 'gene name', 'Best hit on NCBI', 'Percent alignment', 'Percent identity', 'Accession', '']))
output_file.write('\n')
output_file.write('|'.join(['', ':--', ':--', ':--', ':--:', ':--:', ':--:', ':--:', '']))
output_file.write('\n')
# go through every json file in the directory:
for j in glob.glob('*.json'):
hits = json.load(open(j))
# skip the poop file
if 'BlastOutput2' not in hits:
continue
# If there are no hits then report "NO SIGNIFICANT HIT"
if not HITS():
output_file.write('|'.join(['', QUERY(), 'NO SIGNIFICANT HIT', '', '', '', '']))
output_file.write('\n')
continue
# report the best hit:
index = 0
# unless the best hit resolves to a multispecies .. if it does, increment
# index
while 1:
if TITLE().find('MULTISPECIES') == -1:
break
index += 1
# first we get rid of the string added by ncbi blast at the beginning
query_header = CONTIG().split(' ')[1]
query_header_dict = dict([(i.split(':')[0], i.split(':')[1]) for i in query_header.split('|')])
output_file.write('|'.join(['', query_header_dict['bin_id'], query_header_dict['contig'], QUERY(), SCINAME(), PCTALIGN(), PCTID(), ACC(), '']))
output_file.write('\n')
output_file.close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment