Last active
March 5, 2020 02:10
-
-
Save ShaiberAlon/0910e175d5414e8893b805f30d5ea025 to your computer and use it in GitHub Desktop.
Summarize blast results
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# Click 'Download > Multiple-file JSON' from NCBI search results page, | |
# unzip it, run this script in it, this way: | |
# python summarize_blast_results_of_anvio_HMMs.py OUTPUT_FILE.txt | |
import sys | |
import json | |
import glob | |
output_file_name = sys.argv[1] | |
output_file = open(output_file_name, "w") | |
# poor man's whatever: | |
QUERY = lambda: hits['BlastOutput2']['report']['results']['search']['query_title'].split('___')[0] | |
QLEN = lambda: hits['BlastOutput2']['report']['results']['search']['query_len'] | |
HITS = lambda: hits['BlastOutput2']['report']['results']['search']['hits'] | |
HIT = lambda: HITS()[index] | |
DESC = lambda: HIT()['description'][0] | |
TITLE = lambda: DESC()['title'] | |
SCINAME = lambda: '_%s_' % DESC()['sciname'] | |
ACC = lambda: '[%(desc)s](https://www.ncbi.nlm.nih.gov/protein/%(desc)s)' % {'desc': DESC()['accession']} | |
HSPS = lambda: HIT()['hsps'][0] | |
PCTALIGN = lambda: '%.2f%%' % (((HSPS()['align_len']* 100 / QLEN()))) | |
PCTID = lambda: '%.2f%%' % (HSPS()['identity'] * 100 / HSPS()['align_len']) | |
CONTIG = lambda: hits['BlastOutput2']['report']['results']['search']['query_title'] | |
output_file.write('|'.join(['', 'query bin name', 'Best hit on NCBI', 'Percent alignment', 'Percent identity', 'Accession', ''])) | |
output_file.write('\n') | |
output_file.write('|'.join(['', ':--', ':--', ':--', ':--:', ':--:', ':--:', ':--:', ''])) | |
output_file.write('\n') | |
# go through every json file in the directory: | |
for j in glob.glob('*.json'): | |
hits = json.load(open(j)) | |
# skip the poop file | |
if 'BlastOutput2' not in hits: | |
continue | |
if not HITS(): | |
output_file.write('|'.join(['', QUERY(), 'NO SIGNIFICANT HIT', '', '', '', ''])) | |
output_file.write('\n') | |
continue | |
# report the best hit: | |
index = 0 | |
# unless the best hit resolves to a multispecies .. if it does, increment | |
# index | |
while 1: | |
if TITLE().find('MULTISPECIES') == -1: | |
break | |
index += 1 | |
output_file.write('|'.join(['', QUERY(), SCINAME(), PCTALIGN(), PCTID(), ACC(), ''])) | |
output_file.write('\n') | |
output_file.close() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Putting a note here that in the current implementation sometimes percent alignment is >100% and this is because when there are gaps in alignment then the actual alignment length is indeed longer than the query. But this value makes no sense and is not what we want. I wonder if it would be better to take the values of
query_from
andquery_to
from the json output and compute the alignment coverage of the query to replace the alignment length.