Last active
June 7, 2017 12:50
-
-
Save sinnamone/caa5e3d60ec776999df65dc025f5484e to your computer and use it in GitHub Desktop.
get_best_match
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python
"""Sort tabular BLAST matches and optionally keep only the best hits.

Rows are tab-separated and are ordered by query (ASC), score (DESC),
MM (ASC) and gap (ASC).  The result is written to STDOUT.

Note: the standard columns are cast to int/float and printed back with
``str()``, so numeric fields may be re-formatted (e.g. ``180`` -> ``180.0``).
"""
import sys
from operator import itemgetter

USAGE = """
Usage: %s output_type input_type [file.blast8_]
'output_type':
'-sort': do only the sort
'-best': do the sort and get only the best
'input_type':
'-0': query, subject, similarity, ...
'-1': query, query_len, subject, similarity, ...
'-2': query, subject, subject_len, similarity, ...
'-3': query, query_len, subject, subject_len, similarity, ...
Sort by query (ASC), score (DESC), MM (ASC), gap (ASC)
Output in STDOUT
"""

OUTPUT_TYPES = ('-sort', '-best')

# Types of the full ('-3') column layout.
ALL_COLUMN_TYPES = (
    str,    # 0, query (order)
    int,    # 1, query_len
    str,    # 2, subject
    int,    # 3, subject_len
    float,  # 4, similarity
    int,    # 5, match_len
    int,    # 6, MM (order)
    int,    # 7, gap (order)
    int,    # 8, q_start
    int,    # 9, q_end
    int,    # 10, s_start
    int,    # 11, s_end
    float,  # 12, e_value
    float,  # 13, score (order)
    # eventually other fields
)

# input_type -> indexes (in the full layout) of the length columns that
# are NOT present in the input.
ABSENT_COLUMNS = {
    "-0": (1, 3),
    "-1": (3,),
    "-2": (1,),
    "-3": (),
}


def _column_types(input_type):
    """Return the list of column casts for *input_type*.

    The score is always the last standard column, so its position is
    ``len(result) - 1``.
    """
    absent = ABSENT_COLUMNS[input_type]
    return [cast for pos, cast in enumerate(ALL_COLUMN_TYPES)
            if pos not in absent]


def _parse_rows(lines, column_types, input_type):
    """Split and cast every input line; exit with an error on a bad row.

    If at least one row carries fields beyond the score column (e.g.
    annotations), every row gets one additional string column holding
    those fields re-joined with tabs (empty when the row has none).
    """
    score_pos = len(column_types) - 1
    rows = [line.replace('\n', '').split('\t') for line in lines]
    # A plain row has exactly score_pos+1 fields; only longer rows carry
    # extra data.  (Testing "> score_pos" would always be true and would
    # add a spurious empty column, i.e. a trailing tab, to every row.)
    has_extra = any(len(row) > score_pos + 1 for row in rows)

    parsed = []
    for line_no, row in enumerate(rows, 1):
        try:
            typed = [cast(row[pos]) for pos, cast in enumerate(column_types)]
        except (ValueError, IndexError):
            # IndexError: the row is too short (or blank) for this layout.
            sys.exit("ERROR: Format problem at line %d: is the '%s' option correct??" % (line_no, input_type))
        if has_extra:
            typed.append("\t".join(row[score_pos + 1:]))
        parsed.append(typed)
    return parsed


def _sort_rows(rows, score_pos):
    """Sort *rows* in place by query (ASC), score (DESC), MM (ASC), gap (ASC).

    Only the two length columns (before MM) are optional, so MM and gap
    always sit at a fixed distance from the score column.
    """
    mm_pos = score_pos - 7
    gap_pos = score_pos - 6
    # The sort is stable: rows that tie on all four keys keep input order.
    rows.sort(key=lambda row: (row[0], -row[score_pos],
                               row[mm_pos], row[gap_pos]))


def _best_rows(sorted_rows, score_pos):
    """Return, for each query, the rows whose score equals its top score.

    *sorted_rows* must already be sorted with :func:`_sort_rows`, so the
    first row of each query carries that query's best score.
    """
    best = []
    name = None
    score = -1
    for row in sorted_rows:
        if row[0] != name:
            # First (hence best) row of a new query: always kept.
            name = row[0]
            score = row[score_pos]
            best.append(row)
        elif row[score_pos] == score:
            best.append(row)
    return best


def main(argv=None, stdin=None, stdout=None):
    """Run the command-line tool.

    argv   -- argument list, program name first (default: sys.argv)
    stdin  -- stream read when no input file is given (default: sys.stdin)
    stdout -- stream the result is written to (default: sys.stdout)

    Exits through sys.exit() with a message on wrong usage, on an unknown
    output_type/input_type, or on a malformed input row.
    """
    if argv is None:
        argv = sys.argv
    if stdin is None:
        stdin = sys.stdin
    if stdout is None:
        stdout = sys.stdout

    if len(argv) < 2 + 1:
        sys.exit(USAGE % argv[0])

    work_to_do = argv[1]
    if work_to_do not in OUTPUT_TYPES:
        sys.exit("ERROR: Wrong output_type: %s." % work_to_do)

    input_type = argv[2]
    if input_type not in ABSENT_COLUMNS:
        sys.exit("ERROR: Wrong input_type: %s." % input_type)
    column_types = _column_types(input_type)
    score_pos = len(column_types) - 1

    if len(argv) == 3:  # from STDIN
        lines = list(stdin)
    else:
        with open(argv[3]) as handle:
            lines = list(handle)

    matches = _parse_rows(lines, column_types, input_type)
    _sort_rows(matches, score_pos)
    if work_to_do == '-best':
        matches = _best_rows(matches, score_pos)

    for row in matches:
        stdout.write("\t".join(map(str, row)) + "\n")


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment