# sinnamone/get_best_match.py

## get_best_match.py
#!/usr/bin/python
# Sort tab-separated match records by query (ASC), score (DESC), MM (ASC)
# and gap (ASC); with '-best', keep only the top-scoring match(es) of each
# query. Input comes from a file or STDIN, output goes to STDOUT.

import sys
from operator import itemgetter

# Usage text, spelled with explicit escapes so its whitespace is unambiguous.
USAGE = (
    "\n"
    "\tUsage: %s output_type input_type [file.blast8_]\n"
    "\n"
    "\t'output_type':\n"
    "\t'-sort': do only the sort\n"
    "\t'-best': do the sort and get only the best\n"
    "\n"
    "\t'input_type':\n"
    "\t '-0': query, subject, similarity, ...\n"
    "\t '-1': query, query_len, subject, similarity, ...\n"
    "\t '-2': query, subject, subject_len, similarity, ...\n"
    "\t '-3': query, query_len, subject, subject_len, similarity, ...\n"
    "\n"
    "\tSort by query (ASC), score (DESC), MM (ASC), gap (ASC)\n"
    "\tOutput in STDOUT\n"
    "\t"
)

FORMAT_ERROR = "ERROR: Format problem at line %d: is the '%s' option correct??"

# Converters for the complete ('-3') column layout, in column order.
FULL_COLUMN_TYPES = (
    str,    # 0, query (sort key)
    int,    # 1, query_len
    str,    # 2, subject
    int,    # 3, subject_len
    float,  # 4, similarity
    int,    # 5, match_len
    int,    # 6, MM (sort key)
    int,    # 7, gap (sort key)
    int,    # 8, q_start
    int,    # 9, q_end
    int,    # 10, s_start
    int,    # 11, s_end
    float,  # 12, e_value
    float,  # 13, score (sort key)
    # any further fields are carried through untouched
)

# For each input_type flag, the positions (in the full layout) of the
# columns that are absent from the input.
MISSING_COLUMNS = {
    "-0": (1, 3),
    "-1": (3,),
    "-2": (1,),
    "-3": (),
}

# MM and gap sit at a fixed distance before the score in every layout.
MM_OFFSET = 7
GAP_OFFSET = 6


def column_types_for(input_type):
    """Return the list of column converters for *input_type*.

    The score is always the last converted column, i.e. its index is
    ``len(result) - 1``. Returns None when *input_type* is not one of
    '-0', '-1', '-2', '-3'.
    """
    missing = MISSING_COLUMNS.get(input_type)
    if missing is None:
        return None
    return [
        cast
        for pos, cast in enumerate(FULL_COLUMN_TYPES)
        if pos not in missing
    ]


def parse_matches(lines, column_types, input_type):
    """Parse an iterable of tab-separated *lines* into typed rows.

    Each row is a list holding the converted standard columns; when a line
    carries additional fields they are re-joined with tabs and appended as
    one trailing string, so they are written back unchanged. Rows without
    additional fields get no trailing element (and hence no trailing tab on
    output). Blank lines are skipped.

    Raises ValueError (message names the 1-based input line and
    *input_type*) when a line has too few fields or a field does not
    convert.
    """
    n_standard = len(column_types)
    matches = []
    for line_no, line in enumerate(lines, 1):
        fields = line.replace('\n', '').split('\t')
        if fields == ['']:
            continue  # blank line (typically a trailing newline at EOF)
        if len(fields) < n_standard:
            raise ValueError(FORMAT_ERROR % (line_no, input_type))
        try:
            row = [cast(raw) for cast, raw in zip(column_types, fields)]
        except ValueError:
            raise ValueError(FORMAT_ERROR % (line_no, input_type))
        if len(fields) > n_standard:
            row.append("\t".join(fields[n_standard:]))
        matches.append(row)
    return matches


def sort_matches(matches, score_pos):
    """Sort *matches* in place by query ASC, score DESC, MM ASC, gap ASC.

    Implemented as successive stable sorts, least significant key first.
    """
    mm_pos = score_pos - MM_OFFSET
    gap_pos = score_pos - GAP_OFFSET
    matches.sort(key=itemgetter(mm_pos, gap_pos))
    matches.sort(key=itemgetter(score_pos), reverse=True)
    matches.sort(key=itemgetter(0))


def best_matches(sorted_matches, score_pos):
    """Return, for every query, the rows sharing that query's top score.

    *sorted_matches* must already be ordered by ``sort_matches`` so that the
    first row seen for a query carries its best score.
    """
    best = []
    current_query = None
    top_score = None
    for row in sorted_matches:
        if row[0] != current_query:
            # First (hence best) row of a new query: always kept.
            current_query = row[0]
            top_score = row[score_pos]
            best.append(row)
        elif row[score_pos] == top_score:
            best.append(row)
    return best


def format_match(row):
    """Render one row as a tab-separated output line (without newline)."""
    return "\t".join(map(str, row))


def main(argv):
    """Run the command line described by USAGE; *argv* is ``sys.argv``."""
    if len(argv) < 2 + 1:
        sys.exit(USAGE % argv[0])

    work_to_do = argv[1]
    if work_to_do not in ('-sort', '-best'):
        sys.exit("ERROR: Wrong output_type: %s." % work_to_do)

    input_type = argv[2]
    column_types = column_types_for(input_type)
    if column_types is None:
        sys.exit("ERROR: Wrong input_type: %s." % input_type)
    score_pos = len(column_types) - 1

    try:
        if len(argv) == 3:  # no file argument: read from STDIN
            matches = parse_matches(sys.stdin, column_types, input_type)
        else:
            with open(argv[3]) as handle:
                matches = parse_matches(handle, column_types, input_type)
    except ValueError as err:
        sys.exit(str(err))

    sort_matches(matches, score_pos)
    if work_to_do == '-best':
        matches = best_matches(matches, score_pos)

    sys.stdout.writelines(format_match(row) + "\n" for row in matches)


if __name__ == "__main__":
    main(sys.argv)