Skip to content

Instantly share code, notes, and snippets.

@JohnLonginotto
Created December 15, 2016 20:07
Show Gist options
  • Save JohnLonginotto/728d3527fe0483c599894d901096e7b8 to your computer and use it in GitHub Desktop.
Save JohnLonginotto/728d3527fe0483c599894d901096e7b8 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
import os
import csv
import argparse
# Command-line interface. Only --input is mandatory; --nearness is a plain
# on/off flag, the remaining options each take a file path.
parser = argparse.ArgumentParser(description='Process output from Mash')
parser.add_argument(
    '-i', '--input',
    metavar='FILE',
    required=True,
    help='input file',
)
parser.add_argument(
    '-o', '--output',
    metavar='FILE',
    help='output file',
)
parser.add_argument(
    '-a', '--alias',
    metavar='FILE',
    help='alias file',
)
parser.add_argument(
    '-n', '--nearness',
    action="store_true",
    help='invert distance',
)
args = parser.parse_args()
# Optional alias table: maps a cleaned sample name (first column of the alias
# file) to the label that should replace it (second column). It stays empty
# when no --alias file is given.
aliases = {}
if args.alias is not None:
    # Python 3's csv module needs a text-mode file opened with newline='';
    # a binary ('rb') handle makes csv.reader fail on the first row.
    with open(args.alias, newline='') as alias_file:
        alias_rows = csv.reader(alias_file, delimiter='\t', quotechar='|')
        for alias_data in alias_rows:
            if not alias_data:
                continue  # tolerate blank lines instead of crashing on them
            if len(alias_data) != 2:
                # Fail with a readable message rather than the obscure
                # ValueError that dict() would raise for a malformed row.
                raise SystemExit(
                    'ERROR: Line {} in alias file {} does not contain exactly two columns!'.format(
                        alias_rows.line_num, args.alias
                    )
                )
            alias_from, alias_to = alias_data
            # NOTE: collisions are not rejected. A name listed twice keeps its
            # last mapping, and several names may share one alias. Add checks
            # here if either case should be treated as a fatal error.
            aliases[alias_from] = alias_to
# When no --output is given, derive it from the input path by inserting '.2'
# before the extension (e.g. distances.tsv -> distances.2.tsv). The input path
# is made absolute and '~' is expanded first.
if args.output is None:
    path, extension = os.path.splitext(os.path.abspath(os.path.expanduser(args.input)))
    args.output = path + '.2' + extension
def clean_file_name(file_path):
    """Return the display name for *file_path*.

    The directory part and the final extension (whatever that extension is)
    are dropped. Only the last extension is removed, so ``reads.fastq.gz``
    becomes ``reads.fastq``. If the resulting name is a key of the
    module-level ``aliases`` mapping, the mapped alias is returned instead.
    """
    clean_name = os.path.basename(os.path.splitext(file_path)[0])
    return aliases.get(clean_name, clean_name)
# Rewrite the Mash table: every sample name is cleaned/aliased and, with
# --nearness, every distance d is replaced by 1 - d.
# Python 3's csv module needs text-mode files opened with newline=''; binary
# ('rb'/'wb') handles make the reader and writer fail.
with open(args.input, newline='') as input_file, open(args.output, 'w', newline='') as output_file:
    input_csv = csv.reader(input_file, delimiter='\t', quotechar='|')
    output_csv = csv.writer(output_file, delimiter='\t', quotechar='|')

    header = next(input_csv, None)
    if header is None:
        # An empty input would otherwise surface as a bare StopIteration.
        raise SystemExit('ERROR: input file {} is empty!'.format(args.input))
    # header[1:] drops the leading "#query" cell, so the header row is one
    # column shorter than the data rows that follow.
    output_csv.writerow([clean_file_name(name) for name in header[1:]])

    for input_data in input_csv:
        if not input_data:
            continue  # skip blank lines rather than failing on input_data[0]
        filename = [clean_file_name(input_data[0])]
        # NOTE: "nearness" is not an ideal option name; something that reads
        # naturally in a condition (e.g. "do_invert") would be clearer.
        if args.nearness:
            values = [1 - float(x) for x in input_data[1:]]
        else:
            values = input_data[1:]
        output_csv.writerow(filename + values)

# Printed after the files are closed; print() is a function in Python 3.
print("All done! See ", args.output)
@JohnLonginotto
Copy link
Author

If you do not wish to remove all file extensions, then instead of using re.sub(r'\.fn?a(st[aq])?$', '', name) and similar, which no biologist would/should ever understand, I would just take the extension, check whether it is in a set of extensions you want to strip, and then decide whether to use the path with or without the extension. This will be faster and simpler for your users :)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment