Created
April 29, 2013 21:38
-
-
Save jonchang/5485018 to your computer and use it in GitHub Desktop.
canonizes synonyms and drops duplicates in alignment files
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
# canonize_synonyms.py -- by Jonathan Chang (March 2013)
import argparse | |
import os.path | |
import multiprocessing | |
import functools | |
import csv | |
import collections | |
from cStringIO import StringIO | |
import dendropy | |
def get_args(argv=None):
    """Parse the command-line arguments for the script.

    Args:
        argv: Optional list of argument strings to parse. When None (the
            default), argparse falls back to ``sys.argv[1:]``, which is the
            behavior existing callers get.

    Returns:
        The argparse namespace with two attributes: ``files`` (a list of one
        or more input paths) and ``map`` (path of the synonym CSV file).
    """
    parser = argparse.ArgumentParser(
        description="canonizes synonyms and drops duplicates in alignment files"
    )
    parser.add_argument("files", nargs="+", help="list of file(s) to convert")

    # argparse lists every "--option" under "optional arguments" by default;
    # a dedicated group makes the mandatory --map flag show up as required
    # in the --help output.
    required = parser.add_argument_group("required arguments")
    required.add_argument(
        "--map",
        help="csv file that has a map of synonym->real name",
        required=True,
    )
    return parser.parse_args(argv)
def acc_fix(name, mapping):
    """Return the canonical form of a sequence name.

    Everything from the last underscore onward is dropped from ``name``, and
    the remainder is looked up in ``mapping``.

    Args:
        name: Sequence name; the text after its last underscore is discarded.
        mapping: Dict of synonym -> real name.

    Returns:
        ``mapping[stripped]`` when the stripped name is a key of ``mapping``,
        otherwise the stripped name itself. A name that contains no
        underscore strips down to the empty string.
    """
    # rpartition keeps everything before the LAST underscore, which is the
    # same as splitting on "_" and re-joining all pieces but the final one.
    stripped = name.rpartition("_")[0]
    # Direct hashed lookup instead of scanning every key of the mapping.
    return mapping.get(stripped, stripped)
def convert(filename, mapping, args):
    """Write a de-duplicated copy of one FASTA alignment into ``uniq/``.

    The file is read as FASTA through dendropy. Each entry's name is
    canonized with ``acc_fix``; the first entry seen for a canonical name is
    kept, and any later entry resolving to the same canonical name is
    reported on stdout and dropped.

    Args:
        filename: Path of the FASTA file to read.
        mapping: Dict of synonym -> real name, passed on to ``acc_fix``.
        args: Parsed command-line namespace. Not used here; kept so the
            signature matches what ``main`` binds with ``functools.partial``.

    Returns:
        None. The result is written to ``uniq/<basename of filename>``.
    """
    parsed = dendropy.DnaCharacterMatrix.get_from_path(filename, "fasta")
    seen = set()
    final = dendropy.DnaCharacterMatrix()

    for key in parsed.iterkeys():
        original_name = str(key)
        canon_name = acc_fix(original_name, mapping)
        if canon_name in seen:
            # Single-argument call form prints the same text on Python 2
            # and Python 3.
            print("Found duplicate |{0}| as |{1}| in {2}".format(canon_name, original_name, filename))
            continue
        seen.add(canon_name)

        # Round-trip the kept entry through an in-memory FASTA record so it
        # can be appended to the output matrix.
        # NOTE(review): the record is written under the ORIGINAL name, not
        # canon_name -- confirm that keeping the synonym in the output is
        # intended.
        chars = StringIO(">" + original_name + "\n" + str(parsed[key]))
        final.extend(dendropy.DnaCharacterMatrix(stream=chars, label=key, schema="fasta"))

    # Make sure the output directory exists before writing. This function is
    # mapped over a multiprocessing pool, so another worker may create the
    # directory between a check and the mkdir; tolerate that instead of
    # failing.
    out_dir = "uniq"
    try:
        os.makedirs(out_dir)
    except OSError:
        if not os.path.isdir(out_dir):
            raise

    final.write_to_path(os.path.join(out_dir, os.path.basename(filename)), "fasta")
def main():
    """Load the synonym map and convert every input file in parallel.

    The CSV named by ``--map`` is read into a dict whose keys come from the
    first column and values from the second, with spaces in every cell
    replaced by underscores. Each input file is then handed to ``convert``
    through a ``multiprocessing.Pool``.
    """
    args = get_args()

    mapping = {}
    # Binary mode is kept from the original script, which targets Python 2
    # (its csv module expects files opened with "b").
    with open(args.map, "rb") as rfile:
        for row in csv.reader(rfile):
            # csv.reader yields an empty list for a blank line; skip those
            # instead of failing with IndexError on row[0].
            if not row:
                continue
            cells = [cell.replace(" ", "_") for cell in row]
            mapping[cells[0]] = cells[1]

    worker = functools.partial(convert, mapping=mapping, args=args)
    pool = multiprocessing.Pool()
    try:
        pool.map(worker, args.files)
    finally:
        # Shut the workers down explicitly whether or not a conversion
        # raised, rather than leaving them to interpreter teardown.
        pool.close()
        pool.join()
# Run the conversion only when executed as a script, so that importing this
# module (for example to reuse acc_fix) has no side effects.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment