Skip to content

Instantly share code, notes, and snippets.

@jonchang
Created April 29, 2013 21:38
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jonchang/5485018 to your computer and use it in GitHub Desktop.
Save jonchang/5485018 to your computer and use it in GitHub Desktop.
canonizes synonyms and drops duplicates in alignment files
#!/usr/bin/env python
# canonize_synonyms.py -- by Jonathan Chang (March 2013)
import argparse
import os.path
import multiprocessing
import functools
import csv
import collections
from cStringIO import StringIO
import dendropy
def get_args():
    """Parse the command line for this script.

    Returns:
        The ``argparse`` namespace produced from ``sys.argv``, with two
        attributes: ``files`` (a list of one or more input paths) and
        ``map`` (path of the synonym CSV; the option is mandatory).
    """
    cli = argparse.ArgumentParser(
        description="canonizes synonyms and drops duplicates in alignment files"
    )
    cli.add_argument("files", nargs="+", help="list of file(s) to convert")

    # A separate titled group so that --map is listed under
    # "required arguments" in the --help output.
    mandatory = cli.add_argument_group("required arguments")
    mandatory.add_argument(
        "--map",
        required=True,
        help="csv file that has a map of synonym->real name",
    )
    return cli.parse_args()
def acc_fix(name, mapping):
    """Return the canonical name for a sequence label.

    Everything from the last underscore onward is dropped from ``name``
    (presumably an accession suffix -- confirm against the input files), and
    the remaining prefix is looked up in ``mapping``.

    Args:
        name: Sequence label, e.g. ``"Genus_species_XYZ"``.
        mapping: Dict of synonym -> preferred name, with underscores in place
            of spaces.

    Returns:
        ``mapping[prefix]`` when the prefix is a known synonym, otherwise the
        prefix itself. A label containing no underscore has an empty prefix,
        so ``""`` is what gets looked up and returned.
    """
    # rpartition splits on the LAST underscore only, which gives the same
    # prefix as split("_") followed by re-joining all but the final field.
    prefix = name.rpartition("_")[0]
    # Direct hash lookup; no need to walk over the mapping's keys.
    return mapping.get(prefix, prefix)
def convert(filename, mapping, args):
    """Write a de-duplicated copy of one FASTA alignment into ``uniq/``.

    Each sequence label in ``filename`` is reduced to its canonical name via
    ``acc_fix``. The first sequence seen for a canonical name is kept, under
    its original label; later sequences resolving to the same canonical name
    are reported on stdout and dropped. The result is written to
    ``uniq/<basename of filename>`` in FASTA format.

    Args:
        filename: Path of the FASTA alignment to read.
        mapping: Dict of synonym -> preferred name, passed on to ``acc_fix``.
        args: Parsed command-line namespace. Not used here; kept so the
            signature matches the ``functools.partial`` built in ``main``.

    Raises:
        OSError: If the output directory cannot be created.
    """
    parsed = dendropy.DnaCharacterMatrix.get_from_path(filename, "fasta")
    seen = set()
    final = dendropy.DnaCharacterMatrix()
    for key in parsed.iterkeys():
        label = str(key)
        canon_name = acc_fix(label, mapping)
        if canon_name in seen:
            # Parenthesised single-argument form: prints the same text under
            # the Python 2 print statement and is also a valid function call.
            print("Found duplicate |{0}| as |{1}| in {2}".format(canon_name, label, filename))
            continue
        seen.add(canon_name)
        # Round-trip the kept sequence through FASTA text to build a
        # one-sequence matrix that can be appended to the output.
        chars = StringIO(">" + label + "\n" + str(parsed[key]))
        final.extend(dendropy.DnaCharacterMatrix(stream=chars, label=key, schema="fasta"))

    out_dir = "uniq"
    # Create the output directory on demand. Several pool workers may reach
    # this point at once, so tolerate "already exists" instead of checking
    # first and racing.
    try:
        os.makedirs(out_dir)
    except OSError:
        if not os.path.isdir(out_dir):
            raise
    final.write_to_path(os.path.join(out_dir, os.path.basename(filename)), "fasta")
def main():
    """Load the synonym map and convert every input file in parallel.

    Reads the CSV named by ``--map``, taking column 0 as the synonym and
    column 1 as the preferred name (spaces in every field are replaced with
    underscores), then runs ``convert`` on each input file using a
    ``multiprocessing`` pool.

    Raises:
        IndexError: If a non-blank CSV row has fewer than two columns.
    """
    args = get_args()

    mapping = {}
    # Binary mode is what the Python 2 csv module asks for.
    with open(args.map, "rb") as rfile:
        for row in csv.reader(rfile):
            if not row:
                # csv.reader yields an empty list for a blank line (a
                # trailing newline at end of file is common); nothing to map.
                continue
            row = [field.replace(" ", "_") for field in row]
            mapping[row[0]] = row[1]

    worker = functools.partial(convert, mapping=mapping, args=args)
    pool = multiprocessing.Pool()
    try:
        # map() blocks until every file has been processed.
        pool.map(worker, args.files)
    finally:
        # Shut the worker processes down explicitly instead of leaving them
        # to interpreter teardown.
        pool.close()
        pool.join()
# Run the conversion only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment