Skip to content

Instantly share code, notes, and snippets.

@maclandrol
Created June 19, 2015 16:07
Show Gist options
  • Save maclandrol/35a9c7989b7cf3462cb3 to your computer and use it in GitHub Desktop.
Save maclandrol/35a9c7989b7cf3462cb3 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
import argparse
from glob import glob
import os
try:
from Bio.SeqRecord import SeqRecord
from Bio import AlignIO
from Bio.Align import MultipleSeqAlignment
from Bio.Seq import Seq
from Bio.Alphabet import generic_protein
except ImportError:
raise ValueError('Requirement not met!, Biopython not found, exiting')
def concatenateSequences(geneorder, speclist, alignment, missing='-', sep='#', gp=0, alpha=generic_protein):
"""Concatenate alignment"""
align_dict = {}
for gene in geneorder:
if gene in alignment.keys():
cur_al = alignment[gene]
al_len = cur_al.get_alignment_length()
spec_dict = dict((x.id.split(sep)[1-gp], x) for x in cur_al)
for spec in speclist :
# default , put missing data
adding = SeqRecord(Seq(missing*al_len, alpha))
if(spec in spec_dict.keys()):
adding = spec_dict[spec]
try :
align_dict[spec] += adding
except (KeyError, ValueError):
align_dict[spec] = adding
for spec in align_dict.keys():
align_dict[spec].id = spec
return MultipleSeqAlignment(align_dict.values())
if __name__ == '__main__':
seqformat = ['fasta', 'maf', 'clustal', 'phylip-relaxed', 'nexus', 'stockholm']
parser = argparse.ArgumentParser(description="concat - A script for sequence alignment concatenation.", epilog="See the whole program at https://github.com/maclandrol/pyphylo")
parser.add_argument("-wd", '--aligndir', required=True, dest='workdir', help="Working directory containing every alignment")
parser.add_argument("--species", "-s", type=argparse.FileType('r'), dest='species', help="Use only this species if provided")
parser.add_argument("--order", type=argparse.FileType('r'), help="concatenation order - default : alphabetic order")
parser.add_argument('-inf', "--inform", default='fasta', choices=seqformat, dest='inform', help="Alignment input format (each file should end by '.format'")
parser.add_argument('-outf', "--outform", default='phylip-relaxed', choices=seqformat, dest='outform', help="Output file in which the concatenated alignment should be written")
parser.add_argument('-o', "--output", default='concat', dest='output', help="Output file in which the concatenated alignment should be written")
parser.add_argument("-gp", '--genepos', dest='gp', type=int, default=0, help="Gene position according to the separator in each header, 1 (post) or 0(pre)")
parser.add_argument("--sep", default='#', help="Gene-specie separator : default '#'")
args = parser.parse_args()
align_glob = glob(args.workdir+"/*"+args.inform)
aligndict = {}
for path in align_glob:
if(os.path.isfile(path)):
c_basename = os.path.basename(path)
if(c_basename):
gene = c_basename.split('.')[0]
aligndict[gene] = AlignIO.read(path, args.inform)
# getting the geneorder
geneorder = []
if(args.order):
geneorder = [line.strip() for line in args.species if not line.startswith('#')]
else:
geneorder = sorted(aligndict.keys())
# getting the specielist
specielist = []
if(args.species):
specielist = [line.strip() for line in args.species if not line.startswith('#')]
else:
specielist = [x.id.split(args.sep)[1-args.gp] for align in aligndict.values() for x in align ]
specielist = list(set(specielist))
concat_align = concatenateSequences(geneorder, specielist, aligndict, sep=args.sep, gp=args.gp)
AlignIO.write(concat_align, os.path.join(args.workdir, args.output), format=args.outform)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment