adamewing/interspecies_pg_comparison.py

## interspecies_pg_comparison.py
#!/usr/bin/env python

import sys
import os
import argparse
import subprocess
import pysam
import tempfile
import multiprocessing as mp
import re
import itertools

from collections import defaultdict as dd
from uuid import uuid4
from time import sleep
from math import log

from Bio import Seq, SeqIO
from Bio.Alphabet import generic_dna
from Bio.Data import CodonTable

import numpy as np

import logging
FORMAT = '%(asctime)s %(message)s'
logging.basicConfig(format=FORMAT)
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)


class Block:
    def __init__(self, tstart, tend):
        self.tstart = min(int(tstart), int(tend))
        self.tend   = max(int(tstart), int(tend))

    def __len__(self):
        return self.tend - self.tstart

    def __str__(self):
        return str(self.tstart) + ' ' +  str(self.tend)

    def __lt__(self, other):
        return len(self) > len(other)


class PSL:
    def __init__(self, rec):
        # psl spec: http://www.ensembl.org/info/website/upload/psl.html
        (self.matches, self.misMatches, self.repMatches, self.nCount, self.qNumInsert, self.qBaseInsert,
         self.tNumInsert, self.tBaseInsert, self.strand, self.qName, self.qSize, self.qStart, self.qEnd,
         self.tName, self.tSize, self.tStart, self.tEnd, self.blockCount, self.blockSizes, self.qStarts,
         self.tStarts) = rec.strip().split()

        self.tBlocks = []
        for bsize, tstart in zip(self.blockSizes.split(',')[:-1], self.tStarts.split(',')[:-1]): # [:-1] due to trailing comma
            self.tBlocks.append(Block(int(tstart), int(tstart) + int(bsize)))

        self.tBlocks.sort()

        self.tStart, self.tEnd, self.qStart, self.qEnd = map(int, (self.tStart, self.tEnd, self.qStart, self.qEnd))
        self.qSize, self.tSize = map(int, (self.qSize, self.tSize))

        if self.qStart > self.qEnd:
            self.qStart, self.qEnd = self.qEnd, self.qStart

        if self.tStart > self.tEnd:
            self.tStart, self.tEnd = self.tEnd, self.tStart

    def match(self, chrom, pos, window=0):
        ''' return True if chrom:pos intersects BLAT hit +/- window '''
        chrom = chrom.replace('chr', '')
        if chrom != self.tName:
            return False

        if int(pos) >= int(self.tStart)-window and int(pos) <= int(self.tEnd)+window:
            return True

        return False

    def refspan(self):
        ''' return footprint of match relative to referece genome '''
        return self.tEnd - self.tStart

    def score(self):
        ''' adapted from https://genome.ucsc.edu/FAQ/FAQblat.html#blat4 '''
        return (int(self.matches) + (int(self.repMatches)>>1)) - int(self.misMatches) - int(self.qNumInsert) - int(self.tNumInsert)

    def pctmatch(self):
        ''' adapted from https://genome.ucsc.edu/FAQ/FAQblat.html#blat4 '''
        qAliSize = int(self.qEnd) - int(self.qStart)
        tAliSize = int(self.tEnd) - int(self.tStart)
        if min(qAliSize, tAliSize) <= 0:
            return 0.0

        sizeDif = abs(qAliSize - tAliSize)
        total = int(self.matches) + int(self.repMatches) + int(self.misMatches)

        if total > 0:
            return 1.0-float((int(self.misMatches) + int(self.qNumInsert) + int(self.tNumInsert) + round(3*log(1+sizeDif)))) / float(total)

        return 0.0

    def sumblocks(self):
        return sum([len(b) for b in self.tBlocks])

    def __lt__(self, other):
        ''' used for ranking BLAT hits '''
        return self.score() > other.score()


def rc(dna):
    ''' reverse complement '''
    complements = str.maketrans('acgtrymkbdhvACGTRYMKBDHV', 'tgcayrkmvhdbTGCAYRKMVHDB')
    return dna.translate(complements)[::-1]


def exonerate_align(qryseq, refseq, cdna=False, tmpdir='/tmp', minmatch=80.0):
    rnd = str(uuid4())
    tgtfa = tmpdir + '/tmp.' + rnd + '.tgt.fa'
    qryfa = tmpdir + '/tmp.' + rnd + '.qry.fa'

    tgt = open(tgtfa, 'w')
    qry = open(qryfa, 'w')

    tgt.write('>ref' + '\n' + refseq + '\n')
    qry.write('>qry' + '\n' + qryseq + '\n')

    tgt.close()
    qry.close()

    cmd = ['exonerate', '--bestn', '1', '-m', 'ungapped', '--showalignment', '1', '--ryo', 'ALN' + '\t%s\t%qab\t%qae\t%tab\t%tae\t%pi\t%qS\t%tS\t%C\n', qryfa, tgtfa]
    if cdna:
        cmd = ['exonerate', '--bestn', '1', '-m', 'cdna2genome', '--showalignment', '1', '--ryo', 'ALN' + '\t%s\t%qab\t%qae\t%tab\t%tae\t%pi\t%qS\t%tS\t%C\n', qryfa, tgtfa]
    p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

    best = []
    topscore = 0

    for pline in p.stdout.readlines():
        pline = pline.decode()
        if pline.startswith('ALN'):
            c = pline.strip().split()
            if int(c[1]) > topscore and float(c[6]) >= minmatch:
                topscore = int(c[1])
                best = c

    os.remove(tgtfa)
    os.remove(qryfa)

    return best


def blat(seq, port, minScore=0, maxIntron=None):
    ''' BLAT using gfClient utility '''

    fasta_tmp_fn = str(uuid4()) + '.fa'
    with open(fasta_tmp_fn, 'w') as fasta_tmp:
        fasta_tmp.write('>query\n%s\n' % seq)

    cmd  = ['gfClient', 'localhost', str(port), '-nohead']

    if maxIntron is not None:
        cmd.append('-maxIntron=' + str(maxIntron))

    if minScore is not None:
        cmd.append('-minScore=' + str(minScore))

    psl_tmp_fn = str(uuid4()) + '.psl'

    cmd += ['.', fasta_tmp_fn, psl_tmp_fn]

    FNULL = open(os.devnull, 'w')

    p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=FNULL)

    result = []

    for out_line in p.stdout:
        out_line = out_line.decode()

    with open(psl_tmp_fn) as psl:
        for rec in psl:
            result.append(PSL(rec))

    os.remove(psl_tmp_fn)
    os.remove(fasta_tmp_fn)

    return sorted(result)


def write_codeml_ctl(name):
    with open(name + '.codeml.ctl', 'w') as out:
        out.write('seqfile = %s.pal2nal\n' % name)
        out.write('outfile = %s.codeml.txt\n' % name)
        out.write('noisy = 0\n')
        out.write('verbose = 0\n')
        out.write('runmode = -2\n')
        out.write('seqtype = 1\n')
        out.write('CodonFreq = 2\n')
        out.write('model = 1\n')
        out.write('NSsites = 0\n')
        out.write('icode = 0\n')
        out.write('fix_kappa = 1\n')
        out.write('kappa = 1\n')
        out.write('fix_omega = 0\n')
        out.write('omega = 0.5\n')

    return name + '.codeml.ctl', name + '.codeml.txt'


def dnds(aa_fa, nuc_fa, name):
    FNULL = open(os.devnull, 'w')

    codeml_ctl, codeml_out = write_codeml_ctl(name)

    aln_out = name + '.aa.aln.fa'
    clustalo_cmd = ['clustalo', '-i', aa_fa, '-o', aln_out, '--force']

    p = subprocess.Popen(clustalo_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

    for line in p.stdout:
        line = line.decode()
        logger.debug('clustalo stdout: %s' % line.strip())

    for line in p.stderr:
        line = line.decode()
        logger.debug('clustalo stderr: %s' % line.strip())

    pal2nal_out = name + '.pal2nal'
    pal2nal_cmd = ['pal2nal.pl', aln_out, nuc_fa, '-output', 'paml', '-nogap']

    p = subprocess.Popen(pal2nal_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

    with open(pal2nal_out, 'w') as p2n_out:
        for line in p.stdout:
            line = line.decode()
            p2n_out.write(line)

    p = subprocess.Popen(['codeml', codeml_ctl], stdout=subprocess.PIPE, stderr=subprocess.PIPE)

    for line in p.stdout:
        line = line.decode()
        logger.debug('codeml stdout: %s' % line.strip())

    for line in p.stderr:
        line = line.decode()
        logger.debug('codeml stderr: %s' % line.strip())
        pass

    result = {}

    with open(codeml_out) as cml:
        pairwise_section = False
        pair = None

        for line in cml:
            if line.startswith('pairwise comparison, codon frequencies'):
                pairwise_section = True
                continue

            if pairwise_section:
                if line.strip() == "":
                    continue
                if pair is None:
                    sp1, sp2 = line.strip().split(' ... ')
                    sp1 = sp1.split('(')[-1].rstrip(')')
                    sp2 = sp2.split('(')[-1].rstrip(')')
                    sp1, sp2 = sorted((sp1, sp2))
                    pair = 'dNdS_%s_%s' % (sp1, sp2)

                elif line.startswith('t='):
                    assert pair is not None
                    line = line.strip()
                    line = re.sub('=\s+', '=', line)
                    line = re.sub('\s+=', '=', line)

                    vars = dict(v.split('=') for v in line.split())

                    for k,v in vars.items():
                        vars[k] = float(v)


                    if vars['dS'] < 2 and vars['dS'] > 0.01 and vars['dN/dS'] < 10:
                        result[pair] = vars

                    pair = None

    return result


def main(args):
    trans_table = CodonTable.unambiguous_dna_by_name["Standard"]

    ports = {}
    ref = {}

    with open(args.ports) as _:
        for line in _:
            name, port, fa = line.strip().split()
            ports[name] = port
            ref[name] = pysam.Fastafile(fa)

    pool = mp.Pool(processes=len(ports))

    header = ['gene', 'tx', 'tx_len', 'padding']

    subfields = ['_loc', '_score', '_pctmatch', '_matchlen', '_cDNAlen', '_cDNAmatch', '_ORFmaxlen', '_ORFstops', '_ORFmatch']

    species = []

    for s in ports:
        header += [s+sf for sf in subfields]
        species.append(s)

    header.append('median_dNdS')

    sp_pairs = {}

    for s1, s2 in itertools.product(species, species):
        if s1 == s2:
            continue

        s1, s2 = sorted((s1, s2))
        pair = 'dNdS_%s_%s' % (s1, s2)

        if pair not in sp_pairs:
            sp_pairs[pair] = 'NA'

    header += list(sp_pairs.keys())

    print(','.join(header))

    tx_seq  = {}
    tx_info = {}
    tx_cds  = dd(list)

    with open(args.gff) as gff:
        for line in gff:
            if line.startswith('#'):
                continue

            chrom, source, feature, start, end, score, strand, frame, attribs = line.split('\t')

            attr_dict = {}

            for attrib in attribs.split(';'):
                if attrib.strip().split():
                    key, val = attrib.strip().split()[:2]
                    key = key.strip()
                    val = val.strip().strip('"')
                    attr_dict[key] = val

            if not chrom.startswith('chr'):
                chrom = 'chr' + chrom

            if chrom not in ref['human'].references:
                logger.warning('chromosome not found: %s' % chrom)
                continue

            start = int(start)
            end   = int(end)
            gene  = attr_dict['gene_id']
            tx_id = attr_dict['transcript_id']

            if feature == 'transcript':
                f = int(args.flank)
                tx_seq[tx_id] = ref['human'].fetch(chrom, start-f, end+f)
                tx_info[tx_id] = [gene, tx_id, str(end-start), str(args.flank)]

            if feature == 'CDS':
                tx_cds[tx_id].append((chrom, start, end, strand))


    tx_cdna = dd(str)

    for tx_id in tx_cds:
        for cds in tx_cds[tx_id]:
            chrom, start, end, strand = cds
            seq = ref['human'].fetch(chrom, start-1, end)

            if strand == '-':
                seq = rc(seq)

            tx_cdna[tx_id] += seq

    if not os.path.exists('seqout'):
        os.mkdir('seqout')

    orf_disrupted = 0
    orf_noalign   = 0
    cdna_noalign  = 0
    aa_out_count  = 0

    for tx_id in tx_cds:
        results = []

        output = True

        uniq_name = '%s_%s' % (tx_info[tx_id][0], tx_info[tx_id][1])
        cdna_fn   = 'seqout/%s_%s.cDNA.fa' % (tx_info[tx_id][0], tx_info[tx_id][1])
        aa_fn     = 'seqout/%s_%s.aa.fa' % (tx_info[tx_id][0], tx_info[tx_id][1])

        cdna_fh = open(cdna_fn, 'w')
        aa_fh   = open(aa_fn, 'w')

        for s in ports:
            res = pool.apply_async(blat, [tx_seq[tx_id], ports[s]])
            results.append((s, res))

        out = dd(list)

        for s, res in results:
            aln = res.get()

            if len(aln) == 0:
                out[s] = ('N/A','0','0','0','0','0','0','0')

            best = aln[0]
            loc = '%s:%d-%d' % (best.tName, best.tStart, best.tEnd)

            region_seq = ref[s].fetch(best.tName, best.tStart, best.tEnd).upper()

            cdna_aln = exonerate_align(tx_cdna[tx_id], region_seq, cdna=True)

            if s == 'human' and len(cdna_aln) == 0:
                output = False
                cdna_noalign =+ 1

            out[s].append(loc)
            out[s].append('%d' % best.score())
            out[s].append('%.2f' % best.pctmatch())
            out[s].append('%d' % best.refspan())

            orf_aa = None

            if cdna_aln:
                cdna_start = int(cdna_aln[4])
                cdna_end = int(cdna_aln[5])
                cdna_str = cdna_aln[7]
                ref_str  = cdna_aln[8]

                if ref_str == '-':
                    cdna_start, cdna_end = cdna_end, cdna_start

                cdna_seq = region_seq[cdna_start:cdna_end]

                cigar_ops = ''

                op = None

                for i, cig in enumerate(cdna_aln[9:], 1):
                    if i % 2 == 0:
                        assert op is not None
                        if op in ('M', 'D'):
                            cigar_ops += op * int(cig)
                        op = None
                    else:
                        op = cig

                assert len(cigar_ops) == len(cdna_seq), '%d vs %d' % (len(cigar_ops), len(cdna_seq))

                new_cdna = []
                for b,c in zip(list(cdna_seq), list(cigar_ops)):
                    if c == 'M':
                        new_cdna.append(b)

                out[s].append(str(len(new_cdna))) # _cDNAlen
                out[s].append(cdna_aln[6])        # _cDNAmatch

                cdna_seq = ''.join(new_cdna)

                if cdna_str != ref_str:
                    cdna_seq = rc(cdna_seq)

                codon_intact = False

                if len(cdna_seq) % 3 == 0:
                    codon_intact = True


                if codon_intact:
                    tx_obj = Seq.Seq(tx_cdna[tx_id], generic_dna)
                    cdna_obj = Seq.Seq(cdna_seq, generic_dna)

                    tx_aa = str(tx_obj.translate(trans_table))
                    orf_aa = str(cdna_obj.translate(trans_table))

                    aa_align = exonerate_align(tx_aa, orf_aa)
                    orf_match = '0.0'
                    orf_len = 0

                    if aa_align:
                        aa_fh.write('>%s\n%s\n' % (s, orf_aa))
                        cdna_fh.write('>%s\n%s\n' % (s, cdna_seq))
                        aa_out_count += 1
                        orf_frags = sorted([len(aa) for aa in orf_aa.split('*')], reverse=True)
                        orf_len = str(orf_frags[0])        # _ORFmaxlen
                        orf_stops = str(orf_aa.count('*')) # _ORFstops
                        orf_match = aa_align[6]            # _ORFmatch

                    else:
                        if s == 'human':
                            orf_noalign += 1
                            output = False

                    out[s].append(str(orf_len))   # _ORFmaxlen
                    out[s].append(str(orf_stops)) # _ORFstops
                    out[s].append(orf_match)      # _ORFmatch
                else:
                    out[s].append('0')   # _ORFmaxlen
                    out[s].append('0')   # _ORFstops
                    out[s].append('0.0') # _ORFmatch
                    if s == 'human':
                        orf_disrupted += 1
                        output = False

            else:
                out[s].append('0')     # _cDNAlen
                out[s].append('0.0')   # _cDNAmatch
                out[s].append('0')     # _ORFmaxlen
                out[s].append('0')     # _ORFstops
                out[s].append('0.0')   # _ORFmatch

        cdna_fh.close()
        aa_fh.close()

        if output:
            # reset dnds table
            for pair in sp_pairs:
                sp_pairs[pair] = 'NA'

            median_dnds = 'NA'
            dnds_list = []

            if aa_out_count > 1:
                codeml_res = dnds(aa_fn, cdna_fn, uniq_name)

                for pair in codeml_res:
                    assert pair in sp_pairs
                    pw_dnds = codeml_res[pair]['dN/dS']
                    dnds_list.append(pw_dnds)
                    sp_pairs[pair] = str(pw_dnds)

            outline = tx_info[tx_id]

            for s in ports:
                outline.append(','.join(out[s]))

            if len(dnds_list) > 0:
                median_dnds = '%.4f' % np.median(dnds_list)

            outline.append(median_dnds)

            for p in sp_pairs:
                outline.append(sp_pairs[p])

            print(','.join(outline))

        if not output:
            os.remove(cdna_fn)
            os.remove(aa_fn)

    logger.info('human cdna noalign: %d' % cdna_noalign)
    logger.info('human orf noalign: %d' % orf_noalign)
    logger.info('human orf disrupted: %d' % orf_disrupted)

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='start BLAT servers via: gfServer start localhost <portnumber> -stepSize=5 path/to.2bit')
    parser.add_argument('--gff', required=True, help='gff, requires gene_id and transcript_id attributes. Should include transcript and CDS entries at a minimum.')
    parser.add_argument('--flank', default=1000, help='flank for initial positioning alignments, default = 1000')
    parser.add_argument('--ports', required=True, help='space-delimited .txt file: name port indexed.fa')
    args = parser.parse_args()
    main(args)
	#!/usr/bin/env python

	import sys
	import os
	import argparse
	import subprocess
	import pysam
	import tempfile
	import multiprocessing as mp
	import re
	import itertools

	from collections import defaultdict as dd
	from uuid import uuid4
	from time import sleep
	from math import log

	from Bio import Seq, SeqIO
	from Bio.Alphabet import generic_dna
	from Bio.Data import CodonTable

	import numpy as np

	import logging
	FORMAT = '%(asctime)s %(message)s'
	logging.basicConfig(format=FORMAT)
	logger = logging.getLogger(__name__)
	logger.setLevel(logging.INFO)


	class Block:
	def __init__(self, tstart, tend):
	self.tstart = min(int(tstart), int(tend))
	self.tend = max(int(tstart), int(tend))

	def __len__(self):
	return self.tend - self.tstart

	def __str__(self):
	return str(self.tstart) + ' ' + str(self.tend)

	def __lt__(self, other):
	return len(self) > len(other)


	class PSL:
	def __init__(self, rec):
	# psl spec: http://www.ensembl.org/info/website/upload/psl.html
	(self.matches, self.misMatches, self.repMatches, self.nCount, self.qNumInsert, self.qBaseInsert,
	self.tNumInsert, self.tBaseInsert, self.strand, self.qName, self.qSize, self.qStart, self.qEnd,
	self.tName, self.tSize, self.tStart, self.tEnd, self.blockCount, self.blockSizes, self.qStarts,
	self.tStarts) = rec.strip().split()

	self.tBlocks = []
	for bsize, tstart in zip(self.blockSizes.split(',')[:-1], self.tStarts.split(',')[:-1]): # [:-1] due to trailing comma
	self.tBlocks.append(Block(int(tstart), int(tstart) + int(bsize)))

	self.tBlocks.sort()

	self.tStart, self.tEnd, self.qStart, self.qEnd = map(int, (self.tStart, self.tEnd, self.qStart, self.qEnd))
	self.qSize, self.tSize = map(int, (self.qSize, self.tSize))

	if self.qStart > self.qEnd:
	self.qStart, self.qEnd = self.qEnd, self.qStart

	if self.tStart > self.tEnd:
	self.tStart, self.tEnd = self.tEnd, self.tStart

	def match(self, chrom, pos, window=0):
	''' return True if chrom:pos intersects BLAT hit +/- window '''
	chrom = chrom.replace('chr', '')
	if chrom != self.tName:
	return False

	if int(pos) >= int(self.tStart)-window and int(pos) <= int(self.tEnd)+window:
	return True

	return False

	def refspan(self):
	''' return footprint of match relative to referece genome '''
	return self.tEnd - self.tStart

	def score(self):
	''' adapted from https://genome.ucsc.edu/FAQ/FAQblat.html#blat4 '''
	return (int(self.matches) + (int(self.repMatches)>>1)) - int(self.misMatches) - int(self.qNumInsert) - int(self.tNumInsert)

	def pctmatch(self):
	''' adapted from https://genome.ucsc.edu/FAQ/FAQblat.html#blat4 '''
	qAliSize = int(self.qEnd) - int(self.qStart)
	tAliSize = int(self.tEnd) - int(self.tStart)
	if min(qAliSize, tAliSize) <= 0:
	return 0.0

	sizeDif = abs(qAliSize - tAliSize)
	total = int(self.matches) + int(self.repMatches) + int(self.misMatches)

	if total > 0:
	return 1.0-float((int(self.misMatches) + int(self.qNumInsert) + int(self.tNumInsert) + round(3*log(1+sizeDif)))) / float(total)

	return 0.0

	def sumblocks(self):
	return sum([len(b) for b in self.tBlocks])

	def __lt__(self, other):
	''' used for ranking BLAT hits '''
	return self.score() > other.score()


	def rc(dna):
	''' reverse complement '''
	complements = str.maketrans('acgtrymkbdhvACGTRYMKBDHV', 'tgcayrkmvhdbTGCAYRKMVHDB')
	return dna.translate(complements)[::-1]


	def exonerate_align(qryseq, refseq, cdna=False, tmpdir='/tmp', minmatch=80.0):
	rnd = str(uuid4())
	tgtfa = tmpdir + '/tmp.' + rnd + '.tgt.fa'
	qryfa = tmpdir + '/tmp.' + rnd + '.qry.fa'

	tgt = open(tgtfa, 'w')
	qry = open(qryfa, 'w')

	tgt.write('>ref' + '\n' + refseq + '\n')
	qry.write('>qry' + '\n' + qryseq + '\n')

	tgt.close()
	qry.close()

	cmd = ['exonerate', '--bestn', '1', '-m', 'ungapped', '--showalignment', '1', '--ryo', 'ALN' + '\t%s\t%qab\t%qae\t%tab\t%tae\t%pi\t%qS\t%tS\t%C\n', qryfa, tgtfa]
	if cdna:
	cmd = ['exonerate', '--bestn', '1', '-m', 'cdna2genome', '--showalignment', '1', '--ryo', 'ALN' + '\t%s\t%qab\t%qae\t%tab\t%tae\t%pi\t%qS\t%tS\t%C\n', qryfa, tgtfa]
	p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

	best = []
	topscore = 0

	for pline in p.stdout.readlines():
	pline = pline.decode()
	if pline.startswith('ALN'):
	c = pline.strip().split()
	if int(c[1]) > topscore and float(c[6]) >= minmatch:
	topscore = int(c[1])
	best = c

	os.remove(tgtfa)
	os.remove(qryfa)

	return best


	def blat(seq, port, minScore=0, maxIntron=None):
	''' BLAT using gfClient utility '''

	fasta_tmp_fn = str(uuid4()) + '.fa'
	with open(fasta_tmp_fn, 'w') as fasta_tmp:
	fasta_tmp.write('>query\n%s\n' % seq)

	cmd = ['gfClient', 'localhost', str(port), '-nohead']

	if maxIntron is not None:
	cmd.append('-maxIntron=' + str(maxIntron))

	if minScore is not None:
	cmd.append('-minScore=' + str(minScore))

	psl_tmp_fn = str(uuid4()) + '.psl'

	cmd += ['.', fasta_tmp_fn, psl_tmp_fn]

	FNULL = open(os.devnull, 'w')

	p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=FNULL)

	result = []

	for out_line in p.stdout:
	out_line = out_line.decode()

	with open(psl_tmp_fn) as psl:
	for rec in psl:
	result.append(PSL(rec))

	os.remove(psl_tmp_fn)
	os.remove(fasta_tmp_fn)

	return sorted(result)


	def write_codeml_ctl(name):
	with open(name + '.codeml.ctl', 'w') as out:
	out.write('seqfile = %s.pal2nal\n' % name)
	out.write('outfile = %s.codeml.txt\n' % name)
	out.write('noisy = 0\n')
	out.write('verbose = 0\n')
	out.write('runmode = -2\n')
	out.write('seqtype = 1\n')
	out.write('CodonFreq = 2\n')
	out.write('model = 1\n')
	out.write('NSsites = 0\n')
	out.write('icode = 0\n')
	out.write('fix_kappa = 1\n')
	out.write('kappa = 1\n')
	out.write('fix_omega = 0\n')
	out.write('omega = 0.5\n')

	return name + '.codeml.ctl', name + '.codeml.txt'


	def dnds(aa_fa, nuc_fa, name):
	FNULL = open(os.devnull, 'w')

	codeml_ctl, codeml_out = write_codeml_ctl(name)

	aln_out = name + '.aa.aln.fa'
	clustalo_cmd = ['clustalo', '-i', aa_fa, '-o', aln_out, '--force']

	p = subprocess.Popen(clustalo_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

	for line in p.stdout:
	line = line.decode()
	logger.debug('clustalo stdout: %s' % line.strip())

	for line in p.stderr:
	line = line.decode()
	logger.debug('clustalo stderr: %s' % line.strip())

	pal2nal_out = name + '.pal2nal'
	pal2nal_cmd = ['pal2nal.pl', aln_out, nuc_fa, '-output', 'paml', '-nogap']

	p = subprocess.Popen(pal2nal_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

	with open(pal2nal_out, 'w') as p2n_out:
	for line in p.stdout:
	line = line.decode()
	p2n_out.write(line)

	p = subprocess.Popen(['codeml', codeml_ctl], stdout=subprocess.PIPE, stderr=subprocess.PIPE)

	for line in p.stdout:
	line = line.decode()
	logger.debug('codeml stdout: %s' % line.strip())

	for line in p.stderr:
	line = line.decode()
	logger.debug('codeml stderr: %s' % line.strip())
	pass

	result = {}

	with open(codeml_out) as cml:
	pairwise_section = False
	pair = None

	for line in cml:
	if line.startswith('pairwise comparison, codon frequencies'):
	pairwise_section = True
	continue

	if pairwise_section:
	if line.strip() == "":
	continue
	if pair is None:
	sp1, sp2 = line.strip().split(' ... ')
	sp1 = sp1.split('(')[-1].rstrip(')')
	sp2 = sp2.split('(')[-1].rstrip(')')
	sp1, sp2 = sorted((sp1, sp2))
	pair = 'dNdS_%s_%s' % (sp1, sp2)

	elif line.startswith('t='):
	assert pair is not None
	line = line.strip()
	line = re.sub('=\s+', '=', line)
	line = re.sub('\s+=', '=', line)

	vars = dict(v.split('=') for v in line.split())

	for k,v in vars.items():
	vars[k] = float(v)


	if vars['dS'] < 2 and vars['dS'] > 0.01 and vars['dN/dS'] < 10:
	result[pair] = vars

	pair = None

	return result


	def main(args):
	trans_table = CodonTable.unambiguous_dna_by_name["Standard"]

	ports = {}
	ref = {}

	with open(args.ports) as _:
	for line in _:
	name, port, fa = line.strip().split()
	ports[name] = port
	ref[name] = pysam.Fastafile(fa)

	pool = mp.Pool(processes=len(ports))

	header = ['gene', 'tx', 'tx_len', 'padding']

	subfields = ['_loc', '_score', '_pctmatch', '_matchlen', '_cDNAlen', '_cDNAmatch', '_ORFmaxlen', '_ORFstops', '_ORFmatch']

	species = []

	for s in ports:
	header += [s+sf for sf in subfields]
	species.append(s)

	header.append('median_dNdS')

	sp_pairs = {}

	for s1, s2 in itertools.product(species, species):
	if s1 == s2:
	continue

	s1, s2 = sorted((s1, s2))
	pair = 'dNdS_%s_%s' % (s1, s2)

	if pair not in sp_pairs:
	sp_pairs[pair] = 'NA'

	header += list(sp_pairs.keys())

	print(','.join(header))

	tx_seq = {}
	tx_info = {}
	tx_cds = dd(list)

	with open(args.gff) as gff:
	for line in gff:
	if line.startswith('#'):
	continue

	chrom, source, feature, start, end, score, strand, frame, attribs = line.split('\t')

	attr_dict = {}

	for attrib in attribs.split(';'):
	if attrib.strip().split():
	key, val = attrib.strip().split()[:2]
	key = key.strip()
	val = val.strip().strip('"')
	attr_dict[key] = val

	if not chrom.startswith('chr'):
	chrom = 'chr' + chrom

	if chrom not in ref['human'].references:
	logger.warning('chromosome not found: %s' % chrom)
	continue

	start = int(start)
	end = int(end)
	gene = attr_dict['gene_id']
	tx_id = attr_dict['transcript_id']

	if feature == 'transcript':
	f = int(args.flank)
	tx_seq[tx_id] = ref['human'].fetch(chrom, start-f, end+f)
	tx_info[tx_id] = [gene, tx_id, str(end-start), str(args.flank)]

	if feature == 'CDS':
	tx_cds[tx_id].append((chrom, start, end, strand))


	tx_cdna = dd(str)

	for tx_id in tx_cds:
	for cds in tx_cds[tx_id]:
	chrom, start, end, strand = cds
	seq = ref['human'].fetch(chrom, start-1, end)

	if strand == '-':
	seq = rc(seq)

	tx_cdna[tx_id] += seq

	if not os.path.exists('seqout'):
	os.mkdir('seqout')

	orf_disrupted = 0
	orf_noalign = 0
	cdna_noalign = 0
	aa_out_count = 0

	for tx_id in tx_cds:
	results = []

	output = True

	uniq_name = '%s_%s' % (tx_info[tx_id][0], tx_info[tx_id][1])
	cdna_fn = 'seqout/%s_%s.cDNA.fa' % (tx_info[tx_id][0], tx_info[tx_id][1])
	aa_fn = 'seqout/%s_%s.aa.fa' % (tx_info[tx_id][0], tx_info[tx_id][1])

	cdna_fh = open(cdna_fn, 'w')
	aa_fh = open(aa_fn, 'w')

	for s in ports:
	res = pool.apply_async(blat, [tx_seq[tx_id], ports[s]])
	results.append((s, res))

	out = dd(list)

	for s, res in results:
	aln = res.get()

	if len(aln) == 0:
	out[s] = ('N/A','0','0','0','0','0','0','0')

	best = aln[0]
	loc = '%s:%d-%d' % (best.tName, best.tStart, best.tEnd)

	region_seq = ref[s].fetch(best.tName, best.tStart, best.tEnd).upper()

	cdna_aln = exonerate_align(tx_cdna[tx_id], region_seq, cdna=True)

	if s == 'human' and len(cdna_aln) == 0:
	output = False
	cdna_noalign =+ 1

	out[s].append(loc)
	out[s].append('%d' % best.score())
	out[s].append('%.2f' % best.pctmatch())
	out[s].append('%d' % best.refspan())

	orf_aa = None

	if cdna_aln:
	cdna_start = int(cdna_aln[4])
	cdna_end = int(cdna_aln[5])
	cdna_str = cdna_aln[7]
	ref_str = cdna_aln[8]

	if ref_str == '-':
	cdna_start, cdna_end = cdna_end, cdna_start

	cdna_seq = region_seq[cdna_start:cdna_end]

	cigar_ops = ''

	op = None

	for i, cig in enumerate(cdna_aln[9:], 1):
	if i % 2 == 0:
	assert op is not None
	if op in ('M', 'D'):
	cigar_ops += op * int(cig)
	op = None
	else:
	op = cig

	assert len(cigar_ops) == len(cdna_seq), '%d vs %d' % (len(cigar_ops), len(cdna_seq))

	new_cdna = []
	for b,c in zip(list(cdna_seq), list(cigar_ops)):
	if c == 'M':
	new_cdna.append(b)

	out[s].append(str(len(new_cdna))) # _cDNAlen
	out[s].append(cdna_aln[6]) # _cDNAmatch

	cdna_seq = ''.join(new_cdna)

	if cdna_str != ref_str:
	cdna_seq = rc(cdna_seq)

	codon_intact = False

	if len(cdna_seq) % 3 == 0:
	codon_intact = True


	if codon_intact:
	tx_obj = Seq.Seq(tx_cdna[tx_id], generic_dna)
	cdna_obj = Seq.Seq(cdna_seq, generic_dna)

	tx_aa = str(tx_obj.translate(trans_table))
	orf_aa = str(cdna_obj.translate(trans_table))

	aa_align = exonerate_align(tx_aa, orf_aa)
	orf_match = '0.0'
	orf_len = 0

	if aa_align:
	aa_fh.write('>%s\n%s\n' % (s, orf_aa))
	cdna_fh.write('>%s\n%s\n' % (s, cdna_seq))
	aa_out_count += 1
	orf_frags = sorted([len(aa) for aa in orf_aa.split('*')], reverse=True)
	orf_len = str(orf_frags[0]) # _ORFmaxlen
	orf_stops = str(orf_aa.count('*')) # _ORFstops
	orf_match = aa_align[6] # _ORFmatch

	else:
	if s == 'human':
	orf_noalign += 1
	output = False

	out[s].append(str(orf_len)) # _ORFmaxlen
	out[s].append(str(orf_stops)) # _ORFstops
	out[s].append(orf_match) # _ORFmatch
	else:
	out[s].append('0') # _ORFmaxlen
	out[s].append('0') # _ORFstops
	out[s].append('0.0') # _ORFmatch
	if s == 'human':
	orf_disrupted += 1
	output = False

	else:
	out[s].append('0') # _cDNAlen
	out[s].append('0.0') # _cDNAmatch
	out[s].append('0') # _ORFmaxlen
	out[s].append('0') # _ORFstops
	out[s].append('0.0') # _ORFmatch

	cdna_fh.close()
	aa_fh.close()

	if output:
	# reset dnds table
	for pair in sp_pairs:
	sp_pairs[pair] = 'NA'

	median_dnds = 'NA'
	dnds_list = []

	if aa_out_count > 1:
	codeml_res = dnds(aa_fn, cdna_fn, uniq_name)

	for pair in codeml_res:
	assert pair in sp_pairs
	pw_dnds = codeml_res[pair]['dN/dS']
	dnds_list.append(pw_dnds)
	sp_pairs[pair] = str(pw_dnds)

	outline = tx_info[tx_id]

	for s in ports:
	outline.append(','.join(out[s]))

	if len(dnds_list) > 0:
	median_dnds = '%.4f' % np.median(dnds_list)

	outline.append(median_dnds)

	for p in sp_pairs:
	outline.append(sp_pairs[p])

	print(','.join(outline))

	if not output:
	os.remove(cdna_fn)
	os.remove(aa_fn)

	logger.info('human cdna noalign: %d' % cdna_noalign)
	logger.info('human orf noalign: %d' % orf_noalign)
	logger.info('human orf disrupted: %d' % orf_disrupted)

	if __name__ == '__main__':
	parser = argparse.ArgumentParser(description='start BLAT servers via: gfServer start localhost <portnumber> -stepSize=5 path/to.2bit')
	parser.add_argument('--gff', required=True, help='gff, requires gene_id and transcript_id attributes. Should include transcript and CDS entries at a minimum.')
	parser.add_argument('--flank', default=1000, help='flank for initial positioning alignments, default = 1000')
	parser.add_argument('--ports', required=True, help='space-delimited .txt file: name port indexed.fa')
	args = parser.parse_args()
	main(args)