Last active
March 15, 2021 06:40
-
-
Save adamewing/3a4cfa8eb1a333ee9c497538ce30b6db to your computer and use it in GitHub Desktop.
Script for conservation table for Troskie et al.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import sys | |
import os | |
import argparse | |
import subprocess | |
import pysam | |
import tempfile | |
import multiprocessing as mp | |
import re | |
import itertools | |
from collections import defaultdict as dd | |
from uuid import uuid4 | |
from time import sleep | |
from math import log | |
from Bio import Seq, SeqIO | |
from Bio.Alphabet import generic_dna | |
from Bio.Data import CodonTable | |
import numpy as np | |
import logging | |
# module-level logging: timestamp-prefixed messages at INFO level
FORMAT = '%(asctime)s %(message)s'
logging.basicConfig(format=FORMAT)
logger = logging.getLogger(__name__)  # shared by all functions in this script
logger.setLevel(logging.INFO)
class Block:
    ''' One aligned block on the target sequence, stored as a half-open
        integer interval with tstart <= tend. '''

    def __init__(self, tstart, tend):
        # normalise so tstart <= tend regardless of the order given
        self.tstart, self.tend = sorted((int(tstart), int(tend)))

    def __len__(self):
        return self.tend - self.tstart

    def __str__(self):
        return '%d %d' % (self.tstart, self.tend)

    def __lt__(self, other):
        # inverted on purpose: sort() then puts the longest block first
        return len(self) > len(other)
class PSL:
    ''' One BLAT hit parsed from a PSL-format record.

    Parses the 21 whitespace-separated PSL columns, builds the list of
    target-side alignment blocks, and normalises start/end coordinates so
    that start <= end. Instances sort best-score-first (see __lt__).
    '''

    def __init__(self, rec):
        # psl spec: http://www.ensembl.org/info/website/upload/psl.html
        (self.matches, self.misMatches, self.repMatches, self.nCount, self.qNumInsert, self.qBaseInsert,
        self.tNumInsert, self.tBaseInsert, self.strand, self.qName, self.qSize, self.qStart, self.qEnd,
        self.tName, self.tSize, self.tStart, self.tEnd, self.blockCount, self.blockSizes, self.qStarts,
        self.tStarts) = rec.strip().split()

        # per-block footprints on the target; Block.__lt__ is inverted, so
        # sort() puts the longest block first
        self.tBlocks = []
        for bsize, tstart in zip(self.blockSizes.split(',')[:-1], self.tStarts.split(',')[:-1]): # [:-1] due to trailing comma
            self.tBlocks.append(Block(int(tstart), int(tstart) + int(bsize)))

        self.tBlocks.sort()

        # PSL fields arrive as strings; convert the coordinate fields to int
        self.tStart, self.tEnd, self.qStart, self.qEnd = map(int, (self.tStart, self.tEnd, self.qStart, self.qEnd))
        self.qSize, self.tSize = map(int, (self.qSize, self.tSize))

        # normalise so start <= end on both query and target
        if self.qStart > self.qEnd:
            self.qStart, self.qEnd = self.qEnd, self.qStart

        if self.tStart > self.tEnd:
            self.tStart, self.tEnd = self.tEnd, self.tStart

    def match(self, chrom, pos, window=0):
        ''' return True if chrom:pos intersects BLAT hit +/- window '''
        # NOTE(review): the query chrom has its 'chr' prefix stripped but
        # self.tName is compared as-is — this assumes the BLAT database uses
        # unprefixed chromosome names; confirm against the .2bit in use.
        chrom = chrom.replace('chr', '')

        if chrom != self.tName:
            return False

        if int(pos) >= int(self.tStart)-window and int(pos) <= int(self.tEnd)+window:
            return True

        return False

    def refspan(self):
        ''' return footprint of match relative to reference genome '''
        return self.tEnd - self.tStart

    def score(self):
        ''' adapted from https://genome.ucsc.edu/FAQ/FAQblat.html#blat4 '''
        # repMatches count at half weight; each insertion event on either
        # side subtracts one
        return (int(self.matches) + (int(self.repMatches)>>1)) - int(self.misMatches) - int(self.qNumInsert) - int(self.tNumInsert)

    def pctmatch(self):
        ''' fraction of aligned bases matching (0.0-1.0),
            adapted from https://genome.ucsc.edu/FAQ/FAQblat.html#blat4 '''
        qAliSize = int(self.qEnd) - int(self.qStart)
        tAliSize = int(self.tEnd) - int(self.tStart)

        if min(qAliSize, tAliSize) <= 0:
            return 0.0

        sizeDif = abs(qAliSize - tAliSize)
        total = int(self.matches) + int(self.repMatches) + int(self.misMatches)

        if total > 0:
            return 1.0-float((int(self.misMatches) + int(self.qNumInsert) + int(self.tNumInsert) + round(3*log(1+sizeDif)))) / float(total)

        return 0.0

    def sumblocks(self):
        ''' total length of all target-side alignment blocks '''
        return sum([len(b) for b in self.tBlocks])

    def __lt__(self, other):
        ''' used for ranking BLAT hits: higher score sorts first '''
        return self.score() > other.score()
def rc(dna):
    ''' Return the reverse complement of dna (IUPAC-aware, case-preserving;
        characters outside the IUPAC alphabet pass through unchanged). '''
    pairs = str.maketrans('acgtrymkbdhvACGTRYMKBDHV', 'tgcayrkmvhdbTGCAYRKMVHDB')
    return ''.join(reversed(dna.translate(pairs)))
def exonerate_align(qryseq, refseq, cdna=False, tmpdir='/tmp', minmatch=80.0):
    ''' Align qryseq against refseq with exonerate.

    cdna=False uses the ungapped model; cdna=True uses cdna2genome.
    Returns the split fields of the best-scoring ALN roll-your-own line
    whose percent identity (field 6) is >= minmatch, or [] if none.
    '''
    rnd = str(uuid4())
    tgtfa = tmpdir + '/tmp.' + rnd + '.tgt.fa'
    qryfa = tmpdir + '/tmp.' + rnd + '.qry.fa'

    best = []

    try:
        # write the two single-record FASTA inputs (with: no leaked handles)
        with open(tgtfa, 'w') as tgt:
            tgt.write('>ref' + '\n' + refseq + '\n')
        with open(qryfa, 'w') as qry:
            qry.write('>qry' + '\n' + qryseq + '\n')

        model = 'cdna2genome' if cdna else 'ungapped'
        cmd = ['exonerate', '--bestn', '1', '-m', model, '--showalignment', '1',
               '--ryo', 'ALN' + '\t%s\t%qab\t%qae\t%tab\t%tae\t%pi\t%qS\t%tS\t%C\n',
               qryfa, tgtfa]

        p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        # communicate() drains both pipes and reaps the child, avoiding the
        # deadlock possible with readlines() when stderr fills its buffer
        stdout, _ = p.communicate()

        topscore = 0
        for pline in stdout.decode().splitlines():
            if pline.startswith('ALN'):
                c = pline.strip().split()
                # keep the highest raw score that clears the identity cutoff
                if int(c[1]) > topscore and float(c[6]) >= minmatch:
                    topscore = int(c[1])
                    best = c
    finally:
        # remove temp FASTAs even if exonerate or the parse raised
        for fn in (tgtfa, qryfa):
            if os.path.exists(fn):
                os.remove(fn)

    return best
def blat(seq, port, minScore=0, maxIntron=None):
    ''' BLAT seq against a gfServer on localhost:port using the gfClient
        utility; returns PSL hits sorted best-score-first.

    BUGFIX: the original re-read the PSL output file once per gfClient
    stdout line (duplicating every hit), and read nothing at all when
    gfClient wrote no stdout; it also raced the file read against process
    completion and leaked the devnull handle.
    '''
    fasta_tmp_fn = str(uuid4()) + '.fa'
    psl_tmp_fn = str(uuid4()) + '.psl'

    result = []

    try:
        with open(fasta_tmp_fn, 'w') as fasta_tmp:
            fasta_tmp.write('>query\n%s\n' % seq)

        cmd = ['gfClient', 'localhost', str(port), '-nohead']

        if maxIntron is not None:
            cmd.append('-maxIntron=' + str(maxIntron))

        if minScore is not None:
            cmd.append('-minScore=' + str(minScore))

        cmd += ['.', fasta_tmp_fn, psl_tmp_fn]

        with open(os.devnull, 'w') as devnull:
            p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=devnull)
            p.communicate()  # wait until gfClient has finished writing the PSL

        # read the PSL output exactly once, after the process has exited
        if os.path.exists(psl_tmp_fn):
            with open(psl_tmp_fn) as psl:
                for rec in psl:
                    result.append(PSL(rec))
    finally:
        for fn in (fasta_tmp_fn, psl_tmp_fn):
            if os.path.exists(fn):
                os.remove(fn)

    return sorted(result)
def write_codeml_ctl(name):
    ''' Write a codeml control file for pairwise comparison (runmode -2)
        of the codon alignment <name>.pal2nal.

    Returns (control_filename, expected_output_filename).
    '''
    ctl_fn = name + '.codeml.ctl'
    out_fn = name + '.codeml.txt'

    settings = [
        'seqfile = %s.pal2nal' % name,
        'outfile = %s.codeml.txt' % name,
        'noisy = 0',
        'verbose = 0',
        'runmode = -2',
        'seqtype = 1',
        'CodonFreq = 2',
        'model = 1',
        'NSsites = 0',
        'icode = 0',
        'fix_kappa = 1',
        'kappa = 1',
        'fix_omega = 0',
        'omega = 0.5',
    ]

    with open(ctl_fn, 'w') as out:
        out.write('\n'.join(settings) + '\n')

    return ctl_fn, out_fn
def dnds(aa_fa, nuc_fa, name):
    ''' Pairwise dN/dS via clustalo -> pal2nal -> codeml.

    aa_fa  : FASTA of amino-acid sequences (one record per species)
    nuc_fa : FASTA of the matching nucleotide sequences
    name   : basename for all intermediate and output files

    Returns {'dNdS_<sp1>_<sp2>': {stat: float, ...}} keeping only pairs
    with 0.01 < dS < 2 and dN/dS < 10 (saturated / unreliable estimates
    are dropped).

    Fixes vs. original: removed an unused, leaked devnull handle and a
    stray `pass`; regexes are now raw strings (the originals relied on
    deprecated invalid escape sequences); subprocess pipes are drained
    with communicate() to avoid the stdout-then-stderr deadlock.
    '''
    codeml_ctl, codeml_out = write_codeml_ctl(name)

    # 1) align the protein sequences
    aln_out = name + '.aa.aln.fa'
    clustalo_cmd = ['clustalo', '-i', aa_fa, '-o', aln_out, '--force']
    p = subprocess.Popen(clustalo_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    stdout, stderr = p.communicate()
    logger.debug('clustalo stdout: %s' % stdout.decode().strip())
    logger.debug('clustalo stderr: %s' % stderr.decode().strip())

    # 2) back-translate the protein alignment onto the codons (PAML format)
    pal2nal_out = name + '.pal2nal'
    pal2nal_cmd = ['pal2nal.pl', aln_out, nuc_fa, '-output', 'paml', '-nogap']
    p = subprocess.Popen(pal2nal_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    stdout, stderr = p.communicate()
    with open(pal2nal_out, 'w') as p2n_out:
        p2n_out.write(stdout.decode())

    # 3) run codeml on the codon alignment
    p = subprocess.Popen(['codeml', codeml_ctl], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    stdout, stderr = p.communicate()
    logger.debug('codeml stdout: %s' % stdout.decode().strip())
    logger.debug('codeml stderr: %s' % stderr.decode().strip())

    # 4) parse the pairwise section of the codeml report
    result = {}

    with open(codeml_out) as cml:
        pairwise_section = False
        pair = None

        for line in cml:
            if line.startswith('pairwise comparison, codon frequencies'):
                pairwise_section = True
                continue

            if pairwise_section:
                if line.strip() == "":
                    continue

                if pair is None:
                    # pair header line, e.g. "2 (speciesA) ... 1 (speciesB)"
                    sp1, sp2 = line.strip().split(' ... ')
                    sp1 = sp1.split('(')[-1].rstrip(')')
                    sp2 = sp2.split('(')[-1].rstrip(')')
                    sp1, sp2 = sorted((sp1, sp2))  # canonical pair key order
                    pair = 'dNdS_%s_%s' % (sp1, sp2)

                elif line.startswith('t='):
                    assert pair is not None
                    # normalise "k = v" spacing so each stat is one k=v token
                    stats_line = re.sub(r'=\s+', '=', line.strip())
                    stats_line = re.sub(r'\s+=', '=', stats_line)
                    stats = dict(v.split('=') for v in stats_line.split())
                    stats = {k: float(v) for k, v in stats.items()}

                    if stats['dS'] < 2 and stats['dS'] > 0.01 and stats['dN/dS'] < 10:
                        result[pair] = stats

                    pair = None

    return result
def main(args):
    ''' Build a per-transcript conservation table.

    For every transcript in --gff: BLAT the flanked transcript sequence
    against each species' gfServer, re-align the human cDNA into the best
    hit with exonerate, translate the recovered ORF, and (when >= 2
    species yield an ORF) compute pairwise dN/dS with codeml. One CSV row
    per transcript is printed to stdout.

    Fixes vs. original:
      - missing `continue` after the no-BLAT-hit placeholder (crashed on aln[0])
      - that placeholder had 8 fields for 9 columns (CSV misalignment)
      - `cdna_noalign =+ 1` assigned +1 instead of incrementing
      - `orf_stops` could be unbound (NameError) when the AA alignment failed
      - `aa_out_count` accumulated across transcripts instead of per transcript
    '''
    trans_table = CodonTable.unambiguous_dna_by_name["Standard"]

    # species name -> gfServer port and indexed reference FASTA
    ports = {}
    ref = {}

    with open(args.ports) as _:
        for line in _:
            name, port, fa = line.strip().split()
            ports[name] = port
            ref[name] = pysam.Fastafile(fa)

    pool = mp.Pool(processes=len(ports))

    # CSV header: per-species stat columns, then pairwise dN/dS columns
    header = ['gene', 'tx', 'tx_len', 'padding']
    subfields = ['_loc', '_score', '_pctmatch', '_matchlen', '_cDNAlen', '_cDNAmatch', '_ORFmaxlen', '_ORFstops', '_ORFmatch']

    species = []
    for s in ports:
        header += [s+sf for sf in subfields]
        species.append(s)

    header.append('median_dNdS')

    # one column per unordered species pair, keyed dNdS_<sp1>_<sp2>
    sp_pairs = {}
    for s1, s2 in itertools.product(species, species):
        if s1 == s2:
            continue
        s1, s2 = sorted((s1, s2))
        pair = 'dNdS_%s_%s' % (s1, s2)
        if pair not in sp_pairs:
            sp_pairs[pair] = 'NA'

    header += list(sp_pairs.keys())
    print(','.join(header))

    # parse GFF: transcript footprints (+flank) and per-transcript CDS segments
    tx_seq = {}
    tx_info = {}
    tx_cds = dd(list)

    with open(args.gff) as gff:
        for line in gff:
            if line.startswith('#'):
                continue

            chrom, source, feature, start, end, score, strand, frame, attribs = line.split('\t')

            # GTF-style 'key "value";' attributes
            attr_dict = {}
            for attrib in attribs.split(';'):
                if attrib.strip().split():
                    key, val = attrib.strip().split()[:2]
                    attr_dict[key.strip()] = val.strip().strip('"')

            if not chrom.startswith('chr'):
                chrom = 'chr' + chrom

            if chrom not in ref['human'].references:
                logger.warning('chromosome not found: %s' % chrom)
                continue

            start = int(start)
            end = int(end)
            gene = attr_dict['gene_id']
            tx_id = attr_dict['transcript_id']

            if feature == 'transcript':
                f = int(args.flank)
                tx_seq[tx_id] = ref['human'].fetch(chrom, start-f, end+f)
                tx_info[tx_id] = [gene, tx_id, str(end-start), str(args.flank)]

            if feature == 'CDS':
                tx_cds[tx_id].append((chrom, start, end, strand))

    # stitch CDS segments into cDNA (GFF coords are 1-based inclusive,
    # hence start-1 for pysam's 0-based fetch)
    tx_cdna = dd(str)
    for tx_id in tx_cds:
        for cds in tx_cds[tx_id]:
            chrom, start, end, strand = cds
            seq = ref['human'].fetch(chrom, start-1, end)
            if strand == '-':
                seq = rc(seq)
            tx_cdna[tx_id] += seq

    if not os.path.exists('seqout'):
        os.mkdir('seqout')

    orf_disrupted = 0
    orf_noalign = 0
    cdna_noalign = 0

    for tx_id in tx_cds:
        results = []
        output = True
        aa_out_count = 0  # ORFs written for THIS transcript (was accumulated globally)

        uniq_name = '%s_%s' % (tx_info[tx_id][0], tx_info[tx_id][1])
        cdna_fn = 'seqout/%s.cDNA.fa' % uniq_name
        aa_fn = 'seqout/%s.aa.fa' % uniq_name
        cdna_fh = open(cdna_fn, 'w')
        aa_fh = open(aa_fn, 'w')

        # BLAT the flanked transcript against every species in parallel
        for s in ports:
            res = pool.apply_async(blat, [tx_seq[tx_id], ports[s]])
            results.append((s, res))

        out = dd(list)

        for s, res in results:
            aln = res.get()

            if len(aln) == 0:
                # no BLAT hit: placeholder for all 9 subfield columns
                # (was an 8-tuple, and fell through to aln[0] -> IndexError)
                out[s] = ['N/A', '0', '0.0', '0', '0', '0.0', '0', '0', '0.0']
                if s == 'human':
                    output = False  # human hit is required downstream
                continue

            best = aln[0]
            loc = '%s:%d-%d' % (best.tName, best.tStart, best.tEnd)
            region_seq = ref[s].fetch(best.tName, best.tStart, best.tEnd).upper()

            # place the human cDNA inside the BLAT hit region
            cdna_aln = exonerate_align(tx_cdna[tx_id], region_seq, cdna=True)

            if s == 'human' and len(cdna_aln) == 0:
                output = False
                cdna_noalign += 1  # was "=+ 1" (assigned +1 every time)

            out[s].append(loc)                      # _loc
            out[s].append('%d' % best.score())      # _score
            out[s].append('%.2f' % best.pctmatch()) # _pctmatch
            out[s].append('%d' % best.refspan())    # _matchlen

            orf_aa = None

            if cdna_aln:
                cdna_start = int(cdna_aln[4])
                cdna_end = int(cdna_aln[5])
                cdna_str = cdna_aln[7]  # query strand reported by exonerate
                ref_str = cdna_aln[8]   # target strand reported by exonerate

                if ref_str == '-':
                    cdna_start, cdna_end = cdna_end, cdna_start

                cdna_seq = region_seq[cdna_start:cdna_end]

                # expand exonerate's (op, length) pairs to one op char per base
                cigar_ops = ''
                op = None
                for i, cig in enumerate(cdna_aln[9:], 1):
                    if i % 2 == 0:
                        assert op is not None
                        if op in ('M', 'D'):
                            cigar_ops += op * int(cig)
                        op = None
                    else:
                        op = cig

                assert len(cigar_ops) == len(cdna_seq), '%d vs %d' % (len(cigar_ops), len(cdna_seq))

                # keep only matched bases to reconstruct the aligned cDNA
                new_cdna = []
                for b, c in zip(list(cdna_seq), list(cigar_ops)):
                    if c == 'M':
                        new_cdna.append(b)

                out[s].append(str(len(new_cdna)))  # _cDNAlen
                out[s].append(cdna_aln[6])         # _cDNAmatch

                cdna_seq = ''.join(new_cdna)

                if cdna_str != ref_str:
                    cdna_seq = rc(cdna_seq)

                codon_intact = len(cdna_seq) % 3 == 0

                if codon_intact:
                    tx_obj = Seq.Seq(tx_cdna[tx_id], generic_dna)
                    cdna_obj = Seq.Seq(cdna_seq, generic_dna)

                    tx_aa = str(tx_obj.translate(trans_table))
                    orf_aa = str(cdna_obj.translate(trans_table))

                    aa_align = exonerate_align(tx_aa, orf_aa)

                    orf_match = '0.0'
                    orf_len = '0'
                    orf_stops = '0'  # was unbound when aa_align failed

                    if aa_align:
                        aa_fh.write('>%s\n%s\n' % (s, orf_aa))
                        cdna_fh.write('>%s\n%s\n' % (s, cdna_seq))
                        aa_out_count += 1

                        # longest stop-free fragment of the translated ORF
                        orf_frags = sorted([len(aa) for aa in orf_aa.split('*')], reverse=True)
                        orf_len = str(orf_frags[0])
                        orf_stops = str(orf_aa.count('*'))
                        orf_match = aa_align[6]
                    else:
                        if s == 'human':
                            orf_noalign += 1
                            output = False

                    out[s].append(orf_len)    # _ORFmaxlen
                    out[s].append(orf_stops)  # _ORFstops
                    out[s].append(orf_match)  # _ORFmatch
                else:
                    # frame disrupted: recovered cDNA not a multiple of 3
                    out[s].append('0')    # _ORFmaxlen
                    out[s].append('0')    # _ORFstops
                    out[s].append('0.0')  # _ORFmatch

                    if s == 'human':
                        orf_disrupted += 1
                        output = False
            else:
                out[s].append('0')    # _cDNAlen
                out[s].append('0.0')  # _cDNAmatch
                out[s].append('0')    # _ORFmaxlen
                out[s].append('0')    # _ORFstops
                out[s].append('0.0')  # _ORFmatch

        cdna_fh.close()
        aa_fh.close()

        if output:
            # reset the pairwise dN/dS table for this transcript
            for pair in sp_pairs:
                sp_pairs[pair] = 'NA'

            median_dnds = 'NA'
            dnds_list = []

            if aa_out_count > 1:
                codeml_res = dnds(aa_fn, cdna_fn, uniq_name)

                for pair in codeml_res:
                    assert pair in sp_pairs
                    pw_dnds = codeml_res[pair]['dN/dS']
                    dnds_list.append(pw_dnds)
                    sp_pairs[pair] = str(pw_dnds)

            outline = tx_info[tx_id]

            for s in ports:
                outline.append(','.join(out[s]))

            if len(dnds_list) > 0:
                median_dnds = '%.4f' % np.median(dnds_list)
            outline.append(median_dnds)

            for p in sp_pairs:
                outline.append(sp_pairs[p])

            print(','.join(outline))

        if not output:
            # transcript failed a human-side check: discard its FASTAs
            os.remove(cdna_fn)
            os.remove(aa_fn)

    logger.info('human cdna noalign: %d' % cdna_noalign)
    logger.info('human orf noalign: %d' % orf_noalign)
    logger.info('human orf disrupted: %d' % orf_disrupted)
if __name__ == '__main__':
    # NOTE: one BLAT gfServer per species must already be running; the
    # description string below shows the launch command.
    parser = argparse.ArgumentParser(description='start BLAT servers via: gfServer start localhost <portnumber> -stepSize=5 path/to.2bit')
    parser.add_argument('--gff', required=True, help='gff, requires gene_id and transcript_id attributes. Should include transcript and CDS entries at a minimum.')
    parser.add_argument('--flank', default=1000, help='flank for initial positioning alignments, default = 1000')
    parser.add_argument('--ports', required=True, help='space-delimited .txt file: name port indexed.fa')
    args = parser.parse_args()
    main(args)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment