Skip to content

Instantly share code, notes, and snippets.

@adamewing
Last active March 15, 2021 06:40
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save adamewing/3a4cfa8eb1a333ee9c497538ce30b6db to your computer and use it in GitHub Desktop.
Save adamewing/3a4cfa8eb1a333ee9c497538ce30b6db to your computer and use it in GitHub Desktop.
Script for conservation table for Troskie et al.
#!/usr/bin/env python
import sys
import os
import argparse
import subprocess
import pysam
import tempfile
import multiprocessing as mp
import re
import itertools
from collections import defaultdict as dd
from uuid import uuid4
from time import sleep
from math import log
from Bio import Seq, SeqIO
from Bio.Alphabet import generic_dna
from Bio.Data import CodonTable
import numpy as np
import logging
FORMAT = '%(asctime)s %(message)s'
logging.basicConfig(format=FORMAT)
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
class Block:
def __init__(self, tstart, tend):
self.tstart = min(int(tstart), int(tend))
self.tend = max(int(tstart), int(tend))
def __len__(self):
return self.tend - self.tstart
def __str__(self):
return str(self.tstart) + ' ' + str(self.tend)
def __lt__(self, other):
return len(self) > len(other)
class PSL:
def __init__(self, rec):
# psl spec: http://www.ensembl.org/info/website/upload/psl.html
(self.matches, self.misMatches, self.repMatches, self.nCount, self.qNumInsert, self.qBaseInsert,
self.tNumInsert, self.tBaseInsert, self.strand, self.qName, self.qSize, self.qStart, self.qEnd,
self.tName, self.tSize, self.tStart, self.tEnd, self.blockCount, self.blockSizes, self.qStarts,
self.tStarts) = rec.strip().split()
self.tBlocks = []
for bsize, tstart in zip(self.blockSizes.split(',')[:-1], self.tStarts.split(',')[:-1]): # [:-1] due to trailing comma
self.tBlocks.append(Block(int(tstart), int(tstart) + int(bsize)))
self.tBlocks.sort()
self.tStart, self.tEnd, self.qStart, self.qEnd = map(int, (self.tStart, self.tEnd, self.qStart, self.qEnd))
self.qSize, self.tSize = map(int, (self.qSize, self.tSize))
if self.qStart > self.qEnd:
self.qStart, self.qEnd = self.qEnd, self.qStart
if self.tStart > self.tEnd:
self.tStart, self.tEnd = self.tEnd, self.tStart
def match(self, chrom, pos, window=0):
''' return True if chrom:pos intersects BLAT hit +/- window '''
chrom = chrom.replace('chr', '')
if chrom != self.tName:
return False
if int(pos) >= int(self.tStart)-window and int(pos) <= int(self.tEnd)+window:
return True
return False
def refspan(self):
''' return footprint of match relative to referece genome '''
return self.tEnd - self.tStart
def score(self):
''' adapted from https://genome.ucsc.edu/FAQ/FAQblat.html#blat4 '''
return (int(self.matches) + (int(self.repMatches)>>1)) - int(self.misMatches) - int(self.qNumInsert) - int(self.tNumInsert)
def pctmatch(self):
''' adapted from https://genome.ucsc.edu/FAQ/FAQblat.html#blat4 '''
qAliSize = int(self.qEnd) - int(self.qStart)
tAliSize = int(self.tEnd) - int(self.tStart)
if min(qAliSize, tAliSize) <= 0:
return 0.0
sizeDif = abs(qAliSize - tAliSize)
total = int(self.matches) + int(self.repMatches) + int(self.misMatches)
if total > 0:
return 1.0-float((int(self.misMatches) + int(self.qNumInsert) + int(self.tNumInsert) + round(3*log(1+sizeDif)))) / float(total)
return 0.0
def sumblocks(self):
return sum([len(b) for b in self.tBlocks])
def __lt__(self, other):
''' used for ranking BLAT hits '''
return self.score() > other.score()
def rc(dna):
''' reverse complement '''
complements = str.maketrans('acgtrymkbdhvACGTRYMKBDHV', 'tgcayrkmvhdbTGCAYRKMVHDB')
return dna.translate(complements)[::-1]
def exonerate_align(qryseq, refseq, cdna=False, tmpdir='/tmp', minmatch=80.0):
rnd = str(uuid4())
tgtfa = tmpdir + '/tmp.' + rnd + '.tgt.fa'
qryfa = tmpdir + '/tmp.' + rnd + '.qry.fa'
tgt = open(tgtfa, 'w')
qry = open(qryfa, 'w')
tgt.write('>ref' + '\n' + refseq + '\n')
qry.write('>qry' + '\n' + qryseq + '\n')
tgt.close()
qry.close()
cmd = ['exonerate', '--bestn', '1', '-m', 'ungapped', '--showalignment', '1', '--ryo', 'ALN' + '\t%s\t%qab\t%qae\t%tab\t%tae\t%pi\t%qS\t%tS\t%C\n', qryfa, tgtfa]
if cdna:
cmd = ['exonerate', '--bestn', '1', '-m', 'cdna2genome', '--showalignment', '1', '--ryo', 'ALN' + '\t%s\t%qab\t%qae\t%tab\t%tae\t%pi\t%qS\t%tS\t%C\n', qryfa, tgtfa]
p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
best = []
topscore = 0
for pline in p.stdout.readlines():
pline = pline.decode()
if pline.startswith('ALN'):
c = pline.strip().split()
if int(c[1]) > topscore and float(c[6]) >= minmatch:
topscore = int(c[1])
best = c
os.remove(tgtfa)
os.remove(qryfa)
return best
def blat(seq, port, minScore=0, maxIntron=None):
''' BLAT using gfClient utility '''
fasta_tmp_fn = str(uuid4()) + '.fa'
with open(fasta_tmp_fn, 'w') as fasta_tmp:
fasta_tmp.write('>query\n%s\n' % seq)
cmd = ['gfClient', 'localhost', str(port), '-nohead']
if maxIntron is not None:
cmd.append('-maxIntron=' + str(maxIntron))
if minScore is not None:
cmd.append('-minScore=' + str(minScore))
psl_tmp_fn = str(uuid4()) + '.psl'
cmd += ['.', fasta_tmp_fn, psl_tmp_fn]
FNULL = open(os.devnull, 'w')
p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=FNULL)
result = []
for out_line in p.stdout:
out_line = out_line.decode()
with open(psl_tmp_fn) as psl:
for rec in psl:
result.append(PSL(rec))
os.remove(psl_tmp_fn)
os.remove(fasta_tmp_fn)
return sorted(result)
def write_codeml_ctl(name):
with open(name + '.codeml.ctl', 'w') as out:
out.write('seqfile = %s.pal2nal\n' % name)
out.write('outfile = %s.codeml.txt\n' % name)
out.write('noisy = 0\n')
out.write('verbose = 0\n')
out.write('runmode = -2\n')
out.write('seqtype = 1\n')
out.write('CodonFreq = 2\n')
out.write('model = 1\n')
out.write('NSsites = 0\n')
out.write('icode = 0\n')
out.write('fix_kappa = 1\n')
out.write('kappa = 1\n')
out.write('fix_omega = 0\n')
out.write('omega = 0.5\n')
return name + '.codeml.ctl', name + '.codeml.txt'
def dnds(aa_fa, nuc_fa, name):
FNULL = open(os.devnull, 'w')
codeml_ctl, codeml_out = write_codeml_ctl(name)
aln_out = name + '.aa.aln.fa'
clustalo_cmd = ['clustalo', '-i', aa_fa, '-o', aln_out, '--force']
p = subprocess.Popen(clustalo_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
for line in p.stdout:
line = line.decode()
logger.debug('clustalo stdout: %s' % line.strip())
for line in p.stderr:
line = line.decode()
logger.debug('clustalo stderr: %s' % line.strip())
pal2nal_out = name + '.pal2nal'
pal2nal_cmd = ['pal2nal.pl', aln_out, nuc_fa, '-output', 'paml', '-nogap']
p = subprocess.Popen(pal2nal_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
with open(pal2nal_out, 'w') as p2n_out:
for line in p.stdout:
line = line.decode()
p2n_out.write(line)
p = subprocess.Popen(['codeml', codeml_ctl], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
for line in p.stdout:
line = line.decode()
logger.debug('codeml stdout: %s' % line.strip())
for line in p.stderr:
line = line.decode()
logger.debug('codeml stderr: %s' % line.strip())
pass
result = {}
with open(codeml_out) as cml:
pairwise_section = False
pair = None
for line in cml:
if line.startswith('pairwise comparison, codon frequencies'):
pairwise_section = True
continue
if pairwise_section:
if line.strip() == "":
continue
if pair is None:
sp1, sp2 = line.strip().split(' ... ')
sp1 = sp1.split('(')[-1].rstrip(')')
sp2 = sp2.split('(')[-1].rstrip(')')
sp1, sp2 = sorted((sp1, sp2))
pair = 'dNdS_%s_%s' % (sp1, sp2)
elif line.startswith('t='):
assert pair is not None
line = line.strip()
line = re.sub('=\s+', '=', line)
line = re.sub('\s+=', '=', line)
vars = dict(v.split('=') for v in line.split())
for k,v in vars.items():
vars[k] = float(v)
if vars['dS'] < 2 and vars['dS'] > 0.01 and vars['dN/dS'] < 10:
result[pair] = vars
pair = None
return result
def main(args):
trans_table = CodonTable.unambiguous_dna_by_name["Standard"]
ports = {}
ref = {}
with open(args.ports) as _:
for line in _:
name, port, fa = line.strip().split()
ports[name] = port
ref[name] = pysam.Fastafile(fa)
pool = mp.Pool(processes=len(ports))
header = ['gene', 'tx', 'tx_len', 'padding']
subfields = ['_loc', '_score', '_pctmatch', '_matchlen', '_cDNAlen', '_cDNAmatch', '_ORFmaxlen', '_ORFstops', '_ORFmatch']
species = []
for s in ports:
header += [s+sf for sf in subfields]
species.append(s)
header.append('median_dNdS')
sp_pairs = {}
for s1, s2 in itertools.product(species, species):
if s1 == s2:
continue
s1, s2 = sorted((s1, s2))
pair = 'dNdS_%s_%s' % (s1, s2)
if pair not in sp_pairs:
sp_pairs[pair] = 'NA'
header += list(sp_pairs.keys())
print(','.join(header))
tx_seq = {}
tx_info = {}
tx_cds = dd(list)
with open(args.gff) as gff:
for line in gff:
if line.startswith('#'):
continue
chrom, source, feature, start, end, score, strand, frame, attribs = line.split('\t')
attr_dict = {}
for attrib in attribs.split(';'):
if attrib.strip().split():
key, val = attrib.strip().split()[:2]
key = key.strip()
val = val.strip().strip('"')
attr_dict[key] = val
if not chrom.startswith('chr'):
chrom = 'chr' + chrom
if chrom not in ref['human'].references:
logger.warning('chromosome not found: %s' % chrom)
continue
start = int(start)
end = int(end)
gene = attr_dict['gene_id']
tx_id = attr_dict['transcript_id']
if feature == 'transcript':
f = int(args.flank)
tx_seq[tx_id] = ref['human'].fetch(chrom, start-f, end+f)
tx_info[tx_id] = [gene, tx_id, str(end-start), str(args.flank)]
if feature == 'CDS':
tx_cds[tx_id].append((chrom, start, end, strand))
tx_cdna = dd(str)
for tx_id in tx_cds:
for cds in tx_cds[tx_id]:
chrom, start, end, strand = cds
seq = ref['human'].fetch(chrom, start-1, end)
if strand == '-':
seq = rc(seq)
tx_cdna[tx_id] += seq
if not os.path.exists('seqout'):
os.mkdir('seqout')
orf_disrupted = 0
orf_noalign = 0
cdna_noalign = 0
aa_out_count = 0
for tx_id in tx_cds:
results = []
output = True
uniq_name = '%s_%s' % (tx_info[tx_id][0], tx_info[tx_id][1])
cdna_fn = 'seqout/%s_%s.cDNA.fa' % (tx_info[tx_id][0], tx_info[tx_id][1])
aa_fn = 'seqout/%s_%s.aa.fa' % (tx_info[tx_id][0], tx_info[tx_id][1])
cdna_fh = open(cdna_fn, 'w')
aa_fh = open(aa_fn, 'w')
for s in ports:
res = pool.apply_async(blat, [tx_seq[tx_id], ports[s]])
results.append((s, res))
out = dd(list)
for s, res in results:
aln = res.get()
if len(aln) == 0:
out[s] = ('N/A','0','0','0','0','0','0','0')
best = aln[0]
loc = '%s:%d-%d' % (best.tName, best.tStart, best.tEnd)
region_seq = ref[s].fetch(best.tName, best.tStart, best.tEnd).upper()
cdna_aln = exonerate_align(tx_cdna[tx_id], region_seq, cdna=True)
if s == 'human' and len(cdna_aln) == 0:
output = False
cdna_noalign =+ 1
out[s].append(loc)
out[s].append('%d' % best.score())
out[s].append('%.2f' % best.pctmatch())
out[s].append('%d' % best.refspan())
orf_aa = None
if cdna_aln:
cdna_start = int(cdna_aln[4])
cdna_end = int(cdna_aln[5])
cdna_str = cdna_aln[7]
ref_str = cdna_aln[8]
if ref_str == '-':
cdna_start, cdna_end = cdna_end, cdna_start
cdna_seq = region_seq[cdna_start:cdna_end]
cigar_ops = ''
op = None
for i, cig in enumerate(cdna_aln[9:], 1):
if i % 2 == 0:
assert op is not None
if op in ('M', 'D'):
cigar_ops += op * int(cig)
op = None
else:
op = cig
assert len(cigar_ops) == len(cdna_seq), '%d vs %d' % (len(cigar_ops), len(cdna_seq))
new_cdna = []
for b,c in zip(list(cdna_seq), list(cigar_ops)):
if c == 'M':
new_cdna.append(b)
out[s].append(str(len(new_cdna))) # _cDNAlen
out[s].append(cdna_aln[6]) # _cDNAmatch
cdna_seq = ''.join(new_cdna)
if cdna_str != ref_str:
cdna_seq = rc(cdna_seq)
codon_intact = False
if len(cdna_seq) % 3 == 0:
codon_intact = True
if codon_intact:
tx_obj = Seq.Seq(tx_cdna[tx_id], generic_dna)
cdna_obj = Seq.Seq(cdna_seq, generic_dna)
tx_aa = str(tx_obj.translate(trans_table))
orf_aa = str(cdna_obj.translate(trans_table))
aa_align = exonerate_align(tx_aa, orf_aa)
orf_match = '0.0'
orf_len = 0
if aa_align:
aa_fh.write('>%s\n%s\n' % (s, orf_aa))
cdna_fh.write('>%s\n%s\n' % (s, cdna_seq))
aa_out_count += 1
orf_frags = sorted([len(aa) for aa in orf_aa.split('*')], reverse=True)
orf_len = str(orf_frags[0]) # _ORFmaxlen
orf_stops = str(orf_aa.count('*')) # _ORFstops
orf_match = aa_align[6] # _ORFmatch
else:
if s == 'human':
orf_noalign += 1
output = False
out[s].append(str(orf_len)) # _ORFmaxlen
out[s].append(str(orf_stops)) # _ORFstops
out[s].append(orf_match) # _ORFmatch
else:
out[s].append('0') # _ORFmaxlen
out[s].append('0') # _ORFstops
out[s].append('0.0') # _ORFmatch
if s == 'human':
orf_disrupted += 1
output = False
else:
out[s].append('0') # _cDNAlen
out[s].append('0.0') # _cDNAmatch
out[s].append('0') # _ORFmaxlen
out[s].append('0') # _ORFstops
out[s].append('0.0') # _ORFmatch
cdna_fh.close()
aa_fh.close()
if output:
# reset dnds table
for pair in sp_pairs:
sp_pairs[pair] = 'NA'
median_dnds = 'NA'
dnds_list = []
if aa_out_count > 1:
codeml_res = dnds(aa_fn, cdna_fn, uniq_name)
for pair in codeml_res:
assert pair in sp_pairs
pw_dnds = codeml_res[pair]['dN/dS']
dnds_list.append(pw_dnds)
sp_pairs[pair] = str(pw_dnds)
outline = tx_info[tx_id]
for s in ports:
outline.append(','.join(out[s]))
if len(dnds_list) > 0:
median_dnds = '%.4f' % np.median(dnds_list)
outline.append(median_dnds)
for p in sp_pairs:
outline.append(sp_pairs[p])
print(','.join(outline))
if not output:
os.remove(cdna_fn)
os.remove(aa_fn)
logger.info('human cdna noalign: %d' % cdna_noalign)
logger.info('human orf noalign: %d' % orf_noalign)
logger.info('human orf disrupted: %d' % orf_disrupted)
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='start BLAT servers via: gfServer start localhost <portnumber> -stepSize=5 path/to.2bit')
parser.add_argument('--gff', required=True, help='gff, requires gene_id and transcript_id attributes. Should include transcript and CDS entries at a minimum.')
parser.add_argument('--flank', default=1000, help='flank for initial positioning alignments, default = 1000')
parser.add_argument('--ports', required=True, help='space-delimited .txt file: name port indexed.fa')
args = parser.parse_args()
main(args)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment