adamewing/reannotate.py

## reannotate.py
#!/usr/bin/env python

'''
Join and re-annotate repeatmasker entities.

Expects input to be from UCSC public mysql server repeatmasker tables, using a command similar to the following:
echo "select genoName, genoStart, genoEnd, repStart, repEnd, strand, repName from rmsk where repClass = 'Other'" | mysql --user=genome --host=genome-mysql.cse.ucsc.edu -A -D hg19 -N | sort -k1,1 -k2,2n

requires: pysam (for Fastafile), biopython, exonerate (https://www.ebi.ac.uk/~guy/exonerate/)

'''

import sys, os
import argparse
import pysam
import string
import random
import subprocess

from Bio import SeqIO

_MAX_OVERLAP = 10

class Repeat:
    def __init__(self, record):
        self.record   = record.strip().split()
        self.chrom    = self.record[0]
        self.start    = int(self.record[1])
        self.end      = int(self.record[2])
        self.repstart = int(self.record[3])
        self.repend   = int(self.record[4])
        self.strand   = self.record[5]
        self.name     = self.record[6]
        self.joined   = False
        self.score    = 0
        self.seq      = None
        self.oldname  = None

    def rename(self, newname):
        self.oldname = self.name
        self.name = newname

    def join(self, other):
        assert self.chrom == other.chrom
        assert self.strand == other.strand
        assert self.overlap(other) < _MAX_OVERLAP
        assert self.repoverlap(other) < _MAX_OVERLAP

        self.start    = min(self.start, other.start)
        self.end      = max(self.end, other.end)
        self.repstart = min(self.repstart, other.repstart)
        self.repend   = max(self.repend, other.repend)
        self.name     = self.name + "," + other.name
        self.joined   = True

    def overlap(self, other):
        ''' overlap in genome coords '''
        if min(self.end, other.end) - max(self.start, other.start) > 0:
            return True
        return False

    def repoverlap(self, other):
        ''' overlap in repeat reference coords '''
        if min(self.repend, other.repend) - max(self.repstart, other.repstart) > 0:
            return True
        return False

    def dist(self, other):
        if self.overlap(other):
            return 0

        return min(abs(self.start - other.end), abs(other.start - self.end))

    def getseq(self, ref):
        ''' ref is a pysam.Fastafile '''
        self.seq = ref.fetch(self.chrom, self.start, self.end)
        if self.strand == '-':
            self._rc()

    def __str__(self):
        out = '\t'.join(map(str, (self.chrom, self.start, self.end, self.name, self.score, self.strand)))
        return out

    def _rc(self):
        complements = string.maketrans('acgtrymkbdhvACGTRYMKBDHV', 'tgcayrkmvhdbTGCAYRKMVHDB')
        self.seq = self.seq.translate(complements)[::-1]


def align(qryseq, refseq):
    rnd = str(random.random())
    tgtfa = 'tmp.' + rnd + '.tgt.fa'
    qryfa = 'tmp.' + rnd + '.qry.fa'

    tgt = open(tgtfa, 'w')
    qry = open(qryfa, 'w')

    tgt.write('>ref' + '\n' + refseq + '\n')
    qry.write('>qry' + '\n' + qryseq + '\n')

    tgt.close()
    qry.close()

    cmd = ['exonerate', '--bestn', '1', '-m', 'affine:local', '--showalignment','0', '--ryo', 'SUMMARY\t%s\t%qab\t%qae\t%tab\t%tae\n', qryfa, tgtfa]
    p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

    best = []
    topscore = 0

    for pline in p.stdout.readlines():
        if pline.startswith('SUMMARY'):
            c = pline.strip().split()
            if int(c[1]) > topscore:
                topscore = int(c[1])
                best = int(c[1])

    os.remove(tgtfa)
    os.remove(qryfa)

    return best


def bestalign(qryseq, refdict):
    best_score = 0
    best_ref   = None

    for ref, seq in refdict.iteritems():
        score = align(qryseq, seq)
        if score > best_score:
            best_score = score
            best_ref = ref

    return best_ref, best_score


def main(args):
    replist = []
    beforecount = 0

    with open(args.annotation, 'r') as reps:
        lastrep = None
        for line in reps:
            beforecount += 1
            rep = Repeat(line)
            if lastrep is not None:
                if lastrep.chrom != rep.chrom:
                    replist.append(lastrep)
                    lastrep = None
                else:
                    if rep.dist(lastrep) < int(args.mindist):
                        if lastrep.strand == rep.strand and lastrep.repoverlap(rep) < _MAX_OVERLAP:
                            rep.join(lastrep)
                    else:
                        replist.append(lastrep)

            lastrep = rep
        replist.append(lastrep)

        sys.stderr.write("joined " + str(beforecount) + " repeats into " + str(len(replist)) + "\n")

        sys.stderr.write("getting sequences...\n")

        ref = pysam.Fastafile(args.genome)
        for rep in replist:
            rep.getseq(ref)

        sys.stderr.write("aligning to reference repeats...\n")
        refreps = {}

        for rec in SeqIO.parse(args.repref, 'fasta'):
            refreps[rec.id] = str(rec.seq)

        for rep in replist:
            assert rep.seq is not None
            if rep.joined:
                bestref, score = bestalign(rep.seq, refreps)
                rep.rename(bestref)
            try:
                rep.score = int(score/10)
            except Exception, e:
                sys.stderr.write("annotation: " + args.annotation + " warning, could not compute score for: " + str(rep) + "\n")

            print rep


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Join and reannotate repeatmasker entities')
    parser.add_argument('-a', '--annotation', dest='annotation', required=True, help="annotation: genoName, genoStart, genoEnd, repStart, repEnd, repName (sorted!)")
    parser.add_argument('-g', '--genome',     dest='genome',     required=True, help="reference genome indexed with samtools faidx")
    parser.add_argument('-r', '--repref',     dest='repref',     required=True, help="repeat reference sequences")
    parser.add_argument('-d', '--dist',       dest='mindist',    required=False, default=1000, help="search distance")
    args = parser.parse_args()
    main(args)
	#!/usr/bin/env python

	'''
	Join and re-annotate repeatmasker entities.

	Expects input to be from UCSC public mysql server repeatmasker tables, using a command similar to the following:
	echo "select genoName, genoStart, genoEnd, repStart, repEnd, strand, repName from rmsk where repClass = 'Other'" \| mysql --user=genome --host=genome-mysql.cse.ucsc.edu -A -D hg19 -N \| sort -k1,1 -k2,2n

	requires: pysam (for Fastafile), biopython, exonerate (https://www.ebi.ac.uk/~guy/exonerate/)

	'''

	import sys, os
	import argparse
	import pysam
	import string
	import random
	import subprocess

	from Bio import SeqIO

	_MAX_OVERLAP = 10

	class Repeat:
	def __init__(self, record):
	self.record = record.strip().split()
	self.chrom = self.record[0]
	self.start = int(self.record[1])
	self.end = int(self.record[2])
	self.repstart = int(self.record[3])
	self.repend = int(self.record[4])
	self.strand = self.record[5]
	self.name = self.record[6]
	self.joined = False
	self.score = 0
	self.seq = None
	self.oldname = None

	def rename(self, newname):
	self.oldname = self.name
	self.name = newname

	def join(self, other):
	assert self.chrom == other.chrom
	assert self.strand == other.strand
	assert self.overlap(other) < _MAX_OVERLAP
	assert self.repoverlap(other) < _MAX_OVERLAP

	self.start = min(self.start, other.start)
	self.end = max(self.end, other.end)
	self.repstart = min(self.repstart, other.repstart)
	self.repend = max(self.repend, other.repend)
	self.name = self.name + "," + other.name
	self.joined = True

	def overlap(self, other):
	''' overlap in genome coords '''
	if min(self.end, other.end) - max(self.start, other.start) > 0:
	return True
	return False

	def repoverlap(self, other):
	''' overlap in repeat reference coords '''
	if min(self.repend, other.repend) - max(self.repstart, other.repstart) > 0:
	return True
	return False

	def dist(self, other):
	if self.overlap(other):
	return 0

	return min(abs(self.start - other.end), abs(other.start - self.end))

	def getseq(self, ref):
	''' ref is a pysam.Fastafile '''
	self.seq = ref.fetch(self.chrom, self.start, self.end)
	if self.strand == '-':
	self._rc()

	def __str__(self):
	out = '\t'.join(map(str, (self.chrom, self.start, self.end, self.name, self.score, self.strand)))
	return out

	def _rc(self):
	complements = string.maketrans('acgtrymkbdhvACGTRYMKBDHV', 'tgcayrkmvhdbTGCAYRKMVHDB')
	self.seq = self.seq.translate(complements)[::-1]


	def align(qryseq, refseq):
	rnd = str(random.random())
	tgtfa = 'tmp.' + rnd + '.tgt.fa'
	qryfa = 'tmp.' + rnd + '.qry.fa'

	tgt = open(tgtfa, 'w')
	qry = open(qryfa, 'w')

	tgt.write('>ref' + '\n' + refseq + '\n')
	qry.write('>qry' + '\n' + qryseq + '\n')

	tgt.close()
	qry.close()

	cmd = ['exonerate', '--bestn', '1', '-m', 'affine:local', '--showalignment','0', '--ryo', 'SUMMARY\t%s\t%qab\t%qae\t%tab\t%tae\n', qryfa, tgtfa]
	p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

	best = []
	topscore = 0

	for pline in p.stdout.readlines():
	if pline.startswith('SUMMARY'):
	c = pline.strip().split()
	if int(c[1]) > topscore:
	topscore = int(c[1])
	best = int(c[1])

	os.remove(tgtfa)
	os.remove(qryfa)

	return best


	def bestalign(qryseq, refdict):
	best_score = 0
	best_ref = None

	for ref, seq in refdict.iteritems():
	score = align(qryseq, seq)
	if score > best_score:
	best_score = score
	best_ref = ref

	return best_ref, best_score


	def main(args):
	replist = []
	beforecount = 0

	with open(args.annotation, 'r') as reps:
	lastrep = None
	for line in reps:
	beforecount += 1
	rep = Repeat(line)
	if lastrep is not None:
	if lastrep.chrom != rep.chrom:
	replist.append(lastrep)
	lastrep = None
	else:
	if rep.dist(lastrep) < int(args.mindist):
	if lastrep.strand == rep.strand and lastrep.repoverlap(rep) < _MAX_OVERLAP:
	rep.join(lastrep)
	else:
	replist.append(lastrep)

	lastrep = rep
	replist.append(lastrep)

	sys.stderr.write("joined " + str(beforecount) + " repeats into " + str(len(replist)) + "\n")

	sys.stderr.write("getting sequences...\n")

	ref = pysam.Fastafile(args.genome)
	for rep in replist:
	rep.getseq(ref)

	sys.stderr.write("aligning to reference repeats...\n")
	refreps = {}

	for rec in SeqIO.parse(args.repref, 'fasta'):
	refreps[rec.id] = str(rec.seq)

	for rep in replist:
	assert rep.seq is not None
	if rep.joined:
	bestref, score = bestalign(rep.seq, refreps)
	rep.rename(bestref)
	try:
	rep.score = int(score/10)
	except Exception, e:
	sys.stderr.write("annotation: " + args.annotation + " warning, could not compute score for: " + str(rep) + "\n")

	print rep


	if __name__ == '__main__':
	parser = argparse.ArgumentParser(description='Join and reannotate repeatmasker entities')
	parser.add_argument('-a', '--annotation', dest='annotation', required=True, help="annotation: genoName, genoStart, genoEnd, repStart, repEnd, repName (sorted!)")
	parser.add_argument('-g', '--genome', dest='genome', required=True, help="reference genome indexed with samtools faidx")
	parser.add_argument('-r', '--repref', dest='repref', required=True, help="repeat reference sequences")
	parser.add_argument('-d', '--dist', dest='mindist', required=False, default=1000, help="search distance")
	args = parser.parse_args()
	main(args)