Skip to content

Instantly share code, notes, and snippets.

@adamewing
Created August 11, 2014 00:47
Show Gist options
  • Save adamewing/58bbe8817b1662745bd8 to your computer and use it in GitHub Desktop.
Save adamewing/58bbe8817b1662745bd8 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
'''
Join and re-annotate repeatmasker entities.
Expects input to be from UCSC public mysql server repeatmasker tables, using a command similar to the following:
echo "select genoName, genoStart, genoEnd, repStart, repEnd, strand, repName from rmsk where repClass = 'Other'" | mysql --user=genome --host=genome-mysql.cse.ucsc.edu -A -D hg19 -N | sort -k1,1 -k2,2n
requires: pysam (for Fastafile), biopython, exonerate (https://www.ebi.ac.uk/~guy/exonerate/)
'''
import sys, os
import argparse
import pysam
import string
import random
import subprocess
from Bio import SeqIO
_MAX_OVERLAP = 10
class Repeat:
def __init__(self, record):
self.record = record.strip().split()
self.chrom = self.record[0]
self.start = int(self.record[1])
self.end = int(self.record[2])
self.repstart = int(self.record[3])
self.repend = int(self.record[4])
self.strand = self.record[5]
self.name = self.record[6]
self.joined = False
self.score = 0
self.seq = None
self.oldname = None
def rename(self, newname):
self.oldname = self.name
self.name = newname
def join(self, other):
assert self.chrom == other.chrom
assert self.strand == other.strand
assert self.overlap(other) < _MAX_OVERLAP
assert self.repoverlap(other) < _MAX_OVERLAP
self.start = min(self.start, other.start)
self.end = max(self.end, other.end)
self.repstart = min(self.repstart, other.repstart)
self.repend = max(self.repend, other.repend)
self.name = self.name + "," + other.name
self.joined = True
def overlap(self, other):
''' overlap in genome coords '''
if min(self.end, other.end) - max(self.start, other.start) > 0:
return True
return False
def repoverlap(self, other):
''' overlap in repeat reference coords '''
if min(self.repend, other.repend) - max(self.repstart, other.repstart) > 0:
return True
return False
def dist(self, other):
if self.overlap(other):
return 0
return min(abs(self.start - other.end), abs(other.start - self.end))
def getseq(self, ref):
''' ref is a pysam.Fastafile '''
self.seq = ref.fetch(self.chrom, self.start, self.end)
if self.strand == '-':
self._rc()
def __str__(self):
out = '\t'.join(map(str, (self.chrom, self.start, self.end, self.name, self.score, self.strand)))
return out
def _rc(self):
complements = string.maketrans('acgtrymkbdhvACGTRYMKBDHV', 'tgcayrkmvhdbTGCAYRKMVHDB')
self.seq = self.seq.translate(complements)[::-1]
def align(qryseq, refseq):
rnd = str(random.random())
tgtfa = 'tmp.' + rnd + '.tgt.fa'
qryfa = 'tmp.' + rnd + '.qry.fa'
tgt = open(tgtfa, 'w')
qry = open(qryfa, 'w')
tgt.write('>ref' + '\n' + refseq + '\n')
qry.write('>qry' + '\n' + qryseq + '\n')
tgt.close()
qry.close()
cmd = ['exonerate', '--bestn', '1', '-m', 'affine:local', '--showalignment','0', '--ryo', 'SUMMARY\t%s\t%qab\t%qae\t%tab\t%tae\n', qryfa, tgtfa]
p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
best = []
topscore = 0
for pline in p.stdout.readlines():
if pline.startswith('SUMMARY'):
c = pline.strip().split()
if int(c[1]) > topscore:
topscore = int(c[1])
best = int(c[1])
os.remove(tgtfa)
os.remove(qryfa)
return best
def bestalign(qryseq, refdict):
best_score = 0
best_ref = None
for ref, seq in refdict.iteritems():
score = align(qryseq, seq)
if score > best_score:
best_score = score
best_ref = ref
return best_ref, best_score
def main(args):
replist = []
beforecount = 0
with open(args.annotation, 'r') as reps:
lastrep = None
for line in reps:
beforecount += 1
rep = Repeat(line)
if lastrep is not None:
if lastrep.chrom != rep.chrom:
replist.append(lastrep)
lastrep = None
else:
if rep.dist(lastrep) < int(args.mindist):
if lastrep.strand == rep.strand and lastrep.repoverlap(rep) < _MAX_OVERLAP:
rep.join(lastrep)
else:
replist.append(lastrep)
lastrep = rep
replist.append(lastrep)
sys.stderr.write("joined " + str(beforecount) + " repeats into " + str(len(replist)) + "\n")
sys.stderr.write("getting sequences...\n")
ref = pysam.Fastafile(args.genome)
for rep in replist:
rep.getseq(ref)
sys.stderr.write("aligning to reference repeats...\n")
refreps = {}
for rec in SeqIO.parse(args.repref, 'fasta'):
refreps[rec.id] = str(rec.seq)
for rep in replist:
assert rep.seq is not None
if rep.joined:
bestref, score = bestalign(rep.seq, refreps)
rep.rename(bestref)
try:
rep.score = int(score/10)
except Exception, e:
sys.stderr.write("annotation: " + args.annotation + " warning, could not compute score for: " + str(rep) + "\n")
print rep
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='Join and reannotate repeatmasker entities')
parser.add_argument('-a', '--annotation', dest='annotation', required=True, help="annotation: genoName, genoStart, genoEnd, repStart, repEnd, repName (sorted!)")
parser.add_argument('-g', '--genome', dest='genome', required=True, help="reference genome indexed with samtools faidx")
parser.add_argument('-r', '--repref', dest='repref', required=True, help="repeat reference sequences")
parser.add_argument('-d', '--dist', dest='mindist', required=False, default=1000, help="search distance")
args = parser.parse_args()
main(args)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment