Created
August 11, 2014 00:47
-
-
Save adamewing/58bbe8817b1662745bd8 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
''' | |
Join and re-annotate repeatmasker entities. | |
Expects input to be from UCSC public mysql server repeatmasker tables, using a command similar to the following: | |
echo "select genoName, genoStart, genoEnd, repStart, repEnd, strand, repName from rmsk where repClass = 'Other'" | mysql --user=genome --host=genome-mysql.cse.ucsc.edu -A -D hg19 -N | sort -k1,1 -k2,2n | |
requires: pysam (for Fastafile), biopython, exonerate (https://www.ebi.ac.uk/~guy/exonerate/) | |
''' | |
import sys, os | |
import argparse | |
import pysam | |
import string | |
import random | |
import subprocess | |
from Bio import SeqIO | |
_MAX_OVERLAP = 10 | |
class Repeat: | |
def __init__(self, record): | |
self.record = record.strip().split() | |
self.chrom = self.record[0] | |
self.start = int(self.record[1]) | |
self.end = int(self.record[2]) | |
self.repstart = int(self.record[3]) | |
self.repend = int(self.record[4]) | |
self.strand = self.record[5] | |
self.name = self.record[6] | |
self.joined = False | |
self.score = 0 | |
self.seq = None | |
self.oldname = None | |
def rename(self, newname): | |
self.oldname = self.name | |
self.name = newname | |
def join(self, other): | |
assert self.chrom == other.chrom | |
assert self.strand == other.strand | |
assert self.overlap(other) < _MAX_OVERLAP | |
assert self.repoverlap(other) < _MAX_OVERLAP | |
self.start = min(self.start, other.start) | |
self.end = max(self.end, other.end) | |
self.repstart = min(self.repstart, other.repstart) | |
self.repend = max(self.repend, other.repend) | |
self.name = self.name + "," + other.name | |
self.joined = True | |
def overlap(self, other): | |
''' overlap in genome coords ''' | |
if min(self.end, other.end) - max(self.start, other.start) > 0: | |
return True | |
return False | |
def repoverlap(self, other): | |
''' overlap in repeat reference coords ''' | |
if min(self.repend, other.repend) - max(self.repstart, other.repstart) > 0: | |
return True | |
return False | |
def dist(self, other): | |
if self.overlap(other): | |
return 0 | |
return min(abs(self.start - other.end), abs(other.start - self.end)) | |
def getseq(self, ref): | |
''' ref is a pysam.Fastafile ''' | |
self.seq = ref.fetch(self.chrom, self.start, self.end) | |
if self.strand == '-': | |
self._rc() | |
def __str__(self): | |
out = '\t'.join(map(str, (self.chrom, self.start, self.end, self.name, self.score, self.strand))) | |
return out | |
def _rc(self): | |
complements = string.maketrans('acgtrymkbdhvACGTRYMKBDHV', 'tgcayrkmvhdbTGCAYRKMVHDB') | |
self.seq = self.seq.translate(complements)[::-1] | |
def align(qryseq, refseq): | |
rnd = str(random.random()) | |
tgtfa = 'tmp.' + rnd + '.tgt.fa' | |
qryfa = 'tmp.' + rnd + '.qry.fa' | |
tgt = open(tgtfa, 'w') | |
qry = open(qryfa, 'w') | |
tgt.write('>ref' + '\n' + refseq + '\n') | |
qry.write('>qry' + '\n' + qryseq + '\n') | |
tgt.close() | |
qry.close() | |
cmd = ['exonerate', '--bestn', '1', '-m', 'affine:local', '--showalignment','0', '--ryo', 'SUMMARY\t%s\t%qab\t%qae\t%tab\t%tae\n', qryfa, tgtfa] | |
p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) | |
best = [] | |
topscore = 0 | |
for pline in p.stdout.readlines(): | |
if pline.startswith('SUMMARY'): | |
c = pline.strip().split() | |
if int(c[1]) > topscore: | |
topscore = int(c[1]) | |
best = int(c[1]) | |
os.remove(tgtfa) | |
os.remove(qryfa) | |
return best | |
def bestalign(qryseq, refdict): | |
best_score = 0 | |
best_ref = None | |
for ref, seq in refdict.iteritems(): | |
score = align(qryseq, seq) | |
if score > best_score: | |
best_score = score | |
best_ref = ref | |
return best_ref, best_score | |
def main(args): | |
replist = [] | |
beforecount = 0 | |
with open(args.annotation, 'r') as reps: | |
lastrep = None | |
for line in reps: | |
beforecount += 1 | |
rep = Repeat(line) | |
if lastrep is not None: | |
if lastrep.chrom != rep.chrom: | |
replist.append(lastrep) | |
lastrep = None | |
else: | |
if rep.dist(lastrep) < int(args.mindist): | |
if lastrep.strand == rep.strand and lastrep.repoverlap(rep) < _MAX_OVERLAP: | |
rep.join(lastrep) | |
else: | |
replist.append(lastrep) | |
lastrep = rep | |
replist.append(lastrep) | |
sys.stderr.write("joined " + str(beforecount) + " repeats into " + str(len(replist)) + "\n") | |
sys.stderr.write("getting sequences...\n") | |
ref = pysam.Fastafile(args.genome) | |
for rep in replist: | |
rep.getseq(ref) | |
sys.stderr.write("aligning to reference repeats...\n") | |
refreps = {} | |
for rec in SeqIO.parse(args.repref, 'fasta'): | |
refreps[rec.id] = str(rec.seq) | |
for rep in replist: | |
assert rep.seq is not None | |
if rep.joined: | |
bestref, score = bestalign(rep.seq, refreps) | |
rep.rename(bestref) | |
try: | |
rep.score = int(score/10) | |
except Exception, e: | |
sys.stderr.write("annotation: " + args.annotation + " warning, could not compute score for: " + str(rep) + "\n") | |
print rep | |
if __name__ == '__main__': | |
parser = argparse.ArgumentParser(description='Join and reannotate repeatmasker entities') | |
parser.add_argument('-a', '--annotation', dest='annotation', required=True, help="annotation: genoName, genoStart, genoEnd, repStart, repEnd, repName (sorted!)") | |
parser.add_argument('-g', '--genome', dest='genome', required=True, help="reference genome indexed with samtools faidx") | |
parser.add_argument('-r', '--repref', dest='repref', required=True, help="repeat reference sequences") | |
parser.add_argument('-d', '--dist', dest='mindist', required=False, default=1000, help="search distance") | |
args = parser.parse_args() | |
main(args) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment