Skip to content

Instantly share code, notes, and snippets.

@dwinter
Forked from chapmanb/gist:862240
Created April 21, 2011 04:50
Show Gist options
  • Save dwinter/933737 to your computer and use it in GitHub Desktop.
Save dwinter/933737 to your computer and use it in GitHub Desktop.
"""
Code writted to answer this challenge at Biostar:
http://biostar.stackexchange.com/questions/5902/
(This code includes improvements from Brad Chapman)
"""
class ORFFinder:
"""Find the longest ORF in a given sequence
"seq" is a string, if "start" is not provided any codon can be the start of
and ORF. If muliple ORFs have the longest length the first one encountered
is printed
"""
def __init__(self, seq, start=[], stop=["TAG", "TAA", "TGA"]):
self.seq = seq
self.start = start
self.stop = stop
self.result = ("+",0,0,0,0)
self.winner = 0
def _reverse_comp(self):
swap = {"A":"T", "T":"A", "C":"G", "G":"C"}
return "".join(swap[b] for b in self.seq)
def _print_current(self):
print "frame %s%s position %s:%s (%s nucleotides)" % self.result
def codons(self, frame):
""" A generator that yields DNA in one codon blocks
"frame" counts for 0. This function yelids a tuple (triplet, index) with
index relative to the original DNA sequence
"""
start = frame
while start + 3 <= len(self.seq):
yield (self.seq[start:start+3], start)
start += 3
def run_one(self, frame_number, direction):
""" Search in one reading frame """
orf_start = None
for c, index in self.codons(frame_number):
if (c not in self.stop and (c in self.start or not self.start)
and orf_start is None):
orf_start = index + 1 # we'll return the result as 1-indexed
elif c in self.stop and orf_start:
self._update_winner(orf_start, index, direction, frame_number)
orf_start = None
if orf_start:
self._update_winner(orf_start, index, direction, frame_number)
def _update_winner(self, orf_start, index, direction, frame_number):
orf_end = index + 3 # because index is realitve to start of codons
L = (orf_end - orf_start) + 1
if L > self.winner:
self.winner = L
self.result = (direction, frame_number+1, orf_start, orf_end, L)
def run(self):
direction = "+"
for frame in range(3):
self.run_one(frame, direction)
direction = "-"
self.seq = self._reverse_comp()
for frame in range(3):
self.run_one(frame, direction)
self._print_current()
def find_ORF(seq_string):
ORFFinder(seq_string).run()
def little_test():
find_ORF("ACGTACGTACGTACGT")
def bigger_test():
from Bio import SeqIO
a = SeqIO.parse("test-in.fa", "fasta")
for seq in a:
print seq.id
find_ORF(seq.seq.tostring())
if __name__ == "__main__":
print "\nrunning original test sequence..."
little_test()
print "\nrunning Brad's test file..."
bigger_test()
>t-1
TTATACGTACGGTAA
>t-2
TTATAAGTACGGTAAG
>t-3
TTATAAGTAAGGTAAGC
>t-r1
TTATAAGTAAGTAAA
>t-r2
TGAGTTATAATTAAGTAAA
>t-r3
GTGATTAATTAAGTAAA
>t-1-inner
TAAATGATTAATCAATTATAA
>EX720612.1
ACATATAAGACGCACCAAATACTTTCTCCCTTCCCACCGCCGTCCACGTTCTCTCACATTATACAAGCAT
CAACTTCAAACACATTGACAGAAAATGACATTATCAAATTCATCATATTACAAAGTTCTTGCATCCTTGA
AGGTGGTAGCTCTTGCGGAAATGCTACTAACAGATCACACCGCCATTTTTCACATGCGTTCGGGATCTTC
TTCCTCCTCTTCATACTCTTCATCATCGGCAGTTGCATCCTGGT
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment