Created
March 3, 2014 10:16
-
-
Save szabadkai/9322065 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
# | |
# snipet for align fasta files in the direction of the longest ORF. | |
# usage: | |
# python align_fasta.py <fastafile> | |
from Bio import SeqIO | |
import re | |
from sys import argv | |
table = 11 | |
min_pro_len = 11 | |
''' | |
snipet to search for Open reading frames in a fasta files | |
(http://biopython.org/DIST/docs/tutorial/Tutorial.html) | |
''' | |
def find_orfs(seq, trans_table, min_protein_length): | |
answer = [] | |
seq_len = len(seq) | |
for strand, nuc in [(+1, seq), (-1, seq.reverse_complement())]: | |
for frame in range(3): | |
trans = str(nuc[frame:].translate(trans_table)) | |
trans_len = len(trans) | |
aa_start = 0 | |
aa_end = 0 | |
while aa_start < trans_len: | |
aa_end = trans.find("*", aa_start) | |
if aa_end == -1: | |
aa_end = trans_len | |
if aa_end-aa_start >= min_protein_length: | |
if strand == 1: | |
start = frame+aa_start*3 | |
end = min(seq_len,frame+aa_end*3+3) | |
else: | |
start = seq_len-frame-aa_end*3-3 | |
end = seq_len-frame-aa_start*3 | |
answer.append((start, end, strand, | |
trans[aa_start:aa_end])) | |
aa_start = aa_end+1 | |
answer.sort() | |
return answer | |
def trace_stop(orf_list): | |
max_len = 0 | |
for start, end, strand, pro in orf_list: | |
if len(pro)>max_len: #borzaszto szofisztikalt algoritmus a legnagyobb ertek kivalasztasahoz. | |
max_len = len(pro) | |
(max_start, max_stop, max_strand) = (start, end, strand) | |
else: | |
pass | |
max_len = 0 #erre lehet, hogy nincs szukseg, mondjuk artani se art... | |
return(max_start,max_stop,max_strand) | |
def trace_stop_context(record): | |
orf_list = find_orfs(record.seq, table, min_pro_len) | |
(max_start , max_stop, max_strand) = trace_stop(orf_list) | |
if max_strand == -1: | |
return(record.seq[max_start-6:max_start+3].reverse_complement(),'-1', max_stop, max_start) | |
else: | |
return(record.seq[max_stop-3:max_stop+6],'+1', max_stop, max_start) | |
handle = open(argv[1], "rU") | |
for record in SeqIO.parse(handle, "fasta") : | |
try: | |
(sequence, strand, max_stop, max_start) = trace_stop_context(record) | |
if strand == '-1': | |
print('>' + record.id + '\n' + record.seq.reverse_complement()) | |
else: | |
print('>'+record.id +'\n'+ record.seq) | |
except: | |
pass | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment