Skip to content

Instantly share code, notes, and snippets.

@diallobakary4
Created August 10, 2016 09:22
Show Gist options
  • Save diallobakary4/2d1ff8d4dd502e36ba80abb6b160fd06 to your computer and use it in GitHub Desktop.
Save diallobakary4/2d1ff8d4dd502e36ba80abb6b160fd06 to your computer and use it in GitHub Desktop.
# -*- coding: utf-8 -*-
# Protein Translation Problem: Translate an RNA string into an amino acid string.
# Input: An RNA string Pattern and the array GeneticCode.
# Output: The translation of Pattern into an amino acid string Peptide.
from collections import defaultdict
import itertools
# Codon table used for translation: each key is a 3-letter RNA codon over the
# alphabet A, C, G, U and each value is the one-letter code of the amino acid
# it encodes. The three stop codons (UAA, UAG, UGA) are mapped to a single
# space character rather than to an amino-acid letter.
genetic_code = {'ACC': 'T', 'GCA': 'A', 'AAG': 'K', 'AAA': 'K', 'GUU': 'V', 'AAC': 'N', 'AGG': 'R',
'UGG': 'W', 'GUC': 'V', 'AGC': 'S', 'ACA': 'T', 'AGA': 'R', 'AAU': 'N', 'ACU': 'T',
'GUG': 'V', 'CAC': 'H', 'ACG': 'T', 'AGU': 'S', 'CCA': 'P', 'CAA': 'Q', 'CCC': 'P',
'UGU': 'C', 'GGU': 'G', 'UCU': 'S', 'GCG': 'A', 'CGA': 'R', 'CAG': 'Q', 'CGC': 'R',
'UAU': 'Y', 'CGG': 'R', 'UCG': 'S', 'CCU': 'P', 'GGG': 'G', 'GGA': 'G', 'GGC': 'G',
'CCG': 'P', 'UCC': 'S', 'UAC': 'Y', 'CGU': 'R', 'GAA': 'E', 'AUA': 'I', 'AUC': 'I',
'CUU': 'L', 'UCA': 'S', 'AUG': 'M', 'UGA': ' ', 'CUG': 'L', 'GAG': 'E', 'AUU': 'I',
'CAU': 'H', 'CUA': 'L', 'UAA': ' ', 'GCC': 'A', 'UUU': 'F', 'GAC': 'D', 'GUA': 'V',
'UGC': 'C', 'GCU': 'A', 'UAG': ' ', 'CUC': 'L', 'UUG': 'L', 'UUA': 'L', 'GAU': 'D',
'UUC': 'F'}
def seq_to_codon(seq, n=3):
    """Split a sequence into consecutive, non-overlapping chunks (codons).

    Args:
        seq: Any sliceable sequence, typically an RNA string.
        n: Chunk length. Defaults to 3, i.e. one codon (triplet) per chunk.

    Returns:
        A list with the slices of ``seq`` taken every ``n`` positions, in
        order. When ``len(seq)`` is not a multiple of ``n`` the last chunk is
        shorter than ``n``; an empty ``seq`` gives an empty list.
    """
    return [seq[start:start + n] for start in range(0, len(seq), n)]
RNA_seq = "AUGGGAAUGACUGUCCUCAAUCGUAUCCCCGGGCUAACGUCUACGAUGGUAAAGCCCAAUGCAGUCCCAAUUCUCUCCAACAGAGGUACGAAGACUCCAAUUUAUAUUAAUCAGUUACGAAGCAGCCGGCAGUUAUCCUGCAUGUGUGAUUAUCACUGGGUUGCCUAUGGGAAGCUAUGUGUUAUCAUGACCCGCCCAGAUAGCUCCAUGUCUUCCGUAGCGGGAGAAGCUUGGAUAGCUGCAGUCGGGCCGUUGGGCCAGCGCGGAACGGAUUCUGUACUAAGAUGCACUGUAGGAACUUCACGGCCUGUCACUCUCUGGCAUGUCGGGACACAUACACAUAAUGCAAAUCUUCAAACGAAUGUUGGAGCGGUUCGAGAUCUACAAUUGCUGUUAACGAUUCGAUUGCGUUUUGGCGGCACAGGAUAUGCCCUGGGCGCGUGUUCCUCGCCGACUCGUUUAACUAAAAGUGAACCUGGCCUUGUGGUGUCGCAUGCCCUCGUUUCGAGAUCUGGCGGGAUGCGCACACUGGAAGCACGCAUGACUCUGCCUAGAAAUGCCGAUAGUUGCUUUAUCCCCCACAAAAUAGCGUGCCCGUCGCCGAGCGCGCUAGAAACAGUCACCGAAACGUGGCGCACCCCCAAAGCUAGGACCAAGAGCGCCCGAGAGAAUGUUCACACCAUCGUAGGCGAACAAAUUUACCACGAUAACGGUGCACAGCGUUGGAUCUCUCCGUCUCAGGCAACCAACCAACAAUUGGCUGGAUGGCAGGGUAACGGAGGGUGUACCUAUACAUUAGUGUGCAUUUACUGUUUACGAAGUGGGAGUGGCUUGGGCGAACUCAACCAAGUAGAGGCGAGCUUCUCGUCGCUGCCUCACAACGCAAUGGAGGUAGCGUUCAACGAAUGUGGCAAUGACCGGAGUAGCGAAUGCGCCAAUGUGAGACCACCGCAAUGGUCUGGGUUGAUGCACAGAGUGUCGCUUGGUCUGGGGGAUUAUAUAAUGACAGAACAGCAAUCGGGGGUUGAUACAAAUUUUCUUCCAGGGAACGACGGUGUUAAGAAUAUAGAGAGAAUAAAGACCAGGGACAGUGUGGUUCUCGGGAGGCCUACGUCGACCCCGAGGUAUCAUCGACGCAGACCAAAAGUCCACUACAUAGAUCGCAUACCCAUUGGAGUUCAUGAGCGCGCGGUUCGACGCAUGGCAGCUAGCACGCUUGUAUGCUCAAACCCUAUGGUUUAUCGGCUCGGAAUGAGGGGAAUCGCGAAAUUGAGAUUGGUGCGAAUGUGGAUUGACUUUGGAUACGGUCCGCGCGCACGCAUAAUCUUGGGACCACCUCUAGGCCUCAUCAGCAUCUCGUCCGGCGAUGUGCUUAUCCUCAAGCGAUCGCAAGCAUGUGGGUCCCUCUGUUAUGCGAUUUUGAGCGCACCAAUAGGGACCUGGUAUGAGUUGUUUUUUCCUACCCUAAUCGAGGAACUAAAAGACACGCACACGUUGAACCCAGAGACACGUAGCCUCUUAAGCAGGGGACCGGGGGAGGCUUUCAUUUCAUGCCGCUCCUUGGCGGUACGUCGAAUAGGUUACUGCACUGGCACCACCCCAACUCCACGGGUUAACUACCGUGAGACGACCCAUAAACAUUAUACUACUGUGCGCUGGAUCAACCGGACGGUAUGUCGUCCAGACCAAGACUUCGGCGCUUACGCCAACAGCGCGGCAUUUGUUGCUAGGAAAUCAGCCCUAAUGCGGCCGCGGACUUCAGUAGUGCCACAUUAUCCCAAUUAUUCUGCACGUCAGACAGUGGAGACUUUUGGCUUGUCCGGGCGGACACGUCUGAGCCUAAAGACGCAGUACCGCACAAACUUUCCGGCCGGUCCACUAUUAGUUAUUAUCGCGGAAACGUGUCAAAAUGGGGUACGGGGGAAUUGGUUGAUGGGACAGCCAGACCAAGCCACCACUCUAGAACAGGGCUACGAAGAUCCCAGCAGG
AUAACAUAUGCGUUACAUCACGAAGUCAUAUCAGAAGACAGAGAGACCACCAAUUGUGGGGAAAAACUCAUGGGACAUAGCCUAGCUUUACAAGCUCACCAUCUACAUCCCCUAGUAAUUCUGCCUGUCCGGUUUACUGUCGAUCAUGAUUUGCAUAGACUGCACUCAACCACCUACAAUGAUCCAGAAAUCCUACGCCAGACCCCUCUACUAGGAAUGCUGUUCCGGUGCCCUGGUCUCAUCAGCUACGCGUUACCACUGGGGAUGAUCACAGCUAUCCCCUAUUAUGUCUGCCAGCAGUGGGGAGGAAACGGGAUUUUUGUCUCAAGAAUCUCAUCUGACGACGGUACCGAGUUUAGGGGUCCUCAUGAUUUCUUAGGCAGAGGCAAGUUCCCUAGUCCUCGCGGUCUGUUAAUCACUUUGUACGUCGUAGGUGGCGGGAACUUAGUAUGUAUCCGGUUGCGCGGGCCUCAGGAUAUAAUUAGCAAGGAUCCACGACUCUUGGCUGUAAGAACGUCUUCAGGUGGCUUGUCAGCAAUCACACUGCCUGCGUGCGCAUCGUUACUGCGGCCAACCGUAGUUAUUGGUGGCUACAAUACAUCGUACGAUCCCGCUCCCAAGGUUACAGGCCGGGGCUACAUACGGAUAAGUGCACGUCGCGAAGUGGGGCUGAUUCGUAUCCUUCGUGAGUUGUCUGAGACGGGGACACGUGCCAAUCACGGAAAAGAGACUUUCCAUCUGAUGCUCUUCAUGCAAUUCAAGGGGGGCCGUCAUCACGGAGGGUUCGAGGUCUUUGUAGAUACCCACACUCUUGAACGUCCCGCCGCAUGUAUCUACUUUAUGCAGAGUCGGCGCCCUCCUCUCAGACUAUCGGUCCCCUGCCCAAUCUUCCACCAUCGAUAUAUGGGCAUAAUUUGUGAGGAACCCCGGAAGGUGUGGCGUUCUAUUUUGUAUCCUCUCUGGCUAUUCUUCACGGCGUUAACAACCGUGGGAGUCGACAACAUAAAUCGAAUGAGAGGAGAGAACGUCAGGGACCGUCAUUCCGGGUCUUGGCAAAACCUAAUACGUUUCAGUAUGUCUAAUUCAAGCCCCUCUUCUAAGAAAGUACCUUCAGAGACAAUCCUUUGUCCGAGAAAUAUUGUUGGCUGCUACGCAUGCGUGCCUUCCAGGUUUCAGGGACUUUGGGGUCAUCACCGAGUUUUUGCACGGUAUAGCGCUGGACCCAUACCUAUGUCCACUUCAGUUAAGCAAAACUUGGGUAUGAGGUUGGGUUGGCAAGGCUGGCCAGGCCCUAAGCAACUAUUUCUUAAGAUAUACGGUGAAUCUUGCUGGGAUGCUGACUUACCACUCGGUGCGCAUUCCGCUUCGUGGCACAUCACAGAACGAGCACACAAUGAUAGGGUCAGUGCACCAGCGGGUGCUUUUCCCGUGGUGUCAAUUGGUUGGCUAGAAAAGUGUUAUAUCGCGUGGGUCGGCAUAUCGCGGUAUGCUGUCCAUGGCAAAAGGAGGAUGACUAGACGCCCCACGCCAGGUCCAGGACGGGGUUUUUACCUCUACAAUAAUGAGCGCGAGACCGCGCUGUCUAUCCGCUUCUCGCCAACGACACCUACGUUAGAACAGUUUCCCAGGUACUGUUGCGAAGGGGGGAGAAGCUCUCAGAUGGCGAGCUUGUCUAAGAGGGAAUGUUGCUCCACGCUUCCCGGAGGGACCCUGCUUAGAGGAGUGCCAAAACUUCUACGCAAUCCAGUCGAGUGCCUCAGUGCAUGUGUAAGGACAGAAAUACUAACUAAGACAUAUCGCUAUAGCGCUGGUUACGUGAUUAUCCCCAGCCGGGAACUUGAUCCAAGCUAUGGCAACGUCGGUCCGGUGAGGAACUCAUGCGGUAACUGGCAGCGCUCUUACGGGAGAGUAGGCGCCAUACACGAAAGCUUUCCCAAGCAUGACACAGAAAGCGUACGGGGGGCCCCUGAUUGUCGUCGUGUGUGCCUCCGACGCUACCU
UGCUGGCGUCGUAGAGGUGUGUAAUAGUAACGGCGAGAUCUAUCAGAACCAACAAAACGACUGGAUUCCCGGGUCAUUCAUCGUACGACUACCGACUACCCGAGUAACCUUGAGUGUAGACCGCGCGGUUUGUGCUACGUGGUCGAAUCCAGUUCCGCAAACUGCAUUUGAGCAGGGCAUUGAACACCAUAGAAUAUGGUCGGAGAGGUGGCCGUGGAUAACGAUGCGGAUUUGUCUUUCCGGUACGGUGUACUUAAGCCUCAGUGCGCUUAAACAACAAAGCGUACACAUCAGGAAACACCACAGGUAUACGAAAGGAAACCCCUAUGCCAAAUGCUCAUGUAUUCGGCGCUCUCGUGCCCGAUAUGGGAAGCCCUUAAUUGGCGUUUCGCAUUUAAGAGAAACGCGUCAGAUGUUGAGGUCCGCAAUAGAGCUAUUUGCACUACGGGGUGCUGGUAGAAAGGAUCGAUCCGACGGUGUGCAGUUGCGCCAGGCUAAAGUCGACGAGGAGUUGGCAAUACUACUAGCAGACAGAAAUACUAGUCUGGACGGCGCGCUAUUGACCGUGCCUAUCCCUAUUCGCCCCCUUAACUUAGGUGGCACAUACAGCGACUUUUGGCGUGGACGUAAUUACAUCUGGCCUAGAACCGAUGAGUCCGCAAUGACAUUCAUCGUCAACCCCUCACCGUUCACGAACAGUAAGAGAAAGCCGUACCGGGGUGUUGUGGCGAAAAAUCAAUUACCCAAUGACUUCACUGCUUUUGGAGCCUCGCGAGACAAUGUUGGGGUUGCGCUCCUACGGCCAGUGACAGGUAAUCUUUUCUUUGUCCAUUCUCUUAAUUGGUCAGCAACUGAGCAAGAGAGAAACCAACUCUCCCAUCGUCCGAACGAUCAGGAUAAUACGGAGCUUUUCAUGCGCACUAGCCGAGCAACGUUUUGCCUCCGGGACGGUCGGAGGAUCCCCCACCAUUGUAAUGUGGCUACACCAGUCCUUAGAACCAUCGGAUCAAACGAUGGUAGAUACGCUCCGCCCUAUUCUAGGAGAAGGGGAAUUCUGCGGGCACGCAAAUAUUCUGUUGGGGCCACCCGCGAACUUUGUGAAAGCGCUUGCUCUCGUGUGAACGAGCGUACGUAUAUUAUAUUUGCUACUUGCCUCCGUAAGCUGGUAUGGCAAGGACUUUCCAAGGGUAUUUUAUCAUCCACUCCGAGGCUGGCCGUUCUUCUCCGCCUGAAUAAACUACUCUAUUUAGAACCCAAUGAGCAUUCUGGAAGGAUGCUACCGCGAGCCCUAGAAGCUUACGUGGAGGCCGUAGCGGCUUCCUUAUCGCAAAAACCCACAAGACCCCCGAAUCAGCGUCCUUCAAAUAUCCCGAUCUGCGAUUCCAUCCACCCGAAGUUUAGUGUAAAGACAGUUAACCACGCGUUGAGGAGCGUGGGUAUGGUACUGUCCAUACUAAAUGGGGGCGGCGCAAGCGCGGCUGCAUGUCAGAACCUAACAACGCCUGUUACGCCCCGGGGUAAGAUUCGUUUCCUUCGGAAAUUCUGGACGUCGGUACGUAGACUCCCCACUAAUACCAAGUGGAGUCUUACUGAUGGGUUAGGAAAUUCAGUAUUAAGGCGGAAUUUAUUGAUUGGGCGUGUAUACAUACACACCAUCGAAAGCCCAUCCCGGUAUAUACUCUUGAACAGAAGGAAGCCUGAGCGGACUCCUACCAUCUGUGGUUAUGUCGCUGUGCUAUGCUGUAGCUAUUAUUUCGUCGUUGUCGUUUGCAAAGAUGUAACUGGUGGUUCACAUGUGGGCCACAUGGGUCAGGGGGAUCACGAUAAAGCGCGUGGCGAAACGCGGUUGCCUUCCAAACUGCGAAGGGGACGAAAUGUAUAUCCGCCUUCUGAUAACAUUGAUAUCCGGCAUCAUUGUGAUGGACUGAACGCGCGAGACGAGAAUGACAGCCUACGAUUCUCUGGCUUAGAUCUUUCUCCCCUGAUCCAAGAUAUCCGAA
AGGGCGCCGGUGGGUUGUACGUAACGCAUACUUUCUAUAGGCGUUGUACAGGGGUAACUGCCUGGGGCGGUUACCGUGCAAGGGAGUCUAUCAGAAGGAAUCGGAGUAAUACAGGCACUUUUGUACUACUGAUACAGUCACGAAUCCGCUUUAAGCGCUCUCUGAGGUUAGAAUGCUCCACCCGAGGAAGACGCAUCAUUGUACGCUACAAACCAACCGAUUUCGAUCAUUUGCUUGCCCUGCUACGAGGUGCAGAGUCUGGCCGGCUUCGUCUAUGUCAAGCCCCGCUCGGUUAUCUGUGGACGGUGUGUAACAAGUCUUUAGUGAGAAUCUUCCCGUUACCGCUCACACGUAAAGCAUCGAGCAGGAAUAAUAGAUUAAGACUGAUUCCUAACGAACAGCUAACCGGCGCCAUCAGCGGCCCAUCCAGGGAGGAUAUUCGCGAUGGAACCCCCGGGUCUUCCGGCGUCGUAGAAUGCGAGCCGGCCAGGUCCCAGAAUGGUUGGUGCAGAUUCCGAUUUUUCCAACGAGUACGCGCUAUCCCUGUAAGUGGCCCAAGGACUCGAAAGAGCGUCCCCCUGACUAAAAGCGGAGUCAGUUUACAGGGACCGGGUGAGAACGCUCGUGGGUACAAACCCUUAUUACGCACUCCAUCCAUGAUGUCUCUGAUCACGGUGGUCGGCUUGAUGCACUGCCACUCGGUUGAGGAUGGAACCUACAGGGGAGUUGUCAAUUGUGUAGGAUCGGCAUGUAAGUCCCUAAUAAGGCAAGUGCCACUAGGGUCCGCUGACGCUCGGGCCCCCAGCUGGGCGGGCCAAGAAAUUUGUAUUACUGGGUUCCCUUUUCCGCUAUCGUCGAGAUACAGGCGAUCGGGGUGUCGUUCAGAAUGGAUCGAACGCGAAAGGUUCAAAUCGCUCAUGCAUGUCCGUCGGCUGGUAUUUACUAGGGCGUAUAGACCUCCAAACACGGACUGCAUGUCUCGAGCAUUGCAAACUGGCGUAAGGUACCUAGCACAUGGUUACAAGAUCCGGCCACCCGUCCACCAUACCACGAACGGCCGUCUAGUUUCUCGACGACGGUAUAGCGGCUGUAUAUCAACCUUGAAGGAUGGAUUAAGACGAGUGACGAGCAUUGACUGUCGAGGUCAUCAAGGGCCGGUGGACCGGAGACGGGAGUGUCAGGCCUGCCUCCAGGCAUUAAGAGUUAGGUUACCGAAGCACGUCAAGCUCGCUUUAUCACCUCCAUCUAUGCGGCCACCUAGAGGUUCUUUCAUCCGACGGUGGCGAACAAGCUACCGGGAGGUCAUCCAGAUGAGUCAUUUACUUCGAACGUUGAUUUGGCACGGCGAAUUAAUAUCGUUAACGCAAAGCGCGGUACGGGCAAUCAUAUACAACGCUGCCCGAGAUACCCACGUCGCCCUCGCGACUACCAUGGGCAGGCGGUGUCGCUUCAAGCACUCACACAUUUUAUACCGCAAGGGUUGCGCCGGAAAUGCCUCGGGACCGAGCAACUCACCCGAACAACAGGGGUUACUCAGACCGCACAGGAAACUGAGCAAGAGAAAUCAUUUUGUUAGUAUAUCACUAGGGGUUCAGAAUCGACGGCGUCGUACCAGUGGCCCCUUAUGCACGCAGCUGGCACAUCGGCUUCUCGCGUUUAGGCGUCCGAGUUUUGGACAGGCAAGCAACCAACUGAGAACACGAAAGGGGCGAUGGCUCCUACCGCCACCAGCCGUUCAAGCGUCCUGGAAGUCCUGGUUUCUUCAAGGAUGUUAUGGGACGCACGUUAGUCGCUUAGUAUGUGGCCCUGCUCUCCGUCUUCCUGGGCUAACAAAACUGCAGAUAUUAAUUCACAUUAGAGUCGGCCGUGUUGCAGUCUUAGGUAGGCUUGCUGCCCCCCCGGUGAUAAACACAUCGGGCUGGGGGGCAACAAUAGCUAUCUUGCUUUGUGCAUGCGAUUCGCACACAGUCCUUACUAGGGGGCUGUCAGGUACACUAUUA
CACCUUACGAUCACAAAUGGACGUCUCUCAUCGGGAUACAAAAACCGGGAUUGUAGUAAGGGCACCUUGGUCGAAAACCCAGAGAGCACUAAAACUCAGAGUCGCACUCAAGUCGUUACGGAGCUUCGCGGCGCGAGCACAAUUGUUGCGGGUACGGGUCGCUCGAUAUCUGCACCUGCCCGAACAAGCAUAAAGUUGCCUGAUGGGCGCCGCGUCGAUUACAAAUACGAUUGGGUUCUUUUGUAUACCCGCGUGUUAUGGUUGGGAGGUUUUAAUUGGCCUGCACCUACACCUUGCGCCCACGCUACCAUUAGCGCCCGACGCAUCCCUCACCAUGAACCACUAUCCCAGGCUCUGAGCACGGCAGAGAAUGAAUCAACAACCUCGCAGUCUUACGGGCGCAAACAAGUUCCACCGCUCCGCACCUCUGUGGCUGCGGUACUGAUUAUAUACUCGUUUUGGGCGUCAUCACAACACACUCGUUCCAACUUCCGGACAUUGACGCGCAUGGUCGUGGCUGUUGCCAGAAUGCCUCCUACGGUCGCCCGGCAACGAGACGGGGAACUGUUGGCCGGUCCAACUUCUCGUCCCCUACGCGGGUUGUGGCAGGUGCUCCACCACACCGGGUCCUACAGCUGGCUUCAAGUGUUGCGCGACUUGAGGCGUUUUUGCACUCUCUCGGUACCUCGCCCUACCGCCGCAGUGACCUCGGCGUACGAUUUGCUACAUCCCGUGACGCCCUCUCUACCGACGCCGUUUGUCAAUAUUCUUCCGGACACCAGCAUACGGUGCUGA"
def codon_to_protein(RNA_seq):
    """Translate an RNA string into its amino-acid string.

    The sequence is cut into triplets by ``seq_to_codon`` and every triplet is
    looked up in the module-level ``genetic_code`` table. Translation does not
    halt at a stop codon: a stop codon simply contributes whatever value the
    table stores for it.

    Args:
        RNA_seq: RNA string to translate; its length should be a multiple
            of 3.

    Returns:
        The one-letter translations of all codons, concatenated in reading
        order.

    Raises:
        KeyError: If a chunk (for instance a trailing partial codon) is not a
            key of ``genetic_code``.
    """
    # str.join assembles the peptide in one pass instead of repeated "+=".
    return "".join(genetic_code[codon] for codon in seq_to_codon(RNA_seq))
#Function testing
#print codon_to_protein(RNA_seq)
def protein_to_DNA(Prot_seq):
    """List, for every residue of a peptide, the codons that encode it.

    Despite the function's name, the codons come straight from the
    module-level ``genetic_code`` table, i.e. they are RNA codons.

    Args:
        Prot_seq: Peptide given as a string of one-letter amino-acid codes.

    Returns:
        A list with one entry per residue of ``Prot_seq``; each entry is the
        list of codons translating to that residue (an empty list when no
        codon does). Multiplying the lengths of the entries gives the number
        of RNA strings that encode the peptide.
    """
    # Invert the codon table: {'C': ['UGU', 'UGC'], ...}
    aa_to_codons = defaultdict(list)
    # .items() (unlike .iteritems()) exists on both Python 2 and Python 3.
    for codon, amino_acid in genetic_code.items():
        aa_to_codons[amino_acid].append(codon)
    return [aa_to_codons[residue] for residue in Prot_seq]
def all_Prot_RNA(protein):
    """Enumerate every RNA string that translates into the given peptide.

    Args:
        protein: Peptide given as a string of one-letter amino-acid codes.

    Returns:
        A list of RNA strings, one for each way of picking a codon per
        residue from the choices returned by ``protein_to_DNA``. Its length is
        the product of the number of codons available for each residue, so it
        grows exponentially with the peptide length.
    """
    codon_choices = protein_to_DNA(protein)
    # Consume the product lazily: no intermediate list of tuples is built.
    return ["".join(codons) for codons in itertools.product(*codon_choices)]
def DNArc(DNA):
    """Return the reverse complement of a DNA string.

    In DNA strings 'A' and 'T' are complements of each other, as are 'C' and
    'G'. The reverse complement is formed by complementing every symbol and
    reversing the order of the symbols (e.g. the reverse complement of "GTCA"
    is "TGAC").

    Args:
        DNA: DNA string over the upper-case alphabet A, C, G, T.

    Returns:
        The reverse complement of ``DNA``. Symbols other than 'A', 'C', 'G'
        and 'T' have no complement here and are left out of the result.
    """
    complement = {"A": "T", "T": "A", "G": "C", "C": "G"}
    # Walking the input backwards produces the result already reversed.
    return "".join(complement.get(base, "") for base in reversed(DNA))
def DNA_to_RNA(DNAstring):
    """Transcribe a DNA coding strand into RNA.

    An RNA string is formed from the alphabet 'A', 'C', 'G' and 'U'. Given a
    DNA string corresponding to a coding strand, its transcribed RNA string
    is obtained by replacing every 'T' with 'U'.

    Args:
        DNAstring: DNA string to transcribe.

    Returns:
        A copy of ``DNAstring`` in which every 'T' is replaced by 'U'; all
        other characters are kept unchanged.
    """
    return DNAstring.replace("T", "U")
def RNA_to_DNA(RNAstring):
    """Convert an RNA string back into the DNA coding strand.

    This is the inverse of transcription as done by replacing 'T' with 'U':
    every 'U' of the RNA string becomes a 'T'.

    Args:
        RNAstring: RNA string to convert.

    Returns:
        A copy of ``RNAstring`` in which every 'U' is replaced by 'T'; all
        other characters are kept unchanged.
    """
    return RNAstring.replace("U", "T")
# Peptide Encoding Problem: Find substrings of a genome encoding a given amino acid sequence.
# Input: A DNA string Text, an amino acid string Peptide, and the array GeneticCode.
# Output: All substrings of Text encoding Peptide (if any such substrings exist).
DNA ="GCGGTTTTGCGCATTAATACCGGTCCGCGCCCGAGCGAGAAACGGGAGGGCAGTTCAACGGGACTTTTGTGGTTCCTAATTATTTGAGATAAGTAATCTTCCTGCAGTTCGTGCATGGCGTCCCAGAACGGTTTAATCAGTGATTGTAAACTCATGTTTACTACTGTGCAAAACTAGGACCATCTCTATCATTGAAGCACGGTCGAAGCGGCCACATCTGTGGTATCGCGAGCCAGAGCCCTAGGCGCGAAGTTTTGCATTGCACCGCGGTCTCCTAGGAGGTCCAATGGGCTGTGGCAGGCCAAGATGCGTACACGTATCATGCGTCAAAAGTGCCTCATTAGGGTAGAATGGTTAACGGCTTACTTCGCCCAGGAAAGGCTCCGTTAGGCCCGTGCCTATCAGTACCTCGTAAGCGTGTTCCCCGAAGCTTTACGGCAATTTGGCCCTGGGAAGAGACTGTCGTGTCCACGACGATCCCTCGAATTACATTGTTATGCGCCGTAAGGTTCCGAATTTACAACGGCGGTGACGCTGCAATTAGCCAAATCAGTAGGGGGTGGTGAGATGCGGATCCGTTTACCCGCCAGGGCTGGGCCGCTTGGCATCATGTGAAATTGCATTGCGTCACCAACCGATTTTGGATTAATGCTTGATACCAATGGTTAGAGTCGCAGTTTGACACATCGAATGTCACGGGAGACTACCGTTGCTTTATTCGAAGGATGAACCTTGGTGAATACGTGCTCCGAGGCATGACAACTAAAAGGCGATAACGTCATGCACCTGGGCCGACTATTGGCTACGGACTCACAGACTTTTTAAGTCACGGGAGTCCGGGTGTACGGGAGGAATCGTTTATACTCCGTACATCAGGCTTGATTGCCGCGCCAGGCCATCACCAAAGTAAGGCCGCCTAATCTTCCCCGACCTAAATCTGTAGGCGATGCGATGCAATCCGATTGTGGAGTAACGCGTTCGCCCAGGCCTAAATCCGTGGGTGATGCGATGCAAGCCGCGACGCCCTTCAGTCTGCACTAGACACACGTGTGAGCCAAAAGTAGGGCGGTTTAAGAGGCAAAATATGGCCACACCGCCACCGTTCATTTAGGAACGCTGTCACTAATCGCTTCTGGAGTGTTTCTCCATCCGATCTCCGTGTTAGTTGATGAAAGGTAACCTGTTATTCAATATAACGTTGTCAGCTTCAACGGGGAGCAAATTGCTAGCTTGTCATTGTGCCTCAATGTATAAGTGTCCGACCAGACTGGAGCATACCACCGGCGGCTTATACCCACTACCTTATCCTCTTCACTGGTACATGTAACTATGGTAGCACGGGAGTAATCAGCGCTCCATACAGTGTCGCTCCGCTGATATTATAAAGCTGTAGGAGAAACTTAAACGACACAACTTAGATAACACCTCTCGGGCAATGAATGGCGGCCAAATACCAGAAGCAGCTCCACTCGCTGACCCAACCAAATAATATTGTACTCTGGTCGAGCTCTTGTCCTGTTTGAAAACGATGGGCGGGCGCGTTTCTGTAGTATATACATTGGTTGGCGGCTGTATTGCATTGCGTCCCCTACGCTTTTGGGGCTGTTCTAATACTTCCCGTCTGTTCAGTTGAACGGAGGTAAATTTTCCAGATTAGGCAACTGCCCAGGACCGCGCGCTCGCATCAGGAAGGACAGCATCCGGCCTCCGCAACCAAAGAGTGTTGGCGACGCTATGCAACATACGGTGCTTCGACGGGAGGATCACCGTTTGGCACTGCCGAGCCAGAGAATAGTAGCGGCCGGCCAAGAGTACGCATACACTACCGGATGTGTGGTCTGATGTAAAGGTTGGGCTGAAAGAAACATCAGAGGTTTCCTATCGTTCAAGCAAATTCATATCAGAATGTGTCCTGAGGAGAGCGGGCAGGCGACTTATGACAACATGTGATCGCTAGGCTGACTCCTCGAGGGCGATAACGTGTGGCTTCTAAG
CCAGATTGACTCGAGAGAGGGCAAGCACTAAACGAAGTGGGTGCAGATGTGACTGCATGGAACTAAAATGGAACTAATAGGATTAGTGGTGCTGAGTTTTCCCAGGTGTACCTATTGAGGCAGGATTCCCTGTGTCTGATCCGTCTTGCCATGGTCAGGCCGAGTCCATTGTGTCCTAAACTCTTGTTTACTAGTTCAGGCAATGCTATCTGCCGAATGGTGAGGTCCGCCGCCTTGCCATGTGGGTGTACGCGATTACCCGCTCGGTTACCGCAGTCGACATGCAAGTCGCGTCTTGCGATACAATATCGCGCCGATGAAAAAGACCCAATATTTTCCCCGCACAACTTCCATATAGGATAGAGCAGCTAGTTGATTTAGAGTATGAACCGAAATCTGTCGGAGACGCAATGCAGCAAATGCCAATCCTCTGAGAAATTCACAGTACCCCGGTTCCCGAAGAGTGTAGGCGATGCAATGCAGAAAGAAAAGTTCCGACCACGGTCGGTTAAATGCATATTCGGCGGGTCGACATTTGAGCCCTGTTACGATTTCAAGCTCTGGACGCGGCCGTTAGGCAGACGGAGGTACGAAGCCAGCCTTGCACCTGCTCCATCCTGTTTGGTGGCAGGGTGCTGAAGTTAACATATAAGGGGGTCAATCGTGCGTCTACAGGTCGACCGAGTCAGCGGATCTTCAGGCCACTACTTGATGGGTGAGCTCTTCATTGTAGGCGCTTCTACTTGCAAGGGTGCGTATCGGTATACGGCTGCATCGCGTCACCAACGCTTTTGGGTTGGATACCCGCTGGCGCGCATGAAAGGATCCCCTTCATGTCCAATGTCAAAGGGACATTTCCACATATAATCTTGATTGTGACACGCATCCAAAAATAAAAGAGACGTCACTTCCGAAACCCAATACCTGACAAAATCTATACTTCAAGTTGACAATCTGACTAGAAATTATTGGGAAGAAAACCCGTTATTATGGAGGGTCCGTGGCGAATGTATGGAAATTTATTAATCAGCAATACCAGTGGCTTGAGCGACTGAACATCTTTGACCCGTCACCGATTCGCCGGAACGCCGATATTCGAAGGGGTTAGGTTGGATGCCTTCCCTCCTACTGACCCCAGCCGGCAGCGCAATGCAGAGTGTGCCCTGAAGTAACTAGTATTGAGGGATTGTATACCCTATCGCGACAAGAAGACAGAGAGCGAATATAAGCCAAGGCTTGAACACCCCTGGCCGCTCAGATGCTCGATCGTTTTTGAAACGTTGCATCGCGTCTCCAACACTCTTAGGTAACGCATGTAAGCGATGAGATCGTCGCACCCATCTATGATTCGCTTCACGCGCGCATAAGGCTCGGTAGCTTTTCCGCATATCAGGGTTCGGTCTTGTCACTGTTTATCAAGGTCCATTCTATCGTCGATTTACAAAGTCGGTACCCGGCTTGTCGCGGATAACACGGCACCTAGGTGGACGACCCGGCTCACATGGTCACGCCTAGAGGCGCAAGATCGAGTGTAACTGTGTTTGGTTGTGGGTATAAGACACATGTGCCCGAGCAGACGACTTTCTTTCCGAAGTCGGTTGGAGACGCCATGCAGTGTTAGTAGAAACAGGGATACTTAGGGTCGCCACCCAGCGTCCTGAGAGTCCACACCTGTCCATGGATAATTAGTACTAACTTTTATACGTTGACTAATCGAAGCTTGACAAAGTTTCTGAATCGCCTTGTGACCGCGTTTGCGGGAGTTACTACGGATGAACTAGGATTGCGTAGACGTGTTCCTGTGTACCCTGCATAGCGTCGCCAACTGACTTGGGAACAGTTAAGTTCCCCCATCTGGTTAGAAATTTTTGGATGGAAATAATAGCTTACAGAAGAACTCCCATAGCACACTATCTTAATGACCAGGCCAAGTCCAAATCGATAGACATAACGTCAGACAGTCATGTTGGGGTAATATTTGGCACGTTGATTTTATCAAG
TTACAGGTTCCCTCTGGCTCTCCTGTAGATACTTAACACAGTCACCCTTACATAGATTCTTAAGATGCTTTAGTCAAAACTAAAGTGTATGTGAATTGGCGTCAAGCGTGTATTGGGAGAGGTTACCTTCTTGCCCTATGGGATGTTTGTTGCATACTGAACTTATTGTGATGCATATAAGTTCTAGGAGGCCGTATTGTCTTTGCCAGTAAATCAAGCTGCGTTTAATTGTACCGAAATCCGTGGGCGACGCAATGCAGTCAAATGTGAAACTCCCCCTTCCACCGGCAGCACTTAAGTTTCCGTCGGACTAGTGTACAAGACCGTATCGAGGATAATCTCTGGTAATCGGAAATCTCTCAAAGCAGGACTGACCGAAACAGCGAGTTGGCCTATTGAGATGCATAGTGTAACTACTGCAGACCGCTGGCCACTACAAACGACTATACCAAATGTATATAGAAAGGTAGCTCAGTGTGGCCATTAAGTGTTTATCGCTTTCATGAGCCCTAGTCACGACAAACATAACTCATAGCATGATAGCTTGAATATCCGAAAGGGTACTCTTTGCAACGGTGATTATTTAGCCGCGACTCGACGAGTTTTAACATAATTCGTATTGAGACCAGGGGGACTGTAGGCCGTCACTTTATACGAGCGTCGGTGCGCCTCCGCCTGACGTCACACTCGTGTTCCCCGATCCTTCGAGAATAATGACAGACGATATACTTAATTCAGCTAGATCCTATACTCACTATGGGACACATCTGCATTGCGTCACCAACGGACTTAGGTATGCGCAGGGTACTACTAAAGTAAGCGGGTGTCAATGCCAGGAGAATGACATCAAAGGGGAGGACAGTCAAGATGCGAAACCTGCAAGGCTTTGTCTGACTGCTGAGACCTGATAAAGGTCGGCAGTCTTGAGGCTGGGAAAGCTGTTGGGCCTCCTCGCAAGGGAAGAATGGTACCGCCTTTCCATGAAGAGCGCGCTCATACCGAATCTAAGTTGGGTGAGTGACTGTAAAACGTATGACCGAAACCTGTGAGGCCGGGGGGTTGCATAGCATCTCCAACCGATTTGGGCAACGGCAACACGGTCTCCAGGGTGGTATATAGTCCCTGTCTTTTCCCGAACTTTCATGTCGTGTAAGCGACTCTTTCGGTTATTGAACACGATGGGTAGACGCGTCGCGCCACATGAACCACACCTTGGAATGCTCGGAGTCCTCATACGCACGCCGTGGATTCTCTAGCACCAAGAGATGACATGGACGTTCCGAGGTTCATTTCTGAAAGTCGTTTTGTCACCACAGCAGCACCCTGGGCTGCATGGGAGGAGAGTCATTAACGCAGAAGTTTGCTTGTGCTTCCTATTAGGCACCATAGCGATGTACAACCCTCGGCAGCTGAGATGTTCGCCCTTTGGTGTACCTGCTTACGTAGTCCCAATCAAAAAAAATATAGCTGCGCGTTATCATTGCTGAATACGGAATGAGCGGGGCATCGGTGTAGTTGTGAAGTACCGTTACCCAGCCTCCCCGAAAGTGTTAGCGGGCAAACGCTATGTCAAATGTTCATCAAAGTCACTCAATGCCGAACTTGACTGATAGGCTAGAGTGATGGTTACCTATCTAAGTCCTTATGTATGAGGCCTGATGTTACTAAGGTGCAGTAGTAAGGGGGACAGGAGACTAGCCCAATAGACTGGTCGACCGCCCCTGGGCCGTCCCTACTCAGTCACCTGGCCAACATGTCTCCTAGGCCAATATGTAGTATTAGTTTGGGTTTGTAATGTCAGCTTGTACTCGTTATCGTCACACGCGCCCACAATTCTATCCGTCTTTAGGCGACCAAGGAGATGTATCTTGTCCCGGGGAATCAAACCAGATTACTCCATGTAGTGCTGTGCGCTTAAATGGTTAAGCAATGATTAGTTACCCATACAGCGCTTTGGAAGCCGGGCGAACATTTTACCTCTATTGTGACGAAAGTTGGTG
GGAGTATCTGGTCCTCGTTGTAGGAACCACTAACGCATACTCCGTATCCCTAACGAGTGAGAGTGGAGAGACCGGGACTACATCATGGTCTAACCAACACAGGGATGGCAAAAGGCAGCCACTGGTTAAGGACGCGCAAAGAGGCTGTATACGCAAACAGGGGCTATAACGATACCACGAATGGCCGCAGTCCAGATCCCCAATTAATGAGCGGGTCCCTGTAAGGGGCTATGCCAGGCTTAGGCGTCTCCGCTGCAAATGGATGGCACACATACCTGTGGGGAAGTGGAACTGAGGGCCAAGACGAGAACCCGTACAGGGGACATAAGTACCCCGCTGTTCCAGGCTCTGACCGGCGTGATGTAGGAGGAGCCGAACGACAAGTTGTACAAATCTCCAACTTTTGTAAGCGTCCGAATCACTTTGGTCCAGATTAGGTAATCAGGCATGCTCTGAGAGGTCGTGGTAAGTGTAGACTATTCAAGGACGGTAACTCATGGGAGACGGGATAATAAGCGATTCACGGGCATCGGCCCATGAGTGGAACGATCATACAGCTGCAATTGGTGTGCTGTAGCGCAACAACCGAACACCCGCGCCAAGTATTGCCCGGTCACTCTCTCCGTACTGGGCCTACTTTTAGACCACACTGAACCGATACCTTTGCATTGCGTCCCCGACGCTTTTCGGACCCCCCTGGGCGGCGTAAGATACTCCCAGACATTAGCCAAGTATCCATGGGGCTGGATTTAGCCGTCCAGTTTCACCTCAGATATCAACCAGCAGGGCCCTTGGGTTCCCCCTAACCTGGGTTGTGCAACTTGAACTCGGGAAGGGTCGAAGTCCGAACCCCCCCAGGTCGAGCCCACACGACGTCCTTGTTATCCCAGTGGGTTCATCAACGCGATTCACTGGTCCTGGGTGGCGATTTACCAACATCTCTCAATACGCACTAGGCGCGCCTGGCCACCTGAGGACGCTTCCTTTCCGCCTCATTGCTGCCACGAGCCCTACTCTGCACTTAATGTCTGGTATTTCTCCGCGGTTCCCTGCCGACTCCCGTTGGGGGACTACTTTCAGCAGTTCTCAACCAAGAATTTACGGGTTGAGCGACCGTAACACGCTGCAAGGGGTGCGGACACATTATGTCTATCTCCTATAGTAGAGGAACTATTAAGACAAGGGATTGCTACCTTTTGTGGCCCCGTGATATGCAAGAGATTTTGTTCGCATCTTGCTCCGCTTAGAATAACAGGGGCGGTAGGTGAAACCTTAAGAAGGCGGTGTCATGCCTAGCCGAAAAGCGTTGGTGACGCTATGCAAGGGTTGGCCCTCGTCAAGGGTCCGGGGTGCGGTGCATGACTGCTTAGCCGTATCTCCCAGTTGAAAGCATGGCGGTCAATTGTGTAAGGTACCTTGCCTAAGAGCGTGGGAGATGCAATGCAGCGACGGACTTGGGTGTTTAGGATCGCATATGACAAACAATATTCCTGTGAAAAGGCCCAGCCTCTGGTACGACGTGGCCTAACTCTTGCAACCGGCTTGTGATGATTTATAATTTAGGCAACAACCTTCGCTGGTTTTCAGAGGATTTGCAAGATAGTGGTGGATCGCTGCCCCCATTCAACTTGGAGCGCTGATCATGAAACCAATAATTCGTCCTCTCGGAAATTGTAGTCTTTTGCTCCCGTTTCTCGTAATGCGCCGACCGGCATTAGTATGGCGGGGAGTTACCCCCGCGGGGAGTCCGTTAGGCATTACCTGCGTGACAATGGTCGGAATATTGCTATGGTGACCCACAACGGACGGAGTTTCACAATTGAGGATGGGATTCCGTTTTAGTCTAGGTAGGAACGCAATACAGTTAGATTGCGGTACACCCAAGTCGGTTGGCGATGCGATGCAGGCGTTTGTGATGGTTCCGACAAATTGGCTGTGTCAAGATTATATCCGTGGGTGATCGGTTAGCCTGTAGGATAGAGGCTCCTAGGTGTCCACGT
GACAACGCTAACCAGATACAGTCACAAGCGTAGGCGGCTGTGTTCGCATTCGTGATGCTATGTATCCCTGTGTGGCCTTTTGTGTTGAACTTCCAAACGTTGGGTCGGTTCAAACTGGCGCACACACATGCCGATGGCCAGCCGCACTAAGGCACCGAAAAGTGTCGGCGACGCCATGCAAGCACATGTGGATCGTTTCCGTTCTTATTGCTAGTGCTCACTCCGTGAGCGCCAAAGTAGAACAGTTTTCTTACTTACGGCCCGCCGAGATTGTGAATAAAAGGAAAAGTGACTCAGTATTTGGCGATCGGCGACCCACACATGCAACTGTCTAGCCTACGATCGACTATTTGGCAATGGAGTCCACGAGTCGAGACCAGGGTGCGGGTGTAAGTATCTCGTATGATACTGTA"
# Sample peptide (one-letter amino-acid codes); the commented-out driver
# further down passes it to prot_in_genome.
protein = "VKLFPWFNQY"
def prot_in_genome(genome, protein, genetic_code):
    """Find the substrings of a genome that encode a peptide.

    Every RNA string translating into ``protein`` is enumerated, converted to
    DNA, and searched for in ``genome`` together with its reverse complement,
    so that occurrences on either strand are reported.

    Args:
        genome: DNA string to search in.
        protein: Peptide given as a string of one-letter amino-acid codes.
        genetic_code: Kept for interface compatibility. It is not consulted
            here; the candidates come from ``all_Prot_RNA(protein)``, which
            decides which codon table is used.

    Returns:
        A list of DNA motifs found in ``genome``. Each motif appears once per
        non-overlapping occurrence (as counted by ``str.count``). Forward
        candidates come first, followed by the reverse complements. The list
        is empty when nothing matches or when ``protein`` is empty.
    """
    if not protein:
        # The only candidate would be "", which "occurs" at every position.
        return []
    forward = [RNA_to_DNA(rna) for rna in all_Prot_RNA(protein)]
    candidates = forward + [DNArc(dna) for dna in forward]
    encoding_DNAs = []
    for motif in candidates:
        # str.count is 0 for absent motifs, so no separate "in" test is needed.
        encoding_DNAs.extend([motif] * genome.count(motif))
    return encoding_DNAs
# with open("Bacillus brevis genome.txt", "r") as data:
# genome = data.read()
# genome = genome.replace("\n","")
# if "\n" in genome:
# print "yessss"
#
# with open("motifs.txt", "w") as result:
# result.write(genome)
#
# motifs = prot_in_genome(genome, protein, genetic_code)
# print len(motifs)
# with open("motifs.txt", "w") as result:
# for e in prot_in_genome(genome, protein, genetic_code):
# result.writelines(e +"\n")
# How many subpeptides does a cyclic peptide of length n have? : n*(n-1)
# Mass table used by the spectrum functions: key is the one-letter amino-acid
# code, value is the integer mass of that amino acid (presumably in daltons).
# Note that some residues share a mass: I and L are both 113, K and Q are
# both 128.
aa_masses = {'A': 71, 'C': 103, 'E': 129, 'D': 115, 'G': 57, 'F': 147, 'I': 113,
'H': 137, 'K': 128, 'M': 131, 'L': 113, 'N': 114, 'Q': 128, 'P': 97,
'S': 87, 'R': 156, 'T': 101, 'W': 186, 'V': 99, 'Y': 163}
def linearSpectrum(peptide):
    """Compute the linear spectrum of a peptide.

    The linear spectrum is the sorted collection of the masses of all
    contiguous sub-peptides of ``peptide``, plus the mass 0 of the empty
    peptide.

    Args:
        peptide: Peptide given as a string of one-letter amino-acid codes.

    Returns:
        A sorted list of integer masses, duplicates included.

    Raises:
        KeyError: If a residue is not a key of ``aa_masses``.
    """
    # prefixMass[k] is the mass of the first k residues,
    # e.g. peptide NQEL -> [0, 114, 242, 371, 484].
    prefixMass = [0]
    for residue in peptide:
        prefixMass.append(prefixMass[-1] + aa_masses[residue])
    # The sub-peptide between two cut points weighs the difference of the two
    # prefix masses; combinations() yields every pair with the earlier cut
    # point first.
    linearspectrum = [0]
    linearspectrum.extend(
        later - earlier
        for earlier, later in itertools.combinations(prefixMass, 2)
    )
    linearspectrum.sort()
    return linearspectrum
def cyclicSpectrum(peptide):
    """Generate the theoretical spectrum of a cyclic peptide.

    The result holds the mass 0, the mass of every contiguous sub-peptide of
    the linear reading of ``peptide``, and the mass of every sub-peptide that
    wraps around the end of the cycle.

    Args:
        peptide: Cyclic peptide given as a string of one-letter amino-acid
            codes.

    Returns:
        A sorted list of integer masses, duplicates included.

    Raises:
        KeyError: If a residue is not a key of ``aa_masses``.
    """
    # prefixMass[k] is the mass of the first k residues,
    # e.g. peptide NQEL -> [0, 114, 242, 371, 484].
    prefixMass = [0]
    for residue in peptide:
        prefixMass.append(prefixMass[-1] + aa_masses[residue])
    # Total mass of the peptide
    peptide_mass = prefixMass[-1]
    spectrum = [0]
    for i, j in itertools.combinations(range(len(prefixMass)), 2):
        fragment_mass = prefixMass[j] - prefixMass[i]
        spectrum.append(fragment_mass)
        if i > 0 and j < len(peptide):
            # The complementary fragment wraps around the cycle, e.g.
            # Mass(LN) = Mass(NQEL) - Mass(QE) = 484 - 257 = 227.
            spectrum.append(peptide_mass - fragment_mass)
    spectrum.sort()
    return spectrum
# Demo: print the theoretical cyclic spectrum of a sample peptide on a single
# line, masses separated by spaces. A one-argument print(...) call behaves the
# same on Python 2 and Python 3.
print(" ".join(str(mass) for mass in cyclicSpectrum("NTKDKHAHILYNTRC")))
# The brute force cyclopeptide sequencing algorithm BFCyclopeptideSequencing generates
# all possible peptides whose mass is equal to ParentMass(Spectrum) and then checks
# which of these peptides has theoretical spectra matching Spectrum.
#
# BFCyclopeptideSequencing(Spectrum)
# for every peptide with Mass(Peptide) equal to ParentMass(Spectrum)
# if Spectrum = Cyclospectrum(Peptide)
# output Peptide
# max_number_aa = totalmass/lightestmass_aa
# min_number_aa = totalmass/highestmass_aa
# list of all combi with min_number_aa ,possible_RNAs = [ ''.join(x) for x in list(itertools.product(*protein_to_DNA(protein))) ]
# keep track of those that do match mass (are less than mass)
# then continue making combination with them to reach max_number_aa
# for number between min_number_aa and max_number_aa:
# range (max_number_aa, debut, fin,pas)
# mass(combination number aa ) = ? mass_peptide ?
# possible_RNAs = [ ''.join(x) for x in list(itertools.product(*protein_to_DNA(protein))) ]
# same spectrum ?
# keep track of those who are not same spectrum
# output
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment