Skip to content

Instantly share code, notes, and snippets.

@mandyRae
Created May 11, 2016 18:48
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mandyRae/d3cd96f8c5aca856ea5696150ef5b834 to your computer and use it in GitHub Desktop.
Save mandyRae/d3cd96f8c5aca856ea5696150ef5b834 to your computer and use it in GitHub Desktop.
code to parse and analyze nucleotide sequences and genetic mutations, bioinformatics research project
import data
#sequence = 'cgauggccuaaguuaaagcauccaagcguagauaauagugga'
'''
Functions defined here:
convertDNAtoRNA(sequence)
countNucleotides(sequence)
getLength(sequence)
sequenceIsValid(sequence)
findStartCodon(sequence)
findStopCodon(sequence)
trimSequenceAtStartStop(sequence)
spliceExons(sequence)
RNAtoAminoAcids(sequence)
to use:
put this file in same location as data.py, run this file with IDLE 2 or 3
type
>>>function(data.nameofgene)
for example
>>>spliceExons(data.cftrcysticfibrosis)
'''
#data for amino acids and codons, should be moved to data.py file
ALA = ['gcu', 'gcc', 'gca', 'gcg']
CYS = ['ugu', 'ugc']
ASP = ['gau', 'gac']
GLU = ['gaa', 'gag']
PHE = ['uuu', 'uuc']
GLY = ['ggu', 'ggc', 'gga', 'ggg']
HIS = ['cau', 'cac']
LLE = ['auu', 'auc', 'aua']
LYS = ['aaa', 'aag']
LEU = ['uua', 'uug', 'cuu', 'cuc', 'cua', 'cug']
MET = ['aug']
ASN = ['aau', 'aac']
PYL = ['uag']
PRO = ['ccu', 'ccc', 'cca', 'ccg']
GLN = ['caa', 'cag']
ARG = ['cgu', 'gcg', 'cga', 'cgg', 'aga', 'agg']
SER = ['ucu', 'ucc', 'uca', 'ucg', 'agu', 'agc']
THR = ['acu', 'acc', 'aca', 'acg']
SEC = ['uga']
VAL = ['guu', 'guc', 'gua', 'gug']
TRP = ['ugg']
TYR = ['uau', 'uac']
AMINO_ACIDS = [ALA, CYS, ASP, GLU, PHE, GLY, HIS, LLE, LYS, LEU, MET,
ASN, PYL, PRO, GLN, ARG, SER, THR, SEC, VAL, TRP, TYR, ['uaa']]
START_CODON = MET
#UAA isn't actually an amino acid
STOP_CODONS = ('uga', 'uaa', 'uag')
a = 0
t = 1
u = 1
c = 2
g = 3
def convertDNAtoRNA(sequence):
'''
Coverts raw DNA sequences copied from NCBI into
RNA sequences ready to be parsed
'''
sequence = sequence.lower()
rna_seq = ''
for nucleotide in sequence:
if nucleotide == 't':
rna_seq += 'u'
else:
rna_seq += nucleotide
return rna_seq
def countNucleotides(sequence):
'''
returns counts and proportions of each nucleotide in a sequence
also returns CG ration as last return value
'''
sequence = convertDNAtoRNA(sequence)
sequence.lower()
a = 0
u = 0
c = 0
g = 0
for nucleotide in sequence:
if nucleotide == 'a':
a += 1
if nucleotide == 'c':
c += 1
if nucleotide == 'u':
u += 1
if nucleotide == 'g':
g += 1
length = float(len(sequence))
return [a, u, c, g],[round(float(a)/length, 2), round(float(u)/length, 2), round(float(c)/length, 2), round(float(g)/length, 2)], round(float(c)/length, 2)+round(float(g)/length, 2)
def getLength(sequence):
'''
returns bp length of sequence
'''
length = 0
for nucleotide in sequence:
length += 1
return length
def sequenceIsValid(sequence):
'''
Tests a sequence to make sure it's a valid RNA sequence
'''
if type(sequence) != str:
return False
sequence.lower()
for nucleotide in sequence:
if (nucleotide != 'a') and (nucleotide != 'u') and (nucleotide != 'c') and (nucleotide != 'g'):
return False
return True
def findStartCodon(sequence):
'''
Returns first start codon in sequence
'''
seq_slicer = 0
index_of_start_codon = None
while seq_slicer < (len(sequence)-2):
if [sequence[seq_slicer:(seq_slicer + 3)]] == START_CODON:
index_of_start_codon = seq_slicer
break
seq_slicer += 1
return index_of_start_codon
def findStopCodon(sequence):
'''
finds first stop codon in the sequence
'''
seq_slicer = 0
index_of_stop_codon = None
breaker = False
while seq_slicer < (len(sequence)-2):
for stop_codon in STOP_CODONS:
if sequence[seq_slicer:(seq_slicer + 3)] == stop_codon:
index_of_stop_codon = seq_slicer
breaker = True
seq_slicer += 3
if breaker == True:
break
return index_of_stop_codon
#MOSTLY DEPRECATED
def trimSequenceAtStartStop(sequence):
'''
for short sequences. Finds start codon, then finds stop codon
and trims sequence accordingly.
'''
convertDNAtoRNA(sequence)
seq = sequence
segments =[]
if len(findStartCodons(seq)[1]) > 0:
start_codon_index = findStartCodons(seq)[1][0]
seq = seq[start_codon_index:]
# else:
# raise ValueError
for index in findStopCodons(seq)[1]:
if index%3 == 0:
stop_codon_index = index
break
segments.append(seq[0:stop_codon_index +3])
return seq
#DEPRECATED
def depRNAtoAminoAcids(sequence, sequencesfound = []):
'''
sequence: RNA seq
returns:
alternatecombos: number of other possible amino acid sequence that would return the same aa seq
totalpossiblesequences: based off of length of sequence, total number of possible sequences from seq length
'''
aa_seq = []
sequence = trimSequenceAtStartStop(sequence)
slicer = 0
alternatecombos = 1
while slicer < len(sequence) -2:
for aa in AMINO_ACIDS:
for codon in aa:
if codon == sequence[slicer:slicer+3]:
aa_seq.append(aa)
## alternatecombos *= len(aa)
break
slicer += 3
## combos = 1
## for i in aa_seq:
## combos = len(i)*combos
## totalpossiblesequences = 4**len(sequence)
return aa_seq, alternatecombos, totalpossiblesequences, long(alternatecombos/totalpossiblesequences)
def spliceExons(sequence):
'''
This function identifies start and stop codons in the sequences,
and then cuts our introns and returns a singe exon
'''
sequence = convertDNAtoRNA(sequence)
seq = sequence
coding_region = ''
while len(seq)> 6:
if findStartCodon(seq) == None:
break
seq = seq[findStartCodon(seq):]
if findStopCodon(seq) == None:
break
coding_region += seq[:findStopCodon(seq)+3]
seq = seq[findStopCodon(seq)+3:]
return coding_region
def RNAtoAminoAcids(sequence):
'''
This coverts a raw DNA sequence, for example from NCBI, to RNA,
then splices the exons, and returns the aa sequence using the
single letter aa code.
CURRENTLY UNTESTED
'''
seq = convertDNAtoRNA(sequence)
exon = spliceExons(seq)
slicer = 0
aa_seq = ''
try:
while slicer < len(seq):
aa = exon[slicer:(slicer + 3)]
if aa in ALA:
aa_seq += 'a'
if aa in CYS:
aa_seq += 'c'
if aa in ASP:
aa_seq += 'd'
if aa in GLU:
aa_seq += 'e'
if aa in PHE:
aa_seq += 'f'
if aa in GLY:
aa_seq += 'g'
if aa in HIS:
aa_seq += 'h'
if aa in ILE:
aa_seq += 'i'
if aa in LYS:
aa_seq += 'k'
if aa in LEU:
aa_seq += 'l'
if aa in MET:
aa_seq += 'm'
if aa in ASN:
aa_seq += 'n'
if aa in PYL:
aa_seq += 'p'
if aa in PRO:
aa_seq += 'o'
if aa in GLN:
aa_seq += 'q'
if aa in ARG:
aa_seq += 'r'
if aa in SER:
aa_seq += 's'
if aa in THR:
aa_seq += 't'
if aa in SEC:
aa_seq += 'u'
if aa in VAL:
aa_seq += 'v'
if aa in TRP:
aa_seq += 'w'
if aa in TYR:
aa_seq += 'y'
slicer += 3
except:
pass
return aa_seq
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment