mandyRae/bioinfo.py

## bioinfo.py
import data
#sequence = 'cgauggccuaaguuaaagcauccaagcguagauaauagugga'

'''
Functions defined here:

convertDNAtoRNA(sequence)
countNucleotides(sequence)
getLength(sequence)
sequenceIsValid(sequence)
findStartCodon(sequence)
findStopCodon(sequence)
trimSequenceAtStartStop(sequence)
spliceExons(sequence)
RNAtoAminoAcids(sequence)

to use:

put this file in same location as data.py, run this file with IDLE 2 or 3

type
>>>function(data.nameofgene)
for example
>>>spliceExons(data.cftrcysticfibrosis)

'''

#data for amino acids and codons, should be moved to data.py file
ALA = ['gcu', 'gcc', 'gca', 'gcg']
CYS = ['ugu', 'ugc']
ASP = ['gau', 'gac']
GLU = ['gaa', 'gag']
PHE = ['uuu', 'uuc']
GLY = ['ggu', 'ggc', 'gga', 'ggg']
HIS = ['cau', 'cac']
LLE = ['auu', 'auc', 'aua']
LYS = ['aaa', 'aag']
LEU = ['uua', 'uug', 'cuu', 'cuc', 'cua', 'cug']
MET = ['aug']
ASN = ['aau', 'aac']
PYL = ['uag']
PRO = ['ccu', 'ccc', 'cca', 'ccg']
GLN = ['caa', 'cag']
ARG = ['cgu', 'gcg', 'cga', 'cgg', 'aga', 'agg']
SER = ['ucu', 'ucc', 'uca', 'ucg', 'agu', 'agc']
THR = ['acu', 'acc', 'aca', 'acg']
SEC = ['uga']
VAL = ['guu', 'guc', 'gua', 'gug']
TRP = ['ugg']
TYR = ['uau', 'uac']
AMINO_ACIDS = [ALA, CYS, ASP, GLU, PHE, GLY, HIS, LLE, LYS, LEU, MET,
               ASN, PYL, PRO, GLN, ARG, SER, THR, SEC, VAL, TRP, TYR, ['uaa']]
START_CODON = MET

#UAA isn't actually an amino acid
STOP_CODONS = ('uga', 'uaa', 'uag')

a = 0
t = 1
u = 1
c = 2
g = 3

def convertDNAtoRNA(sequence):
    '''
    Coverts raw DNA sequences copied from NCBI into
    RNA sequences ready to be parsed
    '''
    sequence = sequence.lower()

    rna_seq = ''

    for nucleotide in sequence:

        if nucleotide == 't':
            rna_seq += 'u'
        else:
            rna_seq += nucleotide

    return rna_seq

def countNucleotides(sequence):
    '''
    returns counts and proportions of each nucleotide in a sequence
    also returns CG ration as last return value
    '''
    sequence = convertDNAtoRNA(sequence)
    sequence.lower()

    a = 0
    u = 0
    c = 0
    g = 0

    for nucleotide in sequence:
        if nucleotide == 'a':
            a += 1
        if nucleotide == 'c':
            c += 1
        if nucleotide == 'u':
            u += 1
        if nucleotide == 'g':
            g += 1
    length = float(len(sequence))
    return [a, u, c, g],[round(float(a)/length, 2), round(float(u)/length, 2), round(float(c)/length, 2), round(float(g)/length, 2)], round(float(c)/length, 2)+round(float(g)/length, 2)

def getLength(sequence):
    '''
    returns bp length of sequence
    '''
    length = 0
    for nucleotide in sequence:
        length += 1
    return length

def sequenceIsValid(sequence):
    '''
    Tests a sequence to make sure it's a valid RNA sequence
    '''
    if type(sequence) != str:
        return False

    sequence.lower()

    for nucleotide in sequence:
        if (nucleotide != 'a') and (nucleotide != 'u') and (nucleotide != 'c') and (nucleotide != 'g'):
            return False

    return True

def findStartCodon(sequence):
    '''
    Returns first start codon in sequence
    '''
    seq_slicer = 0
    index_of_start_codon = None
    while seq_slicer < (len(sequence)-2):
        if [sequence[seq_slicer:(seq_slicer + 3)]] == START_CODON:
            index_of_start_codon = seq_slicer
            break
        seq_slicer += 1

    return index_of_start_codon

def findStopCodon(sequence):
    '''
    finds first stop codon in the sequence
    '''
    seq_slicer = 0
    index_of_stop_codon = None
    breaker = False

    while seq_slicer < (len(sequence)-2):

        for stop_codon in STOP_CODONS:
            if sequence[seq_slicer:(seq_slicer + 3)] == stop_codon:
                index_of_stop_codon = seq_slicer
                breaker = True
        seq_slicer += 3
        if breaker == True:
            break

    return index_of_stop_codon

#MOSTLY DEPRECATED
def trimSequenceAtStartStop(sequence):
    '''
    for short sequences. Finds start codon, then finds stop codon
    and trims sequence accordingly.
    '''
    convertDNAtoRNA(sequence)
    seq = sequence
    segments =[]

    if len(findStartCodons(seq)[1]) > 0:
        start_codon_index = findStartCodons(seq)[1][0]
        seq = seq[start_codon_index:]
#    else:
#        raise ValueError

    for index in findStopCodons(seq)[1]:
        if index%3 == 0:
            stop_codon_index = index
            break

    segments.append(seq[0:stop_codon_index +3])

    return seq

#DEPRECATED
def depRNAtoAminoAcids(sequence, sequencesfound = []):
    '''
    sequence: RNA seq

    returns:
    alternatecombos: number of other possible amino acid sequence that would return the same aa seq
    totalpossiblesequences: based off of length of sequence, total number of possible sequences from seq length
    '''

    aa_seq = []
    sequence = trimSequenceAtStartStop(sequence)
    slicer = 0
    alternatecombos = 1
    while slicer < len(sequence) -2:
        for aa in AMINO_ACIDS:
            for codon in aa:
                if codon == sequence[slicer:slicer+3]:
                    aa_seq.append(aa)
##                    alternatecombos *= len(aa)
                    break
        slicer += 3
##    combos = 1
##    for i in aa_seq:
##        combos = len(i)*combos
##    totalpossiblesequences = 4**len(sequence)
    return aa_seq, alternatecombos, totalpossiblesequences, long(alternatecombos/totalpossiblesequences)

def spliceExons(sequence):
    '''
    This function identifies start and stop codons in the sequences,
    and then cuts our introns and returns a singe exon
    '''
    sequence = convertDNAtoRNA(sequence)
    seq = sequence

    coding_region = ''

    while len(seq)> 6:
        if findStartCodon(seq) == None:
            break
        seq = seq[findStartCodon(seq):]
        if findStopCodon(seq) == None:
            break
        coding_region += seq[:findStopCodon(seq)+3]
        seq = seq[findStopCodon(seq)+3:]
    return coding_region

def RNAtoAminoAcids(sequence):
    '''
    This coverts a raw DNA sequence, for example from NCBI, to RNA,
    then splices the exons, and returns the aa sequence using the
    single letter aa code.
    CURRENTLY UNTESTED
    '''
    seq = convertDNAtoRNA(sequence)
    exon = spliceExons(seq)

    slicer = 0
    aa_seq = ''

    try:

        while slicer < len(seq):
            aa = exon[slicer:(slicer + 3)]

            if aa in ALA:
                aa_seq += 'a'
            if aa in CYS:
                aa_seq += 'c'
            if aa in ASP:
                aa_seq += 'd'
            if aa in GLU:
                aa_seq += 'e'
            if aa in PHE:
                aa_seq += 'f'
            if aa in GLY:
                aa_seq += 'g'
            if aa in HIS:
                aa_seq += 'h'
            if aa in ILE:
                aa_seq += 'i'
            if aa in LYS:
                aa_seq += 'k'
            if aa in LEU:
                aa_seq += 'l'
            if aa in MET:
                aa_seq += 'm'
            if aa in ASN:
                aa_seq += 'n'
            if aa in PYL:
                aa_seq += 'p'
            if aa in PRO:
                aa_seq += 'o'
            if aa in GLN:
                aa_seq += 'q'
            if aa in ARG:
                aa_seq += 'r'
            if aa in SER:
                aa_seq += 's'
            if aa in THR:
                aa_seq += 't'
            if aa in SEC:
                aa_seq += 'u'
            if aa in VAL:
                aa_seq += 'v'
            if aa in TRP:
                aa_seq += 'w'
            if aa in TYR:
                aa_seq += 'y'
            slicer += 3
    except:
        pass
    return aa_seq
	import data
	#sequence = 'cgauggccuaaguuaaagcauccaagcguagauaauagugga'

	'''
	Functions defined here:

	convertDNAtoRNA(sequence)
	countNucleotides(sequence)
	getLength(sequence)
	sequenceIsValid(sequence)
	findStartCodon(sequence)
	findStopCodon(sequence)
	trimSequenceAtStartStop(sequence)
	spliceExons(sequence)
	RNAtoAminoAcids(sequence)

	to use:

	put this file in same location as data.py, run this file with IDLE 2 or 3

	type
	>>>function(data.nameofgene)
	for example
	>>>spliceExons(data.cftrcysticfibrosis)

	'''

	#data for amino acids and codons, should be moved to data.py file
	ALA = ['gcu', 'gcc', 'gca', 'gcg']
	CYS = ['ugu', 'ugc']
	ASP = ['gau', 'gac']
	GLU = ['gaa', 'gag']
	PHE = ['uuu', 'uuc']
	GLY = ['ggu', 'ggc', 'gga', 'ggg']
	HIS = ['cau', 'cac']
	LLE = ['auu', 'auc', 'aua']
	LYS = ['aaa', 'aag']
	LEU = ['uua', 'uug', 'cuu', 'cuc', 'cua', 'cug']
	MET = ['aug']
	ASN = ['aau', 'aac']
	PYL = ['uag']
	PRO = ['ccu', 'ccc', 'cca', 'ccg']
	GLN = ['caa', 'cag']
	ARG = ['cgu', 'gcg', 'cga', 'cgg', 'aga', 'agg']
	SER = ['ucu', 'ucc', 'uca', 'ucg', 'agu', 'agc']
	THR = ['acu', 'acc', 'aca', 'acg']
	SEC = ['uga']
	VAL = ['guu', 'guc', 'gua', 'gug']
	TRP = ['ugg']
	TYR = ['uau', 'uac']
	AMINO_ACIDS = [ALA, CYS, ASP, GLU, PHE, GLY, HIS, LLE, LYS, LEU, MET,
	ASN, PYL, PRO, GLN, ARG, SER, THR, SEC, VAL, TRP, TYR, ['uaa']]
	START_CODON = MET

	#UAA isn't actually an amino acid
	STOP_CODONS = ('uga', 'uaa', 'uag')

	a = 0
	t = 1
	u = 1
	c = 2
	g = 3

	def convertDNAtoRNA(sequence):
	'''
	Coverts raw DNA sequences copied from NCBI into
	RNA sequences ready to be parsed
	'''
	sequence = sequence.lower()

	rna_seq = ''

	for nucleotide in sequence:

	if nucleotide == 't':
	rna_seq += 'u'
	else:
	rna_seq += nucleotide

	return rna_seq

	def countNucleotides(sequence):
	'''
	returns counts and proportions of each nucleotide in a sequence
	also returns CG ration as last return value
	'''
	sequence = convertDNAtoRNA(sequence)
	sequence.lower()

	a = 0
	u = 0
	c = 0
	g = 0

	for nucleotide in sequence:
	if nucleotide == 'a':
	a += 1
	if nucleotide == 'c':
	c += 1
	if nucleotide == 'u':
	u += 1
	if nucleotide == 'g':
	g += 1
	length = float(len(sequence))
	return [a, u, c, g],[round(float(a)/length, 2), round(float(u)/length, 2), round(float(c)/length, 2), round(float(g)/length, 2)], round(float(c)/length, 2)+round(float(g)/length, 2)

	def getLength(sequence):
	'''
	returns bp length of sequence
	'''
	length = 0
	for nucleotide in sequence:
	length += 1
	return length

	def sequenceIsValid(sequence):
	'''
	Tests a sequence to make sure it's a valid RNA sequence
	'''
	if type(sequence) != str:
	return False

	sequence.lower()

	for nucleotide in sequence:
	if (nucleotide != 'a') and (nucleotide != 'u') and (nucleotide != 'c') and (nucleotide != 'g'):
	return False

	return True

	def findStartCodon(sequence):
	'''
	Returns first start codon in sequence
	'''
	seq_slicer = 0
	index_of_start_codon = None
	while seq_slicer < (len(sequence)-2):
	if [sequence[seq_slicer:(seq_slicer + 3)]] == START_CODON:
	index_of_start_codon = seq_slicer
	break
	seq_slicer += 1

	return index_of_start_codon

	def findStopCodon(sequence):
	'''
	finds first stop codon in the sequence
	'''
	seq_slicer = 0
	index_of_stop_codon = None
	breaker = False

	while seq_slicer < (len(sequence)-2):

	for stop_codon in STOP_CODONS:
	if sequence[seq_slicer:(seq_slicer + 3)] == stop_codon:
	index_of_stop_codon = seq_slicer
	breaker = True
	seq_slicer += 3
	if breaker == True:
	break

	return index_of_stop_codon

	#MOSTLY DEPRECATED
	def trimSequenceAtStartStop(sequence):
	'''
	for short sequences. Finds start codon, then finds stop codon
	and trims sequence accordingly.
	'''
	convertDNAtoRNA(sequence)
	seq = sequence
	segments =[]

	if len(findStartCodons(seq)[1]) > 0:
	start_codon_index = findStartCodons(seq)[1][0]
	seq = seq[start_codon_index:]
	# else:
	# raise ValueError

	for index in findStopCodons(seq)[1]:
	if index%3 == 0:
	stop_codon_index = index
	break

	segments.append(seq[0:stop_codon_index +3])

	return seq

	#DEPRECATED
	def depRNAtoAminoAcids(sequence, sequencesfound = []):
	'''
	sequence: RNA seq

	returns:
	alternatecombos: number of other possible amino acid sequence that would return the same aa seq
	totalpossiblesequences: based off of length of sequence, total number of possible sequences from seq length
	'''

	aa_seq = []
	sequence = trimSequenceAtStartStop(sequence)
	slicer = 0
	alternatecombos = 1
	while slicer < len(sequence) -2:
	for aa in AMINO_ACIDS:
	for codon in aa:
	if codon == sequence[slicer:slicer+3]:
	aa_seq.append(aa)
	## alternatecombos *= len(aa)
	break
	slicer += 3
	## combos = 1
	## for i in aa_seq:
	## combos = len(i)*combos
	## totalpossiblesequences = 4**len(sequence)
	return aa_seq, alternatecombos, totalpossiblesequences, long(alternatecombos/totalpossiblesequences)

	def spliceExons(sequence):
	'''
	This function identifies start and stop codons in the sequences,
	and then cuts our introns and returns a singe exon
	'''
	sequence = convertDNAtoRNA(sequence)
	seq = sequence

	coding_region = ''

	while len(seq)> 6:
	if findStartCodon(seq) == None:
	break
	seq = seq[findStartCodon(seq):]
	if findStopCodon(seq) == None:
	break
	coding_region += seq[:findStopCodon(seq)+3]
	seq = seq[findStopCodon(seq)+3:]
	return coding_region

	def RNAtoAminoAcids(sequence):
	'''
	This coverts a raw DNA sequence, for example from NCBI, to RNA,
	then splices the exons, and returns the aa sequence using the
	single letter aa code.
	CURRENTLY UNTESTED
	'''
	seq = convertDNAtoRNA(sequence)
	exon = spliceExons(seq)

	slicer = 0
	aa_seq = ''

	try:

	while slicer < len(seq):
	aa = exon[slicer:(slicer + 3)]

	if aa in ALA:
	aa_seq += 'a'
	if aa in CYS:
	aa_seq += 'c'
	if aa in ASP:
	aa_seq += 'd'
	if aa in GLU:
	aa_seq += 'e'
	if aa in PHE:
	aa_seq += 'f'
	if aa in GLY:
	aa_seq += 'g'
	if aa in HIS:
	aa_seq += 'h'
	if aa in ILE:
	aa_seq += 'i'
	if aa in LYS:
	aa_seq += 'k'
	if aa in LEU:
	aa_seq += 'l'
	if aa in MET:
	aa_seq += 'm'
	if aa in ASN:
	aa_seq += 'n'
	if aa in PYL:
	aa_seq += 'p'
	if aa in PRO:
	aa_seq += 'o'
	if aa in GLN:
	aa_seq += 'q'
	if aa in ARG:
	aa_seq += 'r'
	if aa in SER:
	aa_seq += 's'
	if aa in THR:
	aa_seq += 't'
	if aa in SEC:
	aa_seq += 'u'
	if aa in VAL:
	aa_seq += 'v'
	if aa in TRP:
	aa_seq += 'w'
	if aa in TYR:
	aa_seq += 'y'
	slicer += 3
	except:
	pass
	return aa_seq