snasphysicist/codon_amino_acid.py

## codon_amino_acid.py

import json
import Bio
from Bio import SwissProt

#Complementary base pairs
#Used when "inverting" sequence
base_pairs = {
                "A" : "T" ,
                "C" : "G" ,
                "G" : "C" ,
                "T" : "A"
             }


#File paths

#File containing mapping of codons to amino acids
#and amino acid letters in JSON format
mapping_file = "codons.json"

#File containing swissprot database
#converted to JSON format
swissprot_file = "/home/scott/Downloads/swissprotall.json"

#Load in the codon to amino acid JSON
with open( mapping_file , "r" ) as file :
    mapping = json.load( file )


#Function to convert "transcribe"
#a single codon to the amino acid code
#The argument is the codon
#If the codon does not match any known codon
#or if it is the wrong number of letter, etc...
#the function will return an empty string
def codon_to_amino_acid( codon ) :
    code = ""
    for amino_acid_name in mapping :
        amino_acid = mapping[ amino_acid_name ]
        if codon.upper() in amino_acid[ "codons" ] :
            code = amino_acid[ "code" ]
            break
    return code


#This function takes a string of bases
#and transcribes it into an amino acid code string
#Argument is a string fragment of DNA
def translate_dna( dna ) :
    amino_acid_sequence = ""
    i = 0
    while( i+2 < len( dna ) ) :
        #Look at three bases at a time and
        #transcripe them with the previous function
        amino_acid_sequence += codon_to_amino_acid( dna[ i:i+3 ] )
        i += 3
    return amino_acid_sequence


#Reverse a string, i.e. abcde -> edcba
#Done in a dumb way to avoid any possible
#object/reference issues
#Argument is the string to be reversed
def reverse_string( string_in ) :
    #Create new string
    string_out = ""
    #Add characters to the new string one at
    #a time, starting from the end of the argument
    #and working back towards the start
    for i in reversed(range(len(string_in))) :
        string_out += string_in[i]
    return string_out


#This generates a new dna string
#by swapping each base with its
#base pair complement
#Argument is a dna fragment as a string
def invert_sequence( dna ) :
    inverted = ""
    #For each base in the dna fragment given
    #add the complement to the string
    for base in dna.upper() :
        inverted += base_pairs[ base ]
    return inverted


#Function to find exact matches for a protein
#sequence from the swissprot database
#Argument 1 is the protein residue as a string of protein codes
#Argument 2 is a dictionary of swissprot records
def get_swissprot_matches( residue , records ) :
    matches = {}
    for key , value in records.items() :
        #Match with the sequence property from the record
        if residue in value[ "sequence" ] :
            matches[ key ] = value
    return matches


#List of dna fragments to check for matches
dna_fragments = [
                    "gaggagaagtctgccgttactgccctgtgg" ,
                    "tgaagagtggtacttcggaaagatcagtag" ,
                    "ccgtactacaactacgctggtgcattcaag" ,
                    "tttaccggtagcatttacttggtgcttggt" ,
                    "ttgacgtgagcgcgaaggtcggcttcgggc" ,
                    "tgggatatttcagatcaatagccgctactg"
                ]

print( "Translating fragments to residues" )

#We will store all fragments
#with their translations there
translations = {}

#For each input fragment
for fragment in dna_fragments :
    #Look first at the original fragment
    #Then at the fragment shifted by one
    #and then two bases
    for i in [ 0 , 1 , 2 ] :
        #In each case transcribe the original
        cut_fragment = fragment[i:].upper()
        translations[ cut_fragment ] = translate_dna( cut_fragment )
        #Then transcribe the sequence from the other helix
        cut_fragment = reverse_string( invert_sequence( fragment ) )[i:]
        translations[ cut_fragment ] = translate_dna( cut_fragment )

print( "Loading swissprot records" )

#Import the JSON swissprot records
with open( swissprot_file , "r" ) as file :
    swissprot_records = json.load( file )

print( "Loaded " + str(len(swissprot_records)) + " protein records" )
print( "Matching residues to swissprot sequences" )

#Path to the output file
results_path = "/home/scott/Documents/Biomakespace/EMBER/Part1/sequence_matching.txt"

#Open the file
results_file = open( results_path , "w" )

#For each entry in translations dictionary
for key , value in translations.items() :
    #Write out the dna & protein sequences
    results_file.write( "For fragment " + key + " residue " + value + "\n" )
    #Only look at sequences that don't keep a stop codon
    if "!" not in value :
        #Get the exact matches between the sequence
        #and the entries in swissprot
        hits = get_swissprot_matches( value , swissprot_records )
        #For each hit returned
        for key2 , value2 in hits.items() :
            #Write out the hit to the output file
            results_file.write( "  " + key2 + "\n  " + value2[ "sequence" ] + "\n\n" )
        #If there are no hits report that there are no matches
        if len( hits ) == 0 :
             results_file.write( "  No matches\n" )
    #If it does contain a stop codon, report this
    else :
        results_file.write( "  Contains STOP Codon\n" )
    results_file.write( "\n" )

#Close the output file
results_file.close()

	import json
	import Bio
	from Bio import SwissProt

	#Complementary base pairs
	#Used when "inverting" sequence
	base_pairs = {
	"A" : "T" ,
	"C" : "G" ,
	"G" : "C" ,
	"T" : "A"
	}


	#File paths

	#File containing mapping of codons to amino acids
	#and amino acid letters in JSON format
	mapping_file = "codons.json"

	#File containing swissprot database
	#converted to JSON format
	swissprot_file = "/home/scott/Downloads/swissprotall.json"

	#Load in the codon to amino acid JSON
	with open( mapping_file , "r" ) as file :
	mapping = json.load( file )



	#Function to convert "transcribe"
	#a single codon to the amino acid code
	#The argument is the codon
	#If the codon does not match any known codon
	#or if it is the wrong number of letter, etc...
	#the function will return an empty string
	def codon_to_amino_acid( codon ) :
	code = ""
	for amino_acid_name in mapping :
	amino_acid = mapping[ amino_acid_name ]
	if codon.upper() in amino_acid[ "codons" ] :
	code = amino_acid[ "code" ]
	break
	return code



	#This function takes a string of bases
	#and transcribes it into an amino acid code string
	#Argument is a string fragment of DNA
	def translate_dna( dna ) :
	amino_acid_sequence = ""
	i = 0
	while( i+2 < len( dna ) ) :
	#Look at three bases at a time and
	#transcripe them with the previous function
	amino_acid_sequence += codon_to_amino_acid( dna[ i:i+3 ] )
	i += 3
	return amino_acid_sequence



	#Reverse a string, i.e. abcde -> edcba
	#Done in a dumb way to avoid any possible
	#object/reference issues
	#Argument is the string to be reversed
	def reverse_string( string_in ) :
	#Create new string
	string_out = ""
	#Add characters to the new string one at
	#a time, starting from the end of the argument
	#and working back towards the start
	for i in reversed(range(len(string_in))) :
	string_out += string_in[i]
	return string_out



	#This generates a new dna string
	#by swapping each base with its
	#base pair complement
	#Argument is a dna fragment as a string
	def invert_sequence( dna ) :
	inverted = ""
	#For each base in the dna fragment given
	#add the complement to the string
	for base in dna.upper() :
	inverted += base_pairs[ base ]
	return inverted



	#Function to find exact matches for a protein
	#sequence from the swissprot database
	#Argument 1 is the protein residue as a string of protein codes
	#Argument 2 is a dictionary of swissprot records
	def get_swissprot_matches( residue , records ) :
	matches = {}
	for key , value in records.items() :
	#Match with the sequence property from the record
	if residue in value[ "sequence" ] :
	matches[ key ] = value
	return matches



	#List of dna fragments to check for matches
	dna_fragments = [
	"gaggagaagtctgccgttactgccctgtgg" ,
	"tgaagagtggtacttcggaaagatcagtag" ,
	"ccgtactacaactacgctggtgcattcaag" ,
	"tttaccggtagcatttacttggtgcttggt" ,
	"ttgacgtgagcgcgaaggtcggcttcgggc" ,
	"tgggatatttcagatcaatagccgctactg"
	]

	print( "Translating fragments to residues" )

	#We will store all fragments
	#with their translations there
	translations = {}

	#For each input fragment
	for fragment in dna_fragments :
	#Look first at the original fragment
	#Then at the fragment shifted by one
	#and then two bases
	for i in [ 0 , 1 , 2 ] :
	#In each case transcribe the original
	cut_fragment = fragment[i:].upper()
	translations[ cut_fragment ] = translate_dna( cut_fragment )
	#Then transcribe the sequence from the other helix
	cut_fragment = reverse_string( invert_sequence( fragment ) )[i:]
	translations[ cut_fragment ] = translate_dna( cut_fragment )

	print( "Loading swissprot records" )

	#Import the JSON swissprot records
	with open( swissprot_file , "r" ) as file :
	swissprot_records = json.load( file )

	print( "Loaded " + str(len(swissprot_records)) + " protein records" )
	print( "Matching residues to swissprot sequences" )

	#Path to the output file
	results_path = "/home/scott/Documents/Biomakespace/EMBER/Part1/sequence_matching.txt"

	#Open the file
	results_file = open( results_path , "w" )

	#For each entry in translations dictionary
	for key , value in translations.items() :
	#Write out the dna & protein sequences
	results_file.write( "For fragment " + key + " residue " + value + "\n" )
	#Only look at sequences that don't keep a stop codon
	if "!" not in value :
	#Get the exact matches between the sequence
	#and the entries in swissprot
	hits = get_swissprot_matches( value , swissprot_records )
	#For each hit returned
	for key2 , value2 in hits.items() :
	#Write out the hit to the output file
	results_file.write( " " + key2 + "\n " + value2[ "sequence" ] + "\n\n" )
	#If there are no hits report that there are no matches
	if len( hits ) == 0 :
	results_file.write( " No matches\n" )
	#If it does contain a stop codon, report this
	else :
	results_file.write( " Contains STOP Codon\n" )
	results_file.write( "\n" )

	#Close the output file
	results_file.close()