Created
June 5, 2018 18:50
-
-
Save snasphysicist/03d9fdfe7bcb0bf1bde09021c3662498 to your computer and use it in GitHub Desktop.
A python file for transcribing DNA fragments to protein sequences and searching swissprot for exact matches
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json | |
import Bio | |
from Bio import SwissProt | |
#Complementary base pairs | |
#Used when "inverting" sequence | |
base_pairs = { | |
"A" : "T" , | |
"C" : "G" , | |
"G" : "C" , | |
"T" : "A" | |
} | |
#File paths | |
#File containing mapping of codons to amino acids | |
#and amino acid letters in JSON format | |
mapping_file = "codons.json" | |
#File containing swissprot database | |
#converted to JSON format | |
swissprot_file = "/home/scott/Downloads/swissprotall.json" | |
#Load in the codon to amino acid JSON | |
with open( mapping_file , "r" ) as file : | |
mapping = json.load( file ) | |
#Function to convert "transcribe" | |
#a single codon to the amino acid code | |
#The argument is the codon | |
#If the codon does not match any known codon | |
#or if it is the wrong number of letter, etc... | |
#the function will return an empty string | |
def codon_to_amino_acid( codon ) : | |
code = "" | |
for amino_acid_name in mapping : | |
amino_acid = mapping[ amino_acid_name ] | |
if codon.upper() in amino_acid[ "codons" ] : | |
code = amino_acid[ "code" ] | |
break | |
return code | |
#This function takes a string of bases | |
#and transcribes it into an amino acid code string | |
#Argument is a string fragment of DNA | |
def translate_dna( dna ) : | |
amino_acid_sequence = "" | |
i = 0 | |
while( i+2 < len( dna ) ) : | |
#Look at three bases at a time and | |
#transcripe them with the previous function | |
amino_acid_sequence += codon_to_amino_acid( dna[ i:i+3 ] ) | |
i += 3 | |
return amino_acid_sequence | |
#Reverse a string, i.e. abcde -> edcba | |
#Done in a dumb way to avoid any possible | |
#object/reference issues | |
#Argument is the string to be reversed | |
def reverse_string( string_in ) : | |
#Create new string | |
string_out = "" | |
#Add characters to the new string one at | |
#a time, starting from the end of the argument | |
#and working back towards the start | |
for i in reversed(range(len(string_in))) : | |
string_out += string_in[i] | |
return string_out | |
#This generates a new dna string | |
#by swapping each base with its | |
#base pair complement | |
#Argument is a dna fragment as a string | |
def invert_sequence( dna ) : | |
inverted = "" | |
#For each base in the dna fragment given | |
#add the complement to the string | |
for base in dna.upper() : | |
inverted += base_pairs[ base ] | |
return inverted | |
#Function to find exact matches for a protein | |
#sequence from the swissprot database | |
#Argument 1 is the protein residue as a string of protein codes | |
#Argument 2 is a dictionary of swissprot records | |
def get_swissprot_matches( residue , records ) : | |
matches = {} | |
for key , value in records.items() : | |
#Match with the sequence property from the record | |
if residue in value[ "sequence" ] : | |
matches[ key ] = value | |
return matches | |
#List of dna fragments to check for matches | |
dna_fragments = [ | |
"gaggagaagtctgccgttactgccctgtgg" , | |
"tgaagagtggtacttcggaaagatcagtag" , | |
"ccgtactacaactacgctggtgcattcaag" , | |
"tttaccggtagcatttacttggtgcttggt" , | |
"ttgacgtgagcgcgaaggtcggcttcgggc" , | |
"tgggatatttcagatcaatagccgctactg" | |
] | |
print( "Translating fragments to residues" ) | |
#We will store all fragments | |
#with their translations there | |
translations = {} | |
#For each input fragment | |
for fragment in dna_fragments : | |
#Look first at the original fragment | |
#Then at the fragment shifted by one | |
#and then two bases | |
for i in [ 0 , 1 , 2 ] : | |
#In each case transcribe the original | |
cut_fragment = fragment[i:].upper() | |
translations[ cut_fragment ] = translate_dna( cut_fragment ) | |
#Then transcribe the sequence from the other helix | |
cut_fragment = reverse_string( invert_sequence( fragment ) )[i:] | |
translations[ cut_fragment ] = translate_dna( cut_fragment ) | |
print( "Loading swissprot records" ) | |
#Import the JSON swissprot records | |
with open( swissprot_file , "r" ) as file : | |
swissprot_records = json.load( file ) | |
print( "Loaded " + str(len(swissprot_records)) + " protein records" ) | |
print( "Matching residues to swissprot sequences" ) | |
#Path to the output file | |
results_path = "/home/scott/Documents/Biomakespace/EMBER/Part1/sequence_matching.txt" | |
#Open the file | |
results_file = open( results_path , "w" ) | |
#For each entry in translations dictionary | |
for key , value in translations.items() : | |
#Write out the dna & protein sequences | |
results_file.write( "For fragment " + key + " residue " + value + "\n" ) | |
#Only look at sequences that don't keep a stop codon | |
if "!" not in value : | |
#Get the exact matches between the sequence | |
#and the entries in swissprot | |
hits = get_swissprot_matches( value , swissprot_records ) | |
#For each hit returned | |
for key2 , value2 in hits.items() : | |
#Write out the hit to the output file | |
results_file.write( " " + key2 + "\n " + value2[ "sequence" ] + "\n\n" ) | |
#If there are no hits report that there are no matches | |
if len( hits ) == 0 : | |
results_file.write( " No matches\n" ) | |
#If it does contain a stop codon, report this | |
else : | |
results_file.write( " Contains STOP Codon\n" ) | |
results_file.write( "\n" ) | |
#Close the output file | |
results_file.close() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment