Skip to content

Instantly share code, notes, and snippets.

@snasphysicist
Created June 5, 2018 18:50
Show Gist options
  • Save snasphysicist/03d9fdfe7bcb0bf1bde09021c3662498 to your computer and use it in GitHub Desktop.
Save snasphysicist/03d9fdfe7bcb0bf1bde09021c3662498 to your computer and use it in GitHub Desktop.
A python file for transcribing DNA fragments to protein sequences and searching swissprot for exact matches
import json
import Bio
from Bio import SwissProt
#Complementary base pairs
#Used when "inverting" sequence
base_pairs = {
"A" : "T" ,
"C" : "G" ,
"G" : "C" ,
"T" : "A"
}
#File paths
#File containing mapping of codons to amino acids
#and amino acid letters in JSON format
mapping_file = "codons.json"
#File containing swissprot database
#converted to JSON format
swissprot_file = "/home/scott/Downloads/swissprotall.json"
#Load in the codon to amino acid JSON
with open( mapping_file , "r" ) as file :
mapping = json.load( file )
#Function to convert "transcribe"
#a single codon to the amino acid code
#The argument is the codon
#If the codon does not match any known codon
#or if it is the wrong number of letter, etc...
#the function will return an empty string
def codon_to_amino_acid( codon ) :
code = ""
for amino_acid_name in mapping :
amino_acid = mapping[ amino_acid_name ]
if codon.upper() in amino_acid[ "codons" ] :
code = amino_acid[ "code" ]
break
return code
#This function takes a string of bases
#and transcribes it into an amino acid code string
#Argument is a string fragment of DNA
def translate_dna( dna ) :
amino_acid_sequence = ""
i = 0
while( i+2 < len( dna ) ) :
#Look at three bases at a time and
#transcripe them with the previous function
amino_acid_sequence += codon_to_amino_acid( dna[ i:i+3 ] )
i += 3
return amino_acid_sequence
#Reverse a string, i.e. abcde -> edcba
#Done in a dumb way to avoid any possible
#object/reference issues
#Argument is the string to be reversed
def reverse_string( string_in ) :
#Create new string
string_out = ""
#Add characters to the new string one at
#a time, starting from the end of the argument
#and working back towards the start
for i in reversed(range(len(string_in))) :
string_out += string_in[i]
return string_out
#This generates a new dna string
#by swapping each base with its
#base pair complement
#Argument is a dna fragment as a string
def invert_sequence( dna ) :
inverted = ""
#For each base in the dna fragment given
#add the complement to the string
for base in dna.upper() :
inverted += base_pairs[ base ]
return inverted
#Function to find exact matches for a protein
#sequence from the swissprot database
#Argument 1 is the protein residue as a string of protein codes
#Argument 2 is a dictionary of swissprot records
def get_swissprot_matches( residue , records ) :
matches = {}
for key , value in records.items() :
#Match with the sequence property from the record
if residue in value[ "sequence" ] :
matches[ key ] = value
return matches
#List of dna fragments to check for matches
dna_fragments = [
"gaggagaagtctgccgttactgccctgtgg" ,
"tgaagagtggtacttcggaaagatcagtag" ,
"ccgtactacaactacgctggtgcattcaag" ,
"tttaccggtagcatttacttggtgcttggt" ,
"ttgacgtgagcgcgaaggtcggcttcgggc" ,
"tgggatatttcagatcaatagccgctactg"
]
print( "Translating fragments to residues" )
#We will store all fragments
#with their translations there
translations = {}
#For each input fragment
for fragment in dna_fragments :
#Look first at the original fragment
#Then at the fragment shifted by one
#and then two bases
for i in [ 0 , 1 , 2 ] :
#In each case transcribe the original
cut_fragment = fragment[i:].upper()
translations[ cut_fragment ] = translate_dna( cut_fragment )
#Then transcribe the sequence from the other helix
cut_fragment = reverse_string( invert_sequence( fragment ) )[i:]
translations[ cut_fragment ] = translate_dna( cut_fragment )
print( "Loading swissprot records" )
#Import the JSON swissprot records
with open( swissprot_file , "r" ) as file :
swissprot_records = json.load( file )
print( "Loaded " + str(len(swissprot_records)) + " protein records" )
print( "Matching residues to swissprot sequences" )
#Path to the output file
results_path = "/home/scott/Documents/Biomakespace/EMBER/Part1/sequence_matching.txt"
#Open the file
results_file = open( results_path , "w" )
#For each entry in translations dictionary
for key , value in translations.items() :
#Write out the dna & protein sequences
results_file.write( "For fragment " + key + " residue " + value + "\n" )
#Only look at sequences that don't keep a stop codon
if "!" not in value :
#Get the exact matches between the sequence
#and the entries in swissprot
hits = get_swissprot_matches( value , swissprot_records )
#For each hit returned
for key2 , value2 in hits.items() :
#Write out the hit to the output file
results_file.write( " " + key2 + "\n " + value2[ "sequence" ] + "\n\n" )
#If there are no hits report that there are no matches
if len( hits ) == 0 :
results_file.write( " No matches\n" )
#If it does contain a stop codon, report this
else :
results_file.write( " Contains STOP Codon\n" )
results_file.write( "\n" )
#Close the output file
results_file.close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment