matteoferla/protein2DNA.py

## protein2DNA.py
##THIS IS PUBLICALLY RELEASED TO DEMONSTRATE A POINT NOT AS A USABLE PIECE OF CODE, HENCE THE LACK OF ANNOTATION.
__doc__ = '''
I have peptide sequences, I need the DNA. plan:
Option A:
* find gi code
* look for gi code in the gbk file.
Option B:
* tblastn online
* download genome sequence
* extract sequence from nucleotide
Option C:
* degeneratively back translate
* blastn online
* get aligned reference
Option A won't work. C might be easy if degenerate sequence can be made
'''

from Bio.Blast import NCBIWWW, NCBIXML
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Alphabet import IUPAC
from Bio import Entrez
Entrez.email = "A.N.Other@example.com"
import re

#from https://en.wikipedia.org/wiki/Genetic_code
#if multiple version were present a super degerate was made...

def getDNA(id,start,end,strand):
    gene = next(SeqIO.parse(Entrez.efetch(db="nucleotide", id=id, rettype="fasta", retmode="text"),"fasta"))
    start =int(start)-1
    end = int(end)
    if strand[1]>0:
        return gene.seq[start:end]
    else:
        return gene.seq[start:end].reverse_complement()


def prot2DNA(infile, outfile,fromWWW=True):
    geneball=[]
    if fromWWW:
        print("blasting...")
        fasta=open(infile,"r").read()
        result_handle = NCBIWWW.qblast("tblastn", "nt", fasta)
    else:
        result_handle = open(infile,"r")
    for rec in NCBIXML.parse(result_handle):
        align=rec.alignments[0]
        hsp=align.hsps[0]
        rex=re.search("\|(.*?)\|",align.title)
        if rex:
            id=rex.group(1)
        else:
            id=align.title

        native=SeqRecord(getDNA(id,int(hsp.sbjct_start),int(hsp.sbjct_end),hsp.frame),id=align.title+"("+str(hsp.sbjct_start)+":"+str(hsp.sbjct_end)+")")
        a,b=len(native.seq.translate(to_stop=True)),hsp.query_end
        if (a != b):
            print("ERROR")
            print("found ",a,"expect ",b)
            print('sequence:', align.title)
            print('length:', align.length)
            print(hsp.query_start,hsp.query_end,align.length)
            print("id... ",id)
            print(native)
        else:
            geneball.append(native)
    SeqIO.write(geneball,outfile,"fasta")

def doublecheck(fileDNA,fileProt):
    def _checker(dexA,dexB):
        issue=set(dexA.keys()).difference(set(dexB.keys()))
        print(issue)
        print(dexA[list(issue)[0]])
    genedex={str(x.seq.translate(to_stop=True))[1:]: x for x in SeqIO.parse(fileDNA,"fasta")}
    protdex={str(x.seq)[1:]: x for x in SeqIO.parse(fileProt,"fasta")}
    _checker(genedex,protdex)
    _checker(protdex,genedex)

def prot2DNA_degenerated(infile,outfile):
    raise Exception("BLASTING WITH DEGENERATE SEQUENCE FAILS")
    #This does not seem to work on the match side.
    codontable = {
    'A': 'GCT',
    'C': 'TGY',
    'D': 'GAY',
    'E': 'GAR',
    'F': 'TTY',
    'G': 'GGN',
    'I': 'ATH',
    'H': 'CAY',
    'K': 'AAR',
    'L': 'YTN',
    'M': 'ATG',
    'N': 'AAY',
    'P': 'CCN',
    'Q': 'CAR',
    'R': 'MGN',
    'S': 'WSN',
    'T': 'ACN',
    'V': 'GTN',
    'W': 'TGG',
    'Y': 'TAY',
    '*': 'TAR',
    }
    #make a degenerate set of sequences
    genetxt=""
    geneball=[]
    for gene in SeqIO.parse(infile, "fasta"):
        gene.seq=Seq("".join([codontable[residue] for residue in gene.seq]),IUPAC.ambiguous_dna)
        print(gene.seq)
        #genetxt+=gene.format("fasta")+"\n"
        print("Blasting... if it does not change after a minute the NCBI servers must be overloaded")
        result_handle = NCBIWWW.qblast("blastn", "nt", gene.format("fasta"))
        rec = NCBIXML.read(result_handle)
        align=rec.alignments[0]
        hsp=align.hsps[0]
        print('sequence:', align.title)
        print('length:', align.length)
        print(hsp.query_start,hsp.query_end,align.length)
        if hsp.query_start == 1 and hsp.query_end == align.length:
            native=SeqRecord(Seq(hsp.sbjct,IUPAC.unambiguous_dna),id=align.title+"("+hsp.sbjct_start+":"+hsp.sbjct_end+")")
            geneball.append(native)
        else:
            print("Error")
    SeqIO.write(geneball,outfile,"fasta")

if __name__ == "__main__":
    #prot2DNA("protein.fa","DNA.fa",True)
    prot2DNA("A1UGXFSZ014-Alignment.xml","DNA.fa",False)
    doublecheck("DNA.fa","protein.fa")
	##THIS IS PUBLICALLY RELEASED TO DEMONSTRATE A POINT NOT AS A USABLE PIECE OF CODE, HENCE THE LACK OF ANNOTATION.
	__doc__ = '''
	I have peptide sequences, I need the DNA. plan:
	Option A:
	* find gi code
	* look for gi code in the gbk file.
	Option B:
	* tblastn online
	* download genome sequence
	* extract sequence from nucleotide
	Option C:
	* degeneratively back translate
	* blastn online
	* get aligned reference
	Option A won't work. C might be easy if degenerate sequence can be made
	'''

	from Bio.Blast import NCBIWWW, NCBIXML
	from Bio import SeqIO
	from Bio.Seq import Seq
	from Bio.SeqRecord import SeqRecord
	from Bio.Alphabet import IUPAC
	from Bio import Entrez
	Entrez.email = "A.N.Other@example.com"
	import re

	#from https://en.wikipedia.org/wiki/Genetic_code
	#if multiple version were present a super degerate was made...

	def getDNA(id,start,end,strand):
	gene = next(SeqIO.parse(Entrez.efetch(db="nucleotide", id=id, rettype="fasta", retmode="text"),"fasta"))
	start =int(start)-1
	end = int(end)
	if strand[1]>0:
	return gene.seq[start:end]
	else:
	return gene.seq[start:end].reverse_complement()


	def prot2DNA(infile, outfile,fromWWW=True):
	geneball=[]
	if fromWWW:
	print("blasting...")
	fasta=open(infile,"r").read()
	result_handle = NCBIWWW.qblast("tblastn", "nt", fasta)
	else:
	result_handle = open(infile,"r")
	for rec in NCBIXML.parse(result_handle):
	align=rec.alignments[0]
	hsp=align.hsps[0]
	rex=re.search("\\|(.*?)\\|",align.title)
	if rex:
	id=rex.group(1)
	else:
	id=align.title

	native=SeqRecord(getDNA(id,int(hsp.sbjct_start),int(hsp.sbjct_end),hsp.frame),id=align.title+"("+str(hsp.sbjct_start)+":"+str(hsp.sbjct_end)+")")
	a,b=len(native.seq.translate(to_stop=True)),hsp.query_end
	if (a != b):
	print("ERROR")
	print("found ",a,"expect ",b)
	print('sequence:', align.title)
	print('length:', align.length)
	print(hsp.query_start,hsp.query_end,align.length)
	print("id... ",id)
	print(native)
	else:
	geneball.append(native)
	SeqIO.write(geneball,outfile,"fasta")

	def doublecheck(fileDNA,fileProt):
	def _checker(dexA,dexB):
	issue=set(dexA.keys()).difference(set(dexB.keys()))
	print(issue)
	print(dexA[list(issue)[0]])
	genedex={str(x.seq.translate(to_stop=True))[1:]: x for x in SeqIO.parse(fileDNA,"fasta")}
	protdex={str(x.seq)[1:]: x for x in SeqIO.parse(fileProt,"fasta")}
	_checker(genedex,protdex)
	_checker(protdex,genedex)

	def prot2DNA_degenerated(infile,outfile):
	raise Exception("BLASTING WITH DEGENERATE SEQUENCE FAILS")
	#This does not seem to work on the match side.
	codontable = {
	'A': 'GCT',
	'C': 'TGY',
	'D': 'GAY',
	'E': 'GAR',
	'F': 'TTY',
	'G': 'GGN',
	'I': 'ATH',
	'H': 'CAY',
	'K': 'AAR',
	'L': 'YTN',
	'M': 'ATG',
	'N': 'AAY',
	'P': 'CCN',
	'Q': 'CAR',
	'R': 'MGN',
	'S': 'WSN',
	'T': 'ACN',
	'V': 'GTN',
	'W': 'TGG',
	'Y': 'TAY',
	'*': 'TAR',
	}
	#make a degenerate set of sequences
	genetxt=""
	geneball=[]
	for gene in SeqIO.parse(infile, "fasta"):
	gene.seq=Seq("".join([codontable[residue] for residue in gene.seq]),IUPAC.ambiguous_dna)
	print(gene.seq)
	#genetxt+=gene.format("fasta")+"\n"
	print("Blasting... if it does not change after a minute the NCBI servers must be overloaded")
	result_handle = NCBIWWW.qblast("blastn", "nt", gene.format("fasta"))
	rec = NCBIXML.read(result_handle)
	align=rec.alignments[0]
	hsp=align.hsps[0]
	print('sequence:', align.title)
	print('length:', align.length)
	print(hsp.query_start,hsp.query_end,align.length)
	if hsp.query_start == 1 and hsp.query_end == align.length:
	native=SeqRecord(Seq(hsp.sbjct,IUPAC.unambiguous_dna),id=align.title+"("+hsp.sbjct_start+":"+hsp.sbjct_end+")")
	geneball.append(native)
	else:
	print("Error")
	SeqIO.write(geneball,outfile,"fasta")

	if __name__ == "__main__":
	#prot2DNA("protein.fa","DNA.fa",True)
	prot2DNA("A1UGXFSZ014-Alignment.xml","DNA.fa",False)
	doublecheck("DNA.fa","protein.fa")