Created
January 30, 2016 19:03
-
-
Save matteoferla/1429635b207ab3678b33 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
##THIS IS PUBLICALLY RELEASED TO DEMONSTRATE A POINT NOT AS A USABLE PIECE OF CODE, HENCE THE LACK OF ANNOTATION. | |
__doc__ = ''' | |
I have peptide sequences, I need the DNA. plan: | |
Option A: | |
* find gi code | |
* look for gi code in the gbk file. | |
Option B: | |
* tblastn online | |
* download genome sequence | |
* extract sequence from nucleotide | |
Option C: | |
* degeneratively back translate | |
* blastn online | |
* get aligned reference | |
Option A won't work. C might be easy if degenerate sequence can be made | |
''' | |
from Bio.Blast import NCBIWWW, NCBIXML | |
from Bio import SeqIO | |
from Bio.Seq import Seq | |
from Bio.SeqRecord import SeqRecord | |
from Bio.Alphabet import IUPAC | |
from Bio import Entrez | |
Entrez.email = "A.N.Other@example.com" | |
import re | |
#from https://en.wikipedia.org/wiki/Genetic_code | |
#if multiple version were present a super degerate was made... | |
def getDNA(id,start,end,strand): | |
gene = next(SeqIO.parse(Entrez.efetch(db="nucleotide", id=id, rettype="fasta", retmode="text"),"fasta")) | |
start =int(start)-1 | |
end = int(end) | |
if strand[1]>0: | |
return gene.seq[start:end] | |
else: | |
return gene.seq[start:end].reverse_complement() | |
def prot2DNA(infile, outfile,fromWWW=True): | |
geneball=[] | |
if fromWWW: | |
print("blasting...") | |
fasta=open(infile,"r").read() | |
result_handle = NCBIWWW.qblast("tblastn", "nt", fasta) | |
else: | |
result_handle = open(infile,"r") | |
for rec in NCBIXML.parse(result_handle): | |
align=rec.alignments[0] | |
hsp=align.hsps[0] | |
rex=re.search("\|(.*?)\|",align.title) | |
if rex: | |
id=rex.group(1) | |
else: | |
id=align.title | |
native=SeqRecord(getDNA(id,int(hsp.sbjct_start),int(hsp.sbjct_end),hsp.frame),id=align.title+"("+str(hsp.sbjct_start)+":"+str(hsp.sbjct_end)+")") | |
a,b=len(native.seq.translate(to_stop=True)),hsp.query_end | |
if (a != b): | |
print("ERROR") | |
print("found ",a,"expect ",b) | |
print('sequence:', align.title) | |
print('length:', align.length) | |
print(hsp.query_start,hsp.query_end,align.length) | |
print("id... ",id) | |
print(native) | |
else: | |
geneball.append(native) | |
SeqIO.write(geneball,outfile,"fasta") | |
def doublecheck(fileDNA,fileProt): | |
def _checker(dexA,dexB): | |
issue=set(dexA.keys()).difference(set(dexB.keys())) | |
print(issue) | |
print(dexA[list(issue)[0]]) | |
genedex={str(x.seq.translate(to_stop=True))[1:]: x for x in SeqIO.parse(fileDNA,"fasta")} | |
protdex={str(x.seq)[1:]: x for x in SeqIO.parse(fileProt,"fasta")} | |
_checker(genedex,protdex) | |
_checker(protdex,genedex) | |
def prot2DNA_degenerated(infile,outfile): | |
raise Exception("BLASTING WITH DEGENERATE SEQUENCE FAILS") | |
#This does not seem to work on the match side. | |
codontable = { | |
'A': 'GCT', | |
'C': 'TGY', | |
'D': 'GAY', | |
'E': 'GAR', | |
'F': 'TTY', | |
'G': 'GGN', | |
'I': 'ATH', | |
'H': 'CAY', | |
'K': 'AAR', | |
'L': 'YTN', | |
'M': 'ATG', | |
'N': 'AAY', | |
'P': 'CCN', | |
'Q': 'CAR', | |
'R': 'MGN', | |
'S': 'WSN', | |
'T': 'ACN', | |
'V': 'GTN', | |
'W': 'TGG', | |
'Y': 'TAY', | |
'*': 'TAR', | |
} | |
#make a degenerate set of sequences | |
genetxt="" | |
geneball=[] | |
for gene in SeqIO.parse(infile, "fasta"): | |
gene.seq=Seq("".join([codontable[residue] for residue in gene.seq]),IUPAC.ambiguous_dna) | |
print(gene.seq) | |
#genetxt+=gene.format("fasta")+"\n" | |
print("Blasting... if it does not change after a minute the NCBI servers must be overloaded") | |
result_handle = NCBIWWW.qblast("blastn", "nt", gene.format("fasta")) | |
rec = NCBIXML.read(result_handle) | |
align=rec.alignments[0] | |
hsp=align.hsps[0] | |
print('sequence:', align.title) | |
print('length:', align.length) | |
print(hsp.query_start,hsp.query_end,align.length) | |
if hsp.query_start == 1 and hsp.query_end == align.length: | |
native=SeqRecord(Seq(hsp.sbjct,IUPAC.unambiguous_dna),id=align.title+"("+hsp.sbjct_start+":"+hsp.sbjct_end+")") | |
geneball.append(native) | |
else: | |
print("Error") | |
SeqIO.write(geneball,outfile,"fasta") | |
if __name__ == "__main__": | |
#prot2DNA("protein.fa","DNA.fa",True) | |
prot2DNA("A1UGXFSZ014-Alignment.xml","DNA.fa",False) | |
doublecheck("DNA.fa","protein.fa") |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment