Skip to content

Instantly share code, notes, and snippets.

@matteoferla
Created January 30, 2016 19:03
Show Gist options
  • Save matteoferla/1429635b207ab3678b33 to your computer and use it in GitHub Desktop.
Save matteoferla/1429635b207ab3678b33 to your computer and use it in GitHub Desktop.
##THIS IS PUBLICALLY RELEASED TO DEMONSTRATE A POINT NOT AS A USABLE PIECE OF CODE, HENCE THE LACK OF ANNOTATION.
__doc__ = '''
I have peptide sequences, I need the DNA. plan:
Option A:
* find gi code
* look for gi code in the gbk file.
Option B:
* tblastn online
* download genome sequence
* extract sequence from nucleotide
Option C:
* degeneratively back translate
* blastn online
* get aligned reference
Option A won't work. C might be easy if degenerate sequence can be made
'''
from Bio.Blast import NCBIWWW, NCBIXML
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Alphabet import IUPAC
from Bio import Entrez
Entrez.email = "A.N.Other@example.com"
import re
#from https://en.wikipedia.org/wiki/Genetic_code
#if multiple version were present a super degerate was made...
def getDNA(id,start,end,strand):
gene = next(SeqIO.parse(Entrez.efetch(db="nucleotide", id=id, rettype="fasta", retmode="text"),"fasta"))
start =int(start)-1
end = int(end)
if strand[1]>0:
return gene.seq[start:end]
else:
return gene.seq[start:end].reverse_complement()
def prot2DNA(infile, outfile,fromWWW=True):
geneball=[]
if fromWWW:
print("blasting...")
fasta=open(infile,"r").read()
result_handle = NCBIWWW.qblast("tblastn", "nt", fasta)
else:
result_handle = open(infile,"r")
for rec in NCBIXML.parse(result_handle):
align=rec.alignments[0]
hsp=align.hsps[0]
rex=re.search("\|(.*?)\|",align.title)
if rex:
id=rex.group(1)
else:
id=align.title
native=SeqRecord(getDNA(id,int(hsp.sbjct_start),int(hsp.sbjct_end),hsp.frame),id=align.title+"("+str(hsp.sbjct_start)+":"+str(hsp.sbjct_end)+")")
a,b=len(native.seq.translate(to_stop=True)),hsp.query_end
if (a != b):
print("ERROR")
print("found ",a,"expect ",b)
print('sequence:', align.title)
print('length:', align.length)
print(hsp.query_start,hsp.query_end,align.length)
print("id... ",id)
print(native)
else:
geneball.append(native)
SeqIO.write(geneball,outfile,"fasta")
def doublecheck(fileDNA,fileProt):
def _checker(dexA,dexB):
issue=set(dexA.keys()).difference(set(dexB.keys()))
print(issue)
print(dexA[list(issue)[0]])
genedex={str(x.seq.translate(to_stop=True))[1:]: x for x in SeqIO.parse(fileDNA,"fasta")}
protdex={str(x.seq)[1:]: x for x in SeqIO.parse(fileProt,"fasta")}
_checker(genedex,protdex)
_checker(protdex,genedex)
def prot2DNA_degenerated(infile,outfile):
raise Exception("BLASTING WITH DEGENERATE SEQUENCE FAILS")
#This does not seem to work on the match side.
codontable = {
'A': 'GCT',
'C': 'TGY',
'D': 'GAY',
'E': 'GAR',
'F': 'TTY',
'G': 'GGN',
'I': 'ATH',
'H': 'CAY',
'K': 'AAR',
'L': 'YTN',
'M': 'ATG',
'N': 'AAY',
'P': 'CCN',
'Q': 'CAR',
'R': 'MGN',
'S': 'WSN',
'T': 'ACN',
'V': 'GTN',
'W': 'TGG',
'Y': 'TAY',
'*': 'TAR',
}
#make a degenerate set of sequences
genetxt=""
geneball=[]
for gene in SeqIO.parse(infile, "fasta"):
gene.seq=Seq("".join([codontable[residue] for residue in gene.seq]),IUPAC.ambiguous_dna)
print(gene.seq)
#genetxt+=gene.format("fasta")+"\n"
print("Blasting... if it does not change after a minute the NCBI servers must be overloaded")
result_handle = NCBIWWW.qblast("blastn", "nt", gene.format("fasta"))
rec = NCBIXML.read(result_handle)
align=rec.alignments[0]
hsp=align.hsps[0]
print('sequence:', align.title)
print('length:', align.length)
print(hsp.query_start,hsp.query_end,align.length)
if hsp.query_start == 1 and hsp.query_end == align.length:
native=SeqRecord(Seq(hsp.sbjct,IUPAC.unambiguous_dna),id=align.title+"("+hsp.sbjct_start+":"+hsp.sbjct_end+")")
geneball.append(native)
else:
print("Error")
SeqIO.write(geneball,outfile,"fasta")
if __name__ == "__main__":
#prot2DNA("protein.fa","DNA.fa",True)
prot2DNA("A1UGXFSZ014-Alignment.xml","DNA.fa",False)
doublecheck("DNA.fa","protein.fa")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment