Skip to content

Instantly share code, notes, and snippets.

@bryan-lunt
Created June 30, 2010 18:03
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save bryan-lunt/459005 to your computer and use it in GitHub Desktop.
Save bryan-lunt/459005 to your computer and use it in GitHub Desktop.
'''
Created on Jun 29, 2010
@author: lunt@ctbp.ucsd.edu
'''
from Bio.Seq import Seq
import Bio.PDB as PDB
import Bio.PDB.Polypeptide as PP
from itertools import izip, count
def get_seqres_from_structfile(structfile):
"""Expects an opened handle to the pdbXXXX.ent file to be read
Returns a dict of ChainID -> Bio.Seq.Seq objects representing the seqres entries of the pdb file.
"""
lines = list()
#get all SEQRES entries
for line in structfile:
if line.startswith("SEQRES"):
lines.append(line)
threeDict = dict()
for line in lines:
chainID = line[11]
ress = line[19:]
resList = threeDict.setdefault(chainID,list())
resList.extend(ress.split())
#TODO Infer for each sequences what its alphabet should be.
retdict = dict()
for key,value in threeDict.iteritems():
try:
mySeq = Seq("".join([PP.to_one_letter_code[i] for i in value]))
retdict[key] = mySeq
except:
pass
return retdict
def get_chainSeqs_from_structure(structure):
"""Expects a Bio.PDB.Structure
Returns a dictionary mapping chain IDs -> lists of polypeptides for that chain
"""
myBuilder = PP.PPBuilder()
retval = dict([(aChain.id,myBuilder.build_peptides(aChain)) for aChain in structure.get_chains()])
return retval
def mapSeqResToATOM(seqres,polypeplist):
"""Returns a list of (singleLetter, <residue> or None) tuples for every residue in the seqres, mapping seqres position to a residue in the model (via provided polypeptide list)
"""
#we could guard against overlap here by specifying the subsection to find in, but is that necessary?
offsetList = [(seqres.find(polypep.get_sequence().tostring()),polypep) for polypep in polypeplist]
retlist = [(i,None) for i in seqres]
#This is much easier to read and cleaner than the old version
#BUT, it assigns areas of memory twice, not the most super-efficient code, but who cares?
for startPos, polypep in offsetList:
end=startPos+len(polypep)
retlist[startPos:end] = zip(seqres[startPos:end],polypep)
return retlist
def mapAll(structfile,structure):
"""Returns a map of chainID -> list of (singleLetter, <residue> or None) tuples for every residue in the seqres
"""
seqRes = get_seqres_from_structfile(structfile)
chainSeqs = get_chainSeqs_from_structure(structure)
retDict = dict()
for key in seqRes.keys():
if not key in chainSeqs.keys():
continue
retDict[key] = mapSeqResToATOM(seqRes[key],chainSeqs[key])
return retDict
if __name__ == "__main__":
#do some tests
plist = PDB.PDBList()
modelfile = plist.retrieve_pdb_file("1CO0",pdir="/tmp/foobar/")
aparser = PDB.PDBParser()
structure = aparser.get_structure("X", modelfile)
foomaps = mapAll(open(modelfile),structure)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment