bryan-lunt/mapseqres.py

## mapseqres.py
'''
Created on Jun 29, 2010

@author: lunt@ctbp.ucsd.edu
'''

from Bio.Seq import Seq
import Bio.PDB as PDB
import Bio.PDB.Polypeptide as PP
from itertools import izip, count

def get_seqres_from_structfile(structfile):
	"""Expects an opened handle to the pdbXXXX.ent file to be read
	Returns a dict of ChainID -> Bio.Seq.Seq objects representing the seqres entries of the pdb file.
	"""
	lines = list()
	#get all SEQRES entries
	for line in structfile:
		if line.startswith("SEQRES"):
			lines.append(line)

	threeDict = dict()
	for line in lines:
		chainID = line[11]
		ress = line[19:]
		resList = threeDict.setdefault(chainID,list())
		resList.extend(ress.split())

	#TODO Infer for each sequences what its alphabet should be.
	retdict = dict()
	for key,value in threeDict.iteritems():
		try:
			mySeq = Seq("".join([PP.to_one_letter_code[i] for i in value]))
			retdict[key] = mySeq
		except:
			pass
	return retdict

def get_chainSeqs_from_structure(structure):
	"""Expects a Bio.PDB.Structure
	Returns a dictionary mapping chain IDs -> lists of polypeptides for that chain
	"""
	myBuilder = PP.PPBuilder()
	retval = dict([(aChain.id,myBuilder.build_peptides(aChain)) for aChain in structure.get_chains()])
	return retval

def mapSeqResToATOM(seqres,polypeplist):
	"""Returns a list of  (singleLetter, <residue> or None) tuples for every residue in the seqres, mapping seqres position to a residue in the model (via provided polypeptide list)
	"""
	#we could guard against overlap here by specifying the subsection to find in, but is that necessary?
	offsetList = [(seqres.find(polypep.get_sequence().tostring()),polypep) for polypep in polypeplist]
	retlist = [(i,None) for i in seqres]

	#This is much easier to read and cleaner than the old version
	#BUT, it assigns areas of memory twice, not the most super-efficient code, but who cares?
	for startPos, polypep in offsetList:
		end=startPos+len(polypep)
		retlist[startPos:end] = zip(seqres[startPos:end],polypep)
	return retlist

def mapAll(structfile,structure):
	"""Returns a map of chainID -> list of (singleLetter, <residue> or None) tuples for every residue in the seqres
	"""
	seqRes = get_seqres_from_structfile(structfile)
	chainSeqs = get_chainSeqs_from_structure(structure)

	retDict = dict()
	for key in seqRes.keys():
		if not key in chainSeqs.keys():
			continue
		retDict[key] = mapSeqResToATOM(seqRes[key],chainSeqs[key])
	return retDict


if __name__ == "__main__":
	#do some tests
	plist = PDB.PDBList()
	modelfile = plist.retrieve_pdb_file("1CO0",pdir="/tmp/foobar/")
	aparser = PDB.PDBParser()
	structure = aparser.get_structure("X", modelfile)
	foomaps = mapAll(open(modelfile),structure)
	'''
	Created on Jun 29, 2010

	@author: lunt@ctbp.ucsd.edu
	'''

	from Bio.Seq import Seq
	import Bio.PDB as PDB
	import Bio.PDB.Polypeptide as PP
	from itertools import izip, count

	def get_seqres_from_structfile(structfile):
	"""Expects an opened handle to the pdbXXXX.ent file to be read
	Returns a dict of ChainID -> Bio.Seq.Seq objects representing the seqres entries of the pdb file.
	"""
	lines = list()
	#get all SEQRES entries
	for line in structfile:
	if line.startswith("SEQRES"):
	lines.append(line)

	threeDict = dict()
	for line in lines:
	chainID = line[11]
	ress = line[19:]
	resList = threeDict.setdefault(chainID,list())
	resList.extend(ress.split())

	#TODO Infer for each sequences what its alphabet should be.
	retdict = dict()
	for key,value in threeDict.iteritems():
	try:
	mySeq = Seq("".join([PP.to_one_letter_code[i] for i in value]))
	retdict[key] = mySeq
	except:
	pass
	return retdict

	def get_chainSeqs_from_structure(structure):
	"""Expects a Bio.PDB.Structure
	Returns a dictionary mapping chain IDs -> lists of polypeptides for that chain
	"""
	myBuilder = PP.PPBuilder()
	retval = dict([(aChain.id,myBuilder.build_peptides(aChain)) for aChain in structure.get_chains()])
	return retval

	def mapSeqResToATOM(seqres,polypeplist):
	"""Returns a list of (singleLetter, <residue> or None) tuples for every residue in the seqres, mapping seqres position to a residue in the model (via provided polypeptide list)
	"""
	#we could guard against overlap here by specifying the subsection to find in, but is that necessary?
	offsetList = [(seqres.find(polypep.get_sequence().tostring()),polypep) for polypep in polypeplist]
	retlist = [(i,None) for i in seqres]

	#This is much easier to read and cleaner than the old version
	#BUT, it assigns areas of memory twice, not the most super-efficient code, but who cares?
	for startPos, polypep in offsetList:
	end=startPos+len(polypep)
	retlist[startPos:end] = zip(seqres[startPos:end],polypep)
	return retlist

	def mapAll(structfile,structure):
	"""Returns a map of chainID -> list of (singleLetter, <residue> or None) tuples for every residue in the seqres
	"""
	seqRes = get_seqres_from_structfile(structfile)
	chainSeqs = get_chainSeqs_from_structure(structure)

	retDict = dict()
	for key in seqRes.keys():
	if not key in chainSeqs.keys():
	continue
	retDict[key] = mapSeqResToATOM(seqRes[key],chainSeqs[key])
	return retDict


	if __name__ == "__main__":
	#do some tests
	plist = PDB.PDBList()
	modelfile = plist.retrieve_pdb_file("1CO0",pdir="/tmp/foobar/")
	aparser = PDB.PDBParser()
	structure = aparser.get_structure("X", modelfile)
	foomaps = mapAll(open(modelfile),structure)