Skip to content

Instantly share code, notes, and snippets.

@fomightez
Forked from JoaoRodrigues/bio_align.py
Created June 6, 2016 13:56
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save fomightez/2601c0f5a13b85cd21b9377169c79836 to your computer and use it in GitHub Desktop.
Save fomightez/2601c0f5a13b85cd21b9377169c79836 to your computer and use it in GitHub Desktop.
Sequence-based structure alignment of protein structures with Biopython
#!/usr/bin/env python
"""
Sequence-based structural alignment of two proteins.
"""
from __future__ import print_function, division
import argparse
import os
from Bio.PDB import PDBParser, FastMMCIFParser, Superimposer, PDBIO
from Bio.PDB.Polypeptide import is_aa
from Bio import pairwise2
from Bio.SubsMat import MatrixInfo as matlist
from Bio.Data.SCOPData import protein_letters_3to1 as aa3to1
def align_sequences(structA, structB, **kwargs):
"""
Performs a global pairwise alignment between two sequences
using the BLOSUM62 matrix and the Needleman-Wunsch algorithm
as implemented in Biopython. Returns the alignment, the sequence
identity and the residue mapping between both original sequences.
"""
def _calculate_identity(sequenceA, sequenceB):
"""
Returns the percentage of identical characters between two sequences.
Assumes the sequences are aligned.
"""
sa, sb, sl = sequenceA, sequenceB, len(sequenceA)
matches = [sa[i] == sb[i] for i in xrange(sl)]
seq_id = (100 * sum(matches)) / sl
gapless_sl = sum([1 for i in xrange(sl) if (sa[i] != '-' and sb[i] != '-')])
gap_id = (100 * sum(matches)) / gapless_sl
return (seq_id, gap_id)
def _get_pdb_sequence(structure):
"""
Retrieves the AA sequence from a PDB structure.
"""
_aainfo = lambda r: (r.id[1], aa3to1.get(r.resname, 'X'))
seq = [_aainfo(r) for r in structure.get_residues() if is_aa(r)]
return seq
matrix = kwargs.get('matrix', matlist.blosum62)
gap_open = kwargs.get('gap_open', -10.0)
gap_extend = kwargs.get('gap_extend', -0.5)
trim_ends = kwargs.get('trim_ends', True)
resseq_A = _get_pdb_sequence(structA)
resseq_B = _get_pdb_sequence(structB)
sequence_A = ''.join([i[1] for i in resseq_A])
sequence_B = ''.join([i[1] for i in resseq_B])
alns = pairwise2.align.globalds(sequence_A, sequence_B,
matrix, gap_open, gap_extend,
penalize_end_gaps=(False, False) )
best_aln = alns[0]
aligned_A, aligned_B, score, begin, end = best_aln
# Equivalent residue numbering
# Relative to reference
mapping = {}
aa_i_A, aa_i_B = 0, 0
for aln_i, (aa_aln_A, aa_aln_B) in enumerate(zip(aligned_A, aligned_B)):
if aa_aln_A == '-':
if aa_aln_B != '-':
aa_i_B += 1
elif aa_aln_B == '-':
if aa_aln_A != '-':
aa_i_A += 1
else:
assert resseq_A[aa_i_A][1] == aa_aln_A
assert resseq_B[aa_i_B][1] == aa_aln_B
mapping[resseq_A[aa_i_A][0]] = resseq_B[aa_i_B][0]
aa_i_A += 1
aa_i_B += 1
# Gapless alignment
# def _trimmer(sequence):
# """Returns indices of first and last ungapped position"""
# leading = [i for (i, aa) in enumerate(sequence) if aa != '-'][0]
# trailing = [i for (i, aa) in enumerate(sequence[::-1]) if aa != '-'][0]
# trailing = len(sequence) - trailing
# return (leading, trailing)
# lead_A, trail_A = _trimmer(aligned_A)
# lead_B, trail_B = _trimmer(aligned_B)
# lead = max(lead_A, lead_B)
# trail = min(trail_A, trail_B)
# trim_aln_A = aligned_A[lead:trail]
# trim_aln_B = aligned_B[lead:trail]
# mismatch = ''.join(['+' if a!=b else ' ' for (a,b) in zip(trim_aln_A, trim_aln_B)])
# Calculate (gapless) sequence identity
seq_id, g_seq_id = _calculate_identity(aligned_A, aligned_B)
return ((aligned_A, aligned_B), seq_id, g_seq_id, mapping)
# return ((trim_aln_A, trim_aln_B, mismatch), seq_id, g_seq_id, mapping)
def parse_structure(spath):
"""Parses a PDB/cif structure"""
if not os.path.isfile(spath):
return IOError('File not found: {0}'.format(spath))
if spath.endswith(('pdb', 'ent')):
parser = PDBParser()
elif spath.endswith('cif'):
parser = FastMMCIFParser()
else:
raise Exception('Format not supported ({0}). Must be .pdb/.ent or .cif'.format(spath))
sname = os.path.basename(spath.split('.')[0])
return parser.get_structure(sname, spath)
##
if __name__ == '__main__':
ap = argparse.ArgumentParser(description=__doc__)
ap.add_argument('refe', help='Reference Structure')
ap.add_argument('--r_chain', default='A', help='Reference Structure Chain')
ap.add_argument('mobi', help='Mobile Structure')
ap.add_argument('--m_chain', default='A', help='Reference Structure Chain')
cline = ap.parse_args()
# Parse structures & take only the necessary chain
s_reference = parse_structure(cline.refe)
try:
reference = s_reference[0][cline.r_chain]
except KeyError:
raise Exception('Chain {0} not found in reference structure'.format(cline.r_chain))
s_mobile = parse_structure(cline.mobi)
try:
mobile = s_mobile[0][cline.m_chain]
except KeyError:
raise Exception('Chain {0} not found in mobile structure'.format(cline.m_chain))
# Align sequences to get mapping between residues
aln, seq_id, gapless_id, res_map = align_sequences(reference, mobile)
refe_ca_list, mobi_ca_list = [], []
for refe_res in res_map:
refe_ca_list.append(reference[refe_res]['CA'])
mobi_ca_list.append(mobile[res_map[refe_res]]['CA'])
# Superimpose matching residues
si = Superimposer()
si.set_atoms(refe_ca_list, mobi_ca_list)
# Transform & Write Mobile
si.apply(mobile.get_atoms())
io = PDBIO()
io.set_structure(mobile)
m_transformed_name = '{0}_transformed.pdb'.format(s_mobile.id)
io.save(m_transformed_name)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment