Skip to content

Instantly share code, notes, and snippets.

@JoaoRodrigues
Last active January 6, 2023 22:10
Show Gist options
  • Save JoaoRodrigues/e3a4f2139d10888c679eb1657a4d7080 to your computer and use it in GitHub Desktop.
Save JoaoRodrigues/e3a4f2139d10888c679eb1657a4d7080 to your computer and use it in GitHub Desktop.
Sequence-based structure alignment of protein structures with Biopython
#!/usr/bin/env python
"""Sequence-based structural alignment of two proteins."""
import argparse
import pathlib
from Bio.PDB import FastMMCIFParser, MMCIFIO, PDBParser, PDBIO, Superimposer
from Bio.PDB.Polypeptide import is_aa
from Bio import pairwise2
from Bio.Align import substitution_matrices
from Bio.Data.SCOPData import protein_letters_3to1 as aa3to1
def align_sequences(structA, structB):
"""
Performs a global pairwise alignment between two sequences
using the BLOSUM62 matrix and the Needleman-Wunsch algorithm
as implemented in Biopython. Returns the alignment, the sequence
identity and the residue mapping between both original sequences.
"""
def _get_pdb_sequence(structure):
"""
Retrieves the AA sequence from a PDB structure.
"""
_aainfo = lambda r: (r.id[1], aa3to1.get(r.resname, "X"))
seq = [_aainfo(r) for r in structure.get_residues() if is_aa(r)]
return seq
resseq_A = _get_pdb_sequence(structA)
resseq_B = _get_pdb_sequence(structB)
sequence_A = "".join([i[1] for i in resseq_A])
sequence_B = "".join([i[1] for i in resseq_B])
alns = pairwise2.align.globalds(
sequence_A,
sequence_B,
substitution_matrices.load("BLOSUM62"),
one_alignment_only=True,
open=-10.0,
extend=-0.5,
penalize_end_gaps=(False, False),
)
best_aln = alns[0]
aligned_A, aligned_B, score, begin, end = best_aln
# Equivalent residue numbering
# Relative to reference
mapping = {}
aa_i_A, aa_i_B = 0, 0
for aln_i, (aa_aln_A, aa_aln_B) in enumerate(zip(aligned_A, aligned_B)):
if aa_aln_A == "-":
if aa_aln_B != "-":
aa_i_B += 1
elif aa_aln_B == "-":
if aa_aln_A != "-":
aa_i_A += 1
else:
assert resseq_A[aa_i_A][1] == aa_aln_A
assert resseq_B[aa_i_B][1] == aa_aln_B
mapping[resseq_A[aa_i_A][0]] = resseq_B[aa_i_B][0]
aa_i_A += 1
aa_i_B += 1
return mapping
def parse_structure(filepath):
"""Parse a PDB/cif structure."""
try:
filepath.resolve(strict=True)
except FileNotFoundError:
raise FileNotFoundError(f"File not found: {filepath}")
if filepath.suffix in {".pdb", ".ent"}:
parser = PDBParser()
elif filepath.suffix in {".cif", ".mmcif"}:
parser = FastMMCIFParser()
else:
raise ValueError(
f"Unsupported input structure format: {filepath.suffix}"
)
return parser.get_structure(filepath.name, str(filepath))
if __name__ == "__main__":
ap = argparse.ArgumentParser(description=__doc__)
ap.add_argument(
"refe",
type=pathlib.Path,
help="Reference Structure",
)
ap.add_argument(
"mobi",
type=pathlib.Path,
help="Mobile Structure",
)
ap.add_argument(
"--r_chain",
default="A",
help="Reference Structure Chain",
)
ap.add_argument(
"--m_chain",
default="A",
help="Mobile Structure Chain",
)
ap.add_argument(
"-o",
"--output",
type=pathlib.Path,
default=None,
help="Name of output aligned structure.",
)
args = ap.parse_args()
# Parse structures & take only the necessary chain
s_reference = parse_structure(args.refe)
try:
reference = s_reference[0][args.r_chain]
except KeyError:
raise Exception(f"Chain {args.r_chain} not found in reference.")
s_mobile = parse_structure(args.mobi)
try:
mobile = s_mobile[0][args.m_chain]
except KeyError:
raise Exception(f"Chain {args.m_chain} not found in mobile.")
# Align sequences to get mapping between residues
mapping = align_sequences(reference, mobile)
refe_ca_list, mobi_ca_list = [], []
for refe_res in mapping:
refe_ca_list.append(reference[refe_res]["CA"])
mobi_ca_list.append(mobile[mapping[refe_res]]["CA"])
# Superimpose matching residues
si = Superimposer()
si.set_atoms(refe_ca_list, mobi_ca_list)
si.apply(mobile.get_atoms())
print(f"RMSD between structures: {si.rms:4.2f}")
# Write aligned mobile
if args.output is None:
ext = args.mobi.suffix
args.output = args.mobi.with_suffix(f".aligned{ext}")
if args.output.suffix in {".cif", ".mmcif"}:
io = MMCIFIO()
elif args.output.suffix in {".pdb", ".ent"}:
io = PDBIO()
else:
print("Output format not recognized. Defaulting to mmcIF.")
io = MMCIFIO()
io.set_structure(mobile)
io.save(str(args.output))
@ajasja
Copy link

ajasja commented Jan 6, 2023

Thank you, this is great! I was just about to write this (but googling is faster:)
I think part of the code in main() could be extracted so that there is also an align(mobile, ref) function.
Then this file could be used a python module as well.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment