Skip to content

Instantly share code, notes, and snippets.

@edraizen
Created September 2, 2022 21:03
Show Gist options
  • Save edraizen/14733e65d6142126dd21c816e44bd50f to your computer and use it in GitHub Desktop.
Save edraizen/14733e65d6142126dd21c816e44bd50f to your computer and use it in GitHub Desktop.
Calculated PDB contacts and outputs a similar form to protein contact atlas
from pathlib import Path
import click
import numpy as np
import pandas as pd
from biotite.structure.io.pdb import PDBFile
from sklearn.metrics import pairwise_distances
@click.command()
@click.argument('pdb_file', required=1, type=click.Path(exists=True))
@click.option('--chain', default=None, help='Only use specific chain in PDB file else use all')
@click.option('--out_file', default=None, help='File to output contact else use stdout')
@click.option('--max_cutoff', default=5, type=float, help='Distance cutoff to use')
def calculate_contacts(pdb_file, chain=None, out_file=None, max_cutoff=5):
pdb_file = Path(pdb_file)
pdb_name = pdb_file.stem
if out_file is None:
out_file = pdb_file.with_suffix(".pdb.contacts")
atom_array = PDBFile.read(str(pdb_file)).get_structure(model=1, altloc='first', extra_fields=['atom_id', 'b_factor', 'occupancy'])
atom_array = atom_array[atom_array.element!="H"]
if chain is not None:
atom_array = atom_array[atom_array.chain_id==chain]
dist = pairwise_distances(atom_array.coord)
dist[np.tril_indices(len(dist), k=0, m=None)] = np.nan
dist2 = pd.DataFrame(
dist,
index=pd.MultiIndex.from_tuples(
zip(atom_array.atom_id, atom_array.chain_id, atom_array.res_name, atom_array.res_id, atom_array.ins_code, atom_array.atom_name),
name=["atom2_index", "atom2_chain_id", "atom2_res_name", "atom2_res_id", "atom2_ins_code", "atom2_atom_name"]),
columns=pd.MultiIndex.from_tuples(
zip(atom_array.atom_id, atom_array.chain_id, atom_array.res_name, atom_array.res_id, atom_array.ins_code, atom_array.atom_name),
name=["atom1_index", "atom1_chain_id", "atom1_res_name", "atom1_res_id", "atom1_ins_code", "atom1_atom_name"]),
)
dist3 = dist2.reset_index().melt(
id_vars=["atom2_index", "atom2_chain_id", "atom2_res_name", "atom2_res_id", "atom2_ins_code", "atom2_atom_name"],
value_name="distance").dropna()
dist3 = dist3[["atom1_index", "atom1_chain_id", "atom1_res_name", "atom1_res_id", "atom1_ins_code", "atom1_atom_name",
"atom2_index", "atom2_chain_id", "atom2_res_name", "atom2_res_id", "atom2_ins_code", "atom2_atom_name", "distance"]]
dist3 = dist3.sort_values(["atom1_chain_id", "atom1_res_id", "atom1_ins_code", "atom2_chain_id", "atom2_res_id", "atom2_ins_code"])
close = dist3[
(dist3.distance<max_cutoff)&
(dist3.distance>0)&
(dist3.atom1_res_id.astype(str)+dist3.atom1_ins_code!=dist3.atom2_res_id.astype(str)+dist3.atom2_ins_code)]
with out_file.open("w") as f:
print("PDB", "Chain1", "Chain2", "Res1", "ResNum1", "Res2", "ResNum2", "Number of atomic contacts", "Atoms", "Chain Types", "Distance", sep="\t", file=f)
for res1, group in close.groupby(["atom1_chain_id", "atom1_res_id", "atom1_ins_code", "atom2_chain_id", "atom2_res_id", "atom2_ins_code"]):
for row in group.itertuples():
atom1_type = "M" if row.atom1_atom_name in ["C", "N", "O", "CA"] else "S"
atom2_type = "M" if row.atom2_atom_name in ["C", "N", "O", "CA"] else "S"
print(
pdb_name,
row.atom1_chain_id,
row.atom2_chain_id,
row.atom1_res_name,
f"{row.atom1_res_id}{row.atom1_ins_code}",
row.atom2_res_name,
f"{row.atom2_res_id}{row.atom2_ins_code}",
len(group),
row.atom1_atom_name+"-"+row.atom2_atom_name,
f"{atom1_type}-{atom2_type}",
f"{row.distance:0.4f}", sep="\t", file=f)
if __name__ == "__main__":
calculate_contacts()
click
numpy
pandas
biotite
sklearn
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment