edraizen/calculate_contacts.py

## calculate_contacts.py
from pathlib import Path

import click
import numpy as np
import pandas as pd
from biotite.structure.io.pdb import PDBFile
from sklearn.metrics import pairwise_distances

@click.command()
@click.argument('pdb_file', required=1, type=click.Path(exists=True))
@click.option('--chain', default=None, help='Only use specific chain in PDB file else use all')
@click.option('--out_file', default=None, help='File to output contact else use stdout')
@click.option('--max_cutoff', default=5, type=float, help='Distance cutoff to use')
def calculate_contacts(pdb_file, chain=None, out_file=None, max_cutoff=5):
    pdb_file = Path(pdb_file)
    pdb_name = pdb_file.stem

    if out_file is None:
        out_file = pdb_file.with_suffix(".pdb.contacts")

    atom_array = PDBFile.read(str(pdb_file)).get_structure(model=1, altloc='first', extra_fields=['atom_id', 'b_factor', 'occupancy'])
    atom_array = atom_array[atom_array.element!="H"]

    if chain is not None:
        atom_array = atom_array[atom_array.chain_id==chain]

    dist = pairwise_distances(atom_array.coord)
    dist[np.tril_indices(len(dist), k=0, m=None)] = np.nan


    dist2 = pd.DataFrame(
        dist,
        index=pd.MultiIndex.from_tuples(
            zip(atom_array.atom_id, atom_array.chain_id, atom_array.res_name, atom_array.res_id, atom_array.ins_code, atom_array.atom_name),
            name=["atom2_index", "atom2_chain_id", "atom2_res_name", "atom2_res_id", "atom2_ins_code", "atom2_atom_name"]),
        columns=pd.MultiIndex.from_tuples(
            zip(atom_array.atom_id, atom_array.chain_id, atom_array.res_name, atom_array.res_id, atom_array.ins_code, atom_array.atom_name),
            name=["atom1_index", "atom1_chain_id", "atom1_res_name", "atom1_res_id", "atom1_ins_code", "atom1_atom_name"]),
    )


    dist3 = dist2.reset_index().melt(
        id_vars=["atom2_index", "atom2_chain_id", "atom2_res_name", "atom2_res_id", "atom2_ins_code", "atom2_atom_name"],
        value_name="distance").dropna()
    dist3 = dist3[["atom1_index", "atom1_chain_id", "atom1_res_name", "atom1_res_id", "atom1_ins_code", "atom1_atom_name",
                "atom2_index", "atom2_chain_id", "atom2_res_name", "atom2_res_id", "atom2_ins_code", "atom2_atom_name", "distance"]]
    dist3 = dist3.sort_values(["atom1_chain_id", "atom1_res_id", "atom1_ins_code", "atom2_chain_id", "atom2_res_id", "atom2_ins_code"])

    close = dist3[
        (dist3.distance<max_cutoff)&
        (dist3.distance>0)&
        (dist3.atom1_res_id.astype(str)+dist3.atom1_ins_code!=dist3.atom2_res_id.astype(str)+dist3.atom2_ins_code)]

    with out_file.open("w") as f:
        print("PDB", "Chain1", "Chain2", "Res1", "ResNum1", "Res2", "ResNum2", "Number of atomic contacts",	"Atoms", "Chain Types", "Distance", sep="\t", file=f)
        for res1, group in close.groupby(["atom1_chain_id", "atom1_res_id", "atom1_ins_code", "atom2_chain_id", "atom2_res_id", "atom2_ins_code"]):
            for row in group.itertuples():
                atom1_type = "M" if row.atom1_atom_name in ["C", "N", "O", "CA"] else "S"
                atom2_type = "M" if row.atom2_atom_name in ["C", "N", "O", "CA"] else "S"
                print(
                    pdb_name,
                    row.atom1_chain_id,
                    row.atom2_chain_id,
                    row.atom1_res_name,
                    f"{row.atom1_res_id}{row.atom1_ins_code}",
                    row.atom2_res_name,
                    f"{row.atom2_res_id}{row.atom2_ins_code}",
                    len(group),
                    row.atom1_atom_name+"-"+row.atom2_atom_name,
                    f"{atom1_type}-{atom2_type}",
                    f"{row.distance:0.4f}", sep="\t", file=f)

if __name__ == "__main__":
    calculate_contacts()

## requirements.txt
click
numpy
pandas
biotite
sklearn
	from pathlib import Path

	import click
	import numpy as np
	import pandas as pd
	from biotite.structure.io.pdb import PDBFile
	from sklearn.metrics import pairwise_distances

	@click.command()
	@click.argument('pdb_file', required=1, type=click.Path(exists=True))
	@click.option('--chain', default=None, help='Only use specific chain in PDB file else use all')
	@click.option('--out_file', default=None, help='File to output contact else use stdout')
	@click.option('--max_cutoff', default=5, type=float, help='Distance cutoff to use')
	def calculate_contacts(pdb_file, chain=None, out_file=None, max_cutoff=5):
	pdb_file = Path(pdb_file)
	pdb_name = pdb_file.stem

	if out_file is None:
	out_file = pdb_file.with_suffix(".pdb.contacts")

	atom_array = PDBFile.read(str(pdb_file)).get_structure(model=1, altloc='first', extra_fields=['atom_id', 'b_factor', 'occupancy'])
	atom_array = atom_array[atom_array.element!="H"]

	if chain is not None:
	atom_array = atom_array[atom_array.chain_id==chain]

	dist = pairwise_distances(atom_array.coord)
	dist[np.tril_indices(len(dist), k=0, m=None)] = np.nan


	dist2 = pd.DataFrame(
	dist,
	index=pd.MultiIndex.from_tuples(
	zip(atom_array.atom_id, atom_array.chain_id, atom_array.res_name, atom_array.res_id, atom_array.ins_code, atom_array.atom_name),
	name=["atom2_index", "atom2_chain_id", "atom2_res_name", "atom2_res_id", "atom2_ins_code", "atom2_atom_name"]),
	columns=pd.MultiIndex.from_tuples(
	zip(atom_array.atom_id, atom_array.chain_id, atom_array.res_name, atom_array.res_id, atom_array.ins_code, atom_array.atom_name),
	name=["atom1_index", "atom1_chain_id", "atom1_res_name", "atom1_res_id", "atom1_ins_code", "atom1_atom_name"]),
	)


	dist3 = dist2.reset_index().melt(
	id_vars=["atom2_index", "atom2_chain_id", "atom2_res_name", "atom2_res_id", "atom2_ins_code", "atom2_atom_name"],
	value_name="distance").dropna()
	dist3 = dist3[["atom1_index", "atom1_chain_id", "atom1_res_name", "atom1_res_id", "atom1_ins_code", "atom1_atom_name",
	"atom2_index", "atom2_chain_id", "atom2_res_name", "atom2_res_id", "atom2_ins_code", "atom2_atom_name", "distance"]]
	dist3 = dist3.sort_values(["atom1_chain_id", "atom1_res_id", "atom1_ins_code", "atom2_chain_id", "atom2_res_id", "atom2_ins_code"])

	close = dist3[
	(dist3.distance<max_cutoff)&
	(dist3.distance>0)&
	(dist3.atom1_res_id.astype(str)+dist3.atom1_ins_code!=dist3.atom2_res_id.astype(str)+dist3.atom2_ins_code)]

	with out_file.open("w") as f:
	print("PDB", "Chain1", "Chain2", "Res1", "ResNum1", "Res2", "ResNum2", "Number of atomic contacts", "Atoms", "Chain Types", "Distance", sep="\t", file=f)
	for res1, group in close.groupby(["atom1_chain_id", "atom1_res_id", "atom1_ins_code", "atom2_chain_id", "atom2_res_id", "atom2_ins_code"]):
	for row in group.itertuples():
	atom1_type = "M" if row.atom1_atom_name in ["C", "N", "O", "CA"] else "S"
	atom2_type = "M" if row.atom2_atom_name in ["C", "N", "O", "CA"] else "S"
	print(
	pdb_name,
	row.atom1_chain_id,
	row.atom2_chain_id,
	row.atom1_res_name,
	f"{row.atom1_res_id}{row.atom1_ins_code}",
	row.atom2_res_name,
	f"{row.atom2_res_id}{row.atom2_ins_code}",
	len(group),
	row.atom1_atom_name+"-"+row.atom2_atom_name,
	f"{atom1_type}-{atom2_type}",
	f"{row.distance:0.4f}", sep="\t", file=f)

	if __name__ == "__main__":
	calculate_contacts()