Skip to content

Instantly share code, notes, and snippets.

@VladVin
Last active July 19, 2023 17:26
Show Gist options
  • Save VladVin/4121720ff984693f9b8c34c29e47f652 to your computer and use it in GitHub Desktop.
Save VladVin/4121720ff984693f9b8c34c29e47f652 to your computer and use it in GitHub Desktop.
ConPLex evaluation for protein-ligand and ligand-ligand matching
Display the source blob
Display the rendered blob
Raw
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
from tqdm.notebook import tqdm
from rdkit.Chem.rdmolfiles import MolFromMol2File, MolToSmiles
from metrics import ef_score
def read_litpcba_smiles(file_path):
smiles = pd.read_csv(file_path, header=None, names=['smiles', 'code'], sep=' ')
return smiles['smiles'].tolist()
def read_dude_smiles(file_path):
smiles = pd.read_csv(file_path, header=None, names=['smiles', 'code', 'chembl_code'], sep=' ')
return smiles['smiles'].tolist()
def read_actives_decoys(data_source, target_dir):
if data_source == "litpcba":
actives_fpath = target_dir / "actives.smi"
decoys_fpath = target_dir / "inactives.smi"
read_smiles_fn = read_litpcba_smiles
elif data_source == "dude":
actives_fpath = target_dir / "actives_final.ism"
decoys_fpath = target_dir / "decoys_final.ism"
read_smiles_fn = read_dude_smiles
actives_smiles = read_smiles_fn(actives_fpath)
decoys_smiles = read_smiles_fn(decoys_fpath)
return actives_smiles, decoys_smiles
def read_ligands(data_source, target_dir):
if data_source == "litpcba":
ligands_paths = list(target_dir.glob("*_ligand.mol2"))
elif data_source == "dude":
ligands_paths = [target_dir / "crystal_ligand.mol2"]
ligands = [MolFromMol2File(str(p)) for p in ligands_paths]
total_ligands = len(ligands)
ligands = [lig for lig in ligands if lig is not None]
print(f"Loaded {len(ligands)} out of {total_ligands} ligands")
return ligands
def calc_euclidean_distance(x, y):
dist = (x ** 2).sum(axis=1)[:, None] + (y ** 2).sum(axis=1)[None, :] - 2 * x @ y.T
return -dist
def calc_cosine_distance(x, y):
dist = x @ y.T / (np.linalg.norm(x, axis=1)[:, None] * np.linalg.norm(y, axis=1)[None, :] + 1e-6)
dist = (dist + 1.) / 2.
return dist
def run_benchmark(data_source, data_dir, output_dir, extract_fingerprints_fn, similarity_fn, start_idx=None):
benchmark = {
'target': [],
'auroc_min': [],
'auroc_max': [],
'auroc_mean': [],
'auroc_std': [],
'auroc_fused': [],
'ef1_min': [],
'ef1_max': [],
'ef1_mean': [],
'ef1_std': [],
'ef1_fused': []
}
target_dirs = list(sorted(data_dir.iterdir()))
for t, target_dir in enumerate(target_dirs):
print(f'{target_dir.stem} ({t+1}/{len(target_dirs)})')
target_output_dir = output_dir / target_dir.stem
if start_idx is not None and t < start_idx:
if not target_output_dir.exists():
print(f'[{target_dir.stem}] No output dir found')
continue
ligands_fps = np.load(target_output_dir / 'ligands_feats.npy')
actives_fps = np.load(target_output_dir / 'actives_feats.npy')
decoys_fps = np.load(target_output_dir / 'decoys_feats.npy')
else:
ligands = read_ligands(data_source, target_dir)
if len(ligands) == 0:
print(f'[{target_dir.stem}] No ligands found or they may be corrupted')
continue
actives_smiles, decoys_smiles = read_actives_decoys(data_source, target_dir)
actives_fps = extract_fingerprints_fn(actives_smiles)
decoys_fps = extract_fingerprints_fn(decoys_smiles)
target_output_dir.mkdir(exist_ok=True, parents=True)
np.save(target_output_dir / 'actives_feats.npy', actives_fps)
np.save(target_output_dir / 'decoys_feats.npy', decoys_fps)
ligands_smiles = [MolToSmiles(ligand) for ligand in ligands]
ligands_fps = extract_fingerprints_fn(ligands_smiles)
ligands_fps = np.stack(ligands_fps)
np.save(target_output_dir / 'ligands_feats.npy', ligands_fps)
labels = np.concatenate([np.ones(len(actives_fps), dtype=bool), np.zeros(len(decoys_fps), dtype=bool)])
actives_fps = actives_fps.astype(np.float32)
decoys_fps = decoys_fps.astype(np.float32)
ligands_fps = ligands_fps.astype(np.float32)
lig2act = similarity_fn(ligands_fps, actives_fps)
lig2dec = similarity_fn(ligands_fps, decoys_fps)
roc_aucs, ef1s = [], []
for i in tqdm(range(lig2act.shape[0])):
roc_auc = roc_auc_score(labels, np.concatenate([lig2act[i], lig2dec[i]]))
roc_aucs.append(roc_auc)
ef1 = ef_score(labels, np.concatenate([lig2act[i], lig2dec[i]]), 0.01)
ef1s.append(ef1)
roc_auc_fused = roc_auc_score(labels, np.concatenate([lig2act.max(axis=0), lig2dec.max(axis=0)]))
ef1_fused = ef_score(labels, np.concatenate([lig2act.max(axis=0), lig2dec.max(axis=0)]), 0.01)
benchmark['target'].append(target_dir.stem)
benchmark['auroc_min'].append(np.min(roc_aucs))
benchmark['auroc_max'].append(np.max(roc_aucs))
benchmark['auroc_mean'].append(np.mean(roc_aucs))
benchmark['auroc_std'].append(np.std(roc_aucs))
benchmark['auroc_fused'].append(roc_auc_fused)
benchmark['ef1_min'].append(np.min(ef1s))
benchmark['ef1_max'].append(np.max(ef1s))
benchmark['ef1_mean'].append(np.mean(ef1s))
benchmark['ef1_std'].append(np.std(ef1s))
benchmark['ef1_fused'].append(ef1_fused)
print(f'[{target_dir.stem}] ROC AUC min: {np.min(roc_aucs):.3f}, max: {np.max(roc_aucs):.3f}, '
f'mean: {np.mean(roc_aucs):.3f}, std: {np.std(roc_aucs):.3f} fused: {roc_auc_fused:.3f}')
print(f'[{target_dir.stem}] EF1 min: {np.min(ef1s):.3f}, max: {np.max(ef1s):.3f}, '
f'mean: {np.mean(ef1s):.3f}, std: {np.std(ef1s):.3f} fused: {ef1_fused:.3f}')
benchmark = pd.DataFrame(benchmark)
return benchmark
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment