-
-
Save VladVin/4121720ff984693f9b8c34c29e47f652 to your computer and use it in GitHub Desktop.
ConPLex evaluation for protein-ligand and ligand-ligand matching
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
import pandas as pd | |
from sklearn.metrics import roc_auc_score | |
from tqdm.notebook import tqdm | |
from rdkit.Chem.rdmolfiles import MolFromMol2File, MolToSmiles | |
from metrics import ef_score | |
def read_litpcba_smiles(file_path): | |
smiles = pd.read_csv(file_path, header=None, names=['smiles', 'code'], sep=' ') | |
return smiles['smiles'].tolist() | |
def read_dude_smiles(file_path): | |
smiles = pd.read_csv(file_path, header=None, names=['smiles', 'code', 'chembl_code'], sep=' ') | |
return smiles['smiles'].tolist() | |
def read_actives_decoys(data_source, target_dir): | |
if data_source == "litpcba": | |
actives_fpath = target_dir / "actives.smi" | |
decoys_fpath = target_dir / "inactives.smi" | |
read_smiles_fn = read_litpcba_smiles | |
elif data_source == "dude": | |
actives_fpath = target_dir / "actives_final.ism" | |
decoys_fpath = target_dir / "decoys_final.ism" | |
read_smiles_fn = read_dude_smiles | |
actives_smiles = read_smiles_fn(actives_fpath) | |
decoys_smiles = read_smiles_fn(decoys_fpath) | |
return actives_smiles, decoys_smiles | |
def read_ligands(data_source, target_dir): | |
if data_source == "litpcba": | |
ligands_paths = list(target_dir.glob("*_ligand.mol2")) | |
elif data_source == "dude": | |
ligands_paths = [target_dir / "crystal_ligand.mol2"] | |
ligands = [MolFromMol2File(str(p)) for p in ligands_paths] | |
total_ligands = len(ligands) | |
ligands = [lig for lig in ligands if lig is not None] | |
print(f"Loaded {len(ligands)} out of {total_ligands} ligands") | |
return ligands | |
def calc_euclidean_distance(x, y): | |
dist = (x ** 2).sum(axis=1)[:, None] + (y ** 2).sum(axis=1)[None, :] - 2 * x @ y.T | |
return -dist | |
def calc_cosine_distance(x, y): | |
dist = x @ y.T / (np.linalg.norm(x, axis=1)[:, None] * np.linalg.norm(y, axis=1)[None, :] + 1e-6) | |
dist = (dist + 1.) / 2. | |
return dist | |
def run_benchmark(data_source, data_dir, output_dir, extract_fingerprints_fn, similarity_fn, start_idx=None): | |
benchmark = { | |
'target': [], | |
'auroc_min': [], | |
'auroc_max': [], | |
'auroc_mean': [], | |
'auroc_std': [], | |
'auroc_fused': [], | |
'ef1_min': [], | |
'ef1_max': [], | |
'ef1_mean': [], | |
'ef1_std': [], | |
'ef1_fused': [] | |
} | |
target_dirs = list(sorted(data_dir.iterdir())) | |
for t, target_dir in enumerate(target_dirs): | |
print(f'{target_dir.stem} ({t+1}/{len(target_dirs)})') | |
target_output_dir = output_dir / target_dir.stem | |
if start_idx is not None and t < start_idx: | |
if not target_output_dir.exists(): | |
print(f'[{target_dir.stem}] No output dir found') | |
continue | |
ligands_fps = np.load(target_output_dir / 'ligands_feats.npy') | |
actives_fps = np.load(target_output_dir / 'actives_feats.npy') | |
decoys_fps = np.load(target_output_dir / 'decoys_feats.npy') | |
else: | |
ligands = read_ligands(data_source, target_dir) | |
if len(ligands) == 0: | |
print(f'[{target_dir.stem}] No ligands found or they may be corrupted') | |
continue | |
actives_smiles, decoys_smiles = read_actives_decoys(data_source, target_dir) | |
actives_fps = extract_fingerprints_fn(actives_smiles) | |
decoys_fps = extract_fingerprints_fn(decoys_smiles) | |
target_output_dir.mkdir(exist_ok=True, parents=True) | |
np.save(target_output_dir / 'actives_feats.npy', actives_fps) | |
np.save(target_output_dir / 'decoys_feats.npy', decoys_fps) | |
ligands_smiles = [MolToSmiles(ligand) for ligand in ligands] | |
ligands_fps = extract_fingerprints_fn(ligands_smiles) | |
ligands_fps = np.stack(ligands_fps) | |
np.save(target_output_dir / 'ligands_feats.npy', ligands_fps) | |
labels = np.concatenate([np.ones(len(actives_fps), dtype=bool), np.zeros(len(decoys_fps), dtype=bool)]) | |
actives_fps = actives_fps.astype(np.float32) | |
decoys_fps = decoys_fps.astype(np.float32) | |
ligands_fps = ligands_fps.astype(np.float32) | |
lig2act = similarity_fn(ligands_fps, actives_fps) | |
lig2dec = similarity_fn(ligands_fps, decoys_fps) | |
roc_aucs, ef1s = [], [] | |
for i in tqdm(range(lig2act.shape[0])): | |
roc_auc = roc_auc_score(labels, np.concatenate([lig2act[i], lig2dec[i]])) | |
roc_aucs.append(roc_auc) | |
ef1 = ef_score(labels, np.concatenate([lig2act[i], lig2dec[i]]), 0.01) | |
ef1s.append(ef1) | |
roc_auc_fused = roc_auc_score(labels, np.concatenate([lig2act.max(axis=0), lig2dec.max(axis=0)])) | |
ef1_fused = ef_score(labels, np.concatenate([lig2act.max(axis=0), lig2dec.max(axis=0)]), 0.01) | |
benchmark['target'].append(target_dir.stem) | |
benchmark['auroc_min'].append(np.min(roc_aucs)) | |
benchmark['auroc_max'].append(np.max(roc_aucs)) | |
benchmark['auroc_mean'].append(np.mean(roc_aucs)) | |
benchmark['auroc_std'].append(np.std(roc_aucs)) | |
benchmark['auroc_fused'].append(roc_auc_fused) | |
benchmark['ef1_min'].append(np.min(ef1s)) | |
benchmark['ef1_max'].append(np.max(ef1s)) | |
benchmark['ef1_mean'].append(np.mean(ef1s)) | |
benchmark['ef1_std'].append(np.std(ef1s)) | |
benchmark['ef1_fused'].append(ef1_fused) | |
print(f'[{target_dir.stem}] ROC AUC min: {np.min(roc_aucs):.3f}, max: {np.max(roc_aucs):.3f}, ' | |
f'mean: {np.mean(roc_aucs):.3f}, std: {np.std(roc_aucs):.3f} fused: {roc_auc_fused:.3f}') | |
print(f'[{target_dir.stem}] EF1 min: {np.min(ef1s):.3f}, max: {np.max(ef1s):.3f}, ' | |
f'mean: {np.mean(ef1s):.3f}, std: {np.std(ef1s):.3f} fused: {ef1_fused:.3f}') | |
benchmark = pd.DataFrame(benchmark) | |
return benchmark |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment