Skip to content

Instantly share code, notes, and snippets.

@n-yoshikawa
Last active October 19, 2018 07:21
Show Gist options
  • Save n-yoshikawa/6c8ca102dd2b711415952a74b2459c03 to your computer and use it in GitHub Desktop.
Save n-yoshikawa/6c8ca102dd2b711415952a74b2459c03 to your computer and use it in GitHub Desktop.
Evaluation of fragment-based coordinate generation
import matplotlib.pyplot as plt
import numpy as np
from rdkit import Chem
from rdkit.Chem import Draw
from rdkit.Chem import AllChem
original_spl = Chem.SDMolSupplier('platinum_dataset_2017_01.sdf')
predicted_spl = Chem.SDMolSupplier('platinum-openbabel.sdf')
original = []
for mol in original_spl:
original.append(mol)
predicted = []
for mol in predicted_spl:
predicted.append(mol)
rmsd = []
for mol1, mol2 in zip(original, predicted):
rmsd.append(AllChem.GetBestRMS(mol1, mol2))
print(np.mean(rmsd))
print(len(original), len(predicted))
plt.title("Distribution of RMSD (OpenBabel)")
plt.xlabel("RMSD")
plt.ylabel("Number of molecules")
plt.hist(rmsd, bins=20)
plt.show()
#include <iostream>
#include <openbabel/mol.h>
#include <openbabel/obconversion.h>
#include <openbabel/builder.h>
#include <openbabel/forcefield.h>
using namespace std;
using namespace OpenBabel;
int main(int argc, char **argv) {
OBMol mol;
OBBuilder builder;
ifstream ifs;
OBConversion conv;
conv.SetOutFormat("SDF");
for (int i=1; i < argc; i++) {
OBFormat *inFormat = conv.FormatFromExt(argv[i]);
if(inFormat==NULL || !conv.SetInFormat(inFormat))
{
cerr << " Cannot read file format for " << argv[i] << endl;
continue; // try next file
}
ifs.open(argv[i]);
if (!ifs) {
cerr << "Cannot read input file: " << argv[i] << endl;
continue;
}
while(ifs.peek() != EOF && ifs.good()) {
conv.Read(&mol, &ifs);
builder.Build(mol);
mol.AddHydrogens();
// Cleanup by MMFF
OBForceField* pFF = OBForceField::FindForceField("MMFF94");
if (!pFF)
continue;
if (!pFF->Setup(mol)) {
pFF = OBForceField::FindForceField("UFF");
if (!pFF || !pFF->Setup(mol)) continue; // can't use either MMFF94 or UFF
}
// Since we only want a rough geometry, use distance cutoffs for VDW, Electrostatics
pFF->EnableCutOff(true);
pFF->SetVDWCutOff(10.0);
pFF->SetElectrostaticCutOff(20.0);
pFF->SetUpdateFrequency(10); // update non-bonded distances infrequently
// How many cleanup cycles?
int iterations = 25;
// Initial cleanup for every level
//pFF->ConjugateGradients(iterations, 1.0e-4);
// permute central rotors
pFF->FastRotorSearch(false);
// Final cleanup and copy the new coordinates back
pFF->ConjugateGradients(iterations, 1.0e-6);
pFF->UpdateCoordinates(mol);
conv.Write(&mol, &cout);
}
}
}
from rdkit import Chem
from rdkit.Chem import Draw
from rdkit.Chem import AllChem
w = Chem.SDWriter('platinum-rdkit-etkdg.sdf')
with open("platinum.smi", "r") as f:
for line in f:
smiles = line.split()[0]
mol = Chem.MolFromSmiles(smiles)
mol = Chem.AddHs(mol)
AllChem.EmbedMolecule(mol, AllChem.ETKDG())
w.write(mol)
w.flush()
FROM ubuntu:16.04
# COD version
RUN apt-get update && apt-get install -y \
build-essential \
cmake \
git \
libcairo2-dev \
libeigen2-dev \
zlib1g-dev
RUN git clone https://github.com/n-yoshikawa/openbabel.git && \
cd openbabel/ && \
git checkout fragment-based-builder-cod && \
mkdir build && cd build/ && \
cmake .. && make -j ${nproc} && make install
FROM ubuntu:16.04
# Original version
RUN apt-get update && apt-get install -y \
build-essential \
cmake \
libcairo2-dev \
libeigen2-dev \
wget \
zlib1g-dev
RUN wget https://github.com/openbabel/openbabel/archive/openbabel-2-4-1.tar.gz && \
tar xf openbabel-2-4-1.tar.gz && \
cd openbabel-openbabel-2-4-1/ && \
mkdir build && cd build/ && \
cmake .. && make -j ${nproc} && make install

Evaluation by platinum dataset using only COD data for fragment database https://github.com/n-yoshikawa/openbabel/tree/fragment-based-builder-cod

Method Time (s) mean RMSD mean bond error (%) mean angle error (%) fail
OpenBabel (COD) 5.1 2.08 6.77 4.24 408
OpenBabel (COD+Ring) 27.5 1.85 5.22 2.79 306
OpenBabel (Ring+COD) 49.9 1.78 4.38 2.12 312
OpenBabel (Ring+COD+light MMFF) 180.1 1.68 4.41 2.47 329
OpenBabel (COD+light MMFF) 130.5 1.89 6.44 4.88 454
OpenBabel (COD+Fast MMFF) 750.9 1.74 4.67 2.64 527
OpenBabel (Original) 76.5 1.78 4.10 2.13 1079
RDKit 161.9 1.74 --- --- ---
RDKit (ETKDG) 268.2 1.59 5.35 2.51 76

"fail" means mismatch between SMILES generated from original and predicted molecule. Total number of dataset is 4548.

histogram of RMSD

histogram of bond error

histogram of angle error

obabel platinum_dataset_2017_01.sdf -O platinum.smi
g++ -I /usr/local/include/openbabel-2.0/ -L /usr/local/lib/openbabel/2.4.90/ convert-openbabel.cpp -o convert -lopenbabel
./convert platinum.smi > platinum-openbabel.sdf
python calc_rmsd.py
python convert-rdkit.py
python calc_rmsd.py # need to change file name in code
g++ -I /usr/local/include/openbabel-2.0/ -L /usr/local/lib/openbabel/2.4.90/ evaluation_bond_angle.cpp -o eval -lopenbabel
./eval platinum_dataset_2017_01.sdf platinum-openbabel.sdf
#include <openbabel/isomorphism.h>
#include <openbabel/mol.h>
#include <openbabel/obconversion.h>
#include <openbabel/obiter.h>
#include <openbabel/query.h>
#include <cassert>
#include <cmath>
#include <iterator>
#include <iomanip>
#include <sstream>
#include <vector>
#include <map>
using namespace std;
using namespace OpenBabel;
int main(int argc, char **argv) {
OBConversion conv;
//conv.SetOptions("O", conv.OUTOPTIONS);
conv.SetInFormat("SDF");
conv.SetOutFormat("SMI");
OBFormat *inFormat;
if(argc != 3) {
cerr << argv[0] << "<original SDF>" << "<predicted SDF>" << endl;
}
int totalNum = 0;
int wrongNum = 0;
ifstream ifs1, ifs2;
ifs1.open(argv[1]);
ifs2.open(argv[2]);
while(ifs1.peek() != EOF && ifs1.good() &&
ifs2.peek() != EOF && ifs2.peek()) {
// mol1: ground truth molecule
// mol2: predicted molecule
OBMol mol1, mol2;
conv.Read(&mol1, &ifs1);
conv.Read(&mol2, &ifs2);
stringstream ss1, ss2;
string smiles1, smiles2;
// Make sure canonical SMILES of two molecules are the same
conv.Write(&mol1, &ss1);
conv.Write(&mol2, &ss2);
ss1 >> smiles1;
ss2 >> smiles2;
totalNum++;
if(smiles1 != smiles2) {
wrongNum++;
continue;
}
// Get mapping between two molecules
OBQuery *query = CompileMoleculeQuery(&mol1);
OBIsomorphismMapper *mapper = OBIsomorphismMapper::GetInstance(query);
OBIsomorphismMapper::Mappings mappings;
mapper->MapAll(&mol2, mappings);
map<int, int> idxMapping; // mol1 index to mol2 index
for(size_t m = 0; m < mappings.size(); ++m) {
bool isCorrect = true;
for(size_t i = 0; i < mappings[m].size(); ++i) {
//obmol indices are 1-indexed while the mapper is zero indexed
unsigned int idx1 = mappings[m][i].first + 1;
unsigned int idx2 = mappings[m][i].second + 1;
idxMapping[idx1] = idx2;
if(mol1.GetAtom(idx1)->GetAtomicNum() != mol2.GetAtom(idx2)->GetAtomicNum()) {
idxMapping.clear();
isCorrect = false;
break;
}
}
if(isCorrect) break;
}
assert(!idxMapping.empty());
// Display the length of each bond
double bondError = 0;
int bondNum = 0;
FOR_BONDS_OF_MOL(bond1, mol1) {
unsigned int begin1 = idxMapping[bond1->GetBeginAtomIdx()];
unsigned int end1 = idxMapping[bond1->GetEndAtomIdx()];
bondNum++;
bool isFound = false;
FOR_BONDS_OF_MOL(bond2, mol2) {
unsigned int begin2 = bond2->GetBeginAtomIdx();
unsigned int end2 = bond2->GetEndAtomIdx();
if((begin1 == begin2 && end1 == end2) || (begin1 == end2 && end1 == begin2)) {
bondError += abs(bond1->GetLength() - bond2->GetLength()) / bond1->GetLength() ;
isFound = true;
break;
}
}
assert(isFound);
}
bondError /= bondNum;
double angleError = 0;
int angleNum = 0;
FOR_ANGLES_OF_MOL(angle1, mol1) {
unsigned int idxA, idxB, idxC;
OBAtom *a1, *b1, *c1;
double angleDeg1;
b1 = mol1.GetAtom((*angle1)[0] + 1);
a1 = mol1.GetAtom((*angle1)[1] + 1);
c1 = mol1.GetAtom((*angle1)[2] + 1);
idxA = idxMapping[a1->GetIdx()];
idxB = idxMapping[b1->GetIdx()];
idxC = idxMapping[c1->GetIdx()];
angleDeg1 = a1->GetAngle(b1->GetIdx(), c1->GetIdx());
angleNum++;
bool isFound = false;
FOR_ANGLES_OF_MOL(angle2, mol2) {
OBAtom *a2, *b2, *c2;
b2 = mol2.GetAtom((*angle2)[0] + 1);
a2 = mol2.GetAtom((*angle2)[1] + 1);
c2 = mol2.GetAtom((*angle2)[2] + 1);
if(idxB == b2->GetIdx() && ((idxA == a2->GetIdx() && idxC == c2->GetIdx()) ||
(idxA == c2->GetIdx() && idxC == a2->GetIdx()))) {
double angleDeg2 = a2->GetAngle(b2->GetIdx(), c2->GetIdx());
angleError += abs(angleDeg1 - angleDeg2) / angleDeg1;
isFound = true;
break;
}
}
assert(isFound);
}
angleError /= angleNum;
cout << smiles1 << "," << setprecision(10) << bondError << "," << angleError << endl;
}
cout << "total: " << totalNum << "," << "wrong:" << wrongNum << endl;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment