Skip to content

Instantly share code, notes, and snippets.

@ergo70
Created April 17, 2019 10:08
Show Gist options
  • Save ergo70/22472c0225a9721709194f6da8048706 to your computer and use it in GitHub Desktop.
Save ergo70/22472c0225a9721709194f6da8048706 to your computer and use it in GitHub Desktop.
How to screen for chemical similarity using binary fingerprints
package org.theplateisbad.fp;
import java.io.IOException;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Types;
import java.util.BitSet;
import java.util.LinkedList;
import java.util.Scanner;
import org.openscience.cdk.DefaultChemObjectBuilder;
import org.openscience.cdk.aromaticity.Aromaticity;
import org.openscience.cdk.exception.CDKException;
import org.openscience.cdk.fingerprint.IBitFingerprint;
import org.openscience.cdk.fingerprint.PubchemFingerprinter;
import org.openscience.cdk.interfaces.IAtomContainer;
import org.openscience.cdk.smiles.SmilesParser;
import org.openscience.cdk.tools.CDKHydrogenAdder;
import org.openscience.cdk.tools.manipulator.AtomContainerManipulator;
import java.util.List;
public class FindBySimilarity {
static {
try {
Class.forName("org.postgresql.Driver");
} catch (ClassNotFoundException e) {
e.printStackTrace();
}
}
public static void main(String[] args) throws CDKException, CloneNotSupportedException, IOException, SQLException {
try (Scanner scan = new Scanner(System.in)) {
while (true) {
System.out.println("Enter SMILES:");
String SMILES = scan.next();
System.out.println("Enter >= threshold:");
float threshold = scan.nextFloat();
System.out.println("Enter top N:");
int topN = scan.nextInt();
System.out.println("\nPos.\tTanimoto\tTarget");
System.out.println("--------------------------------------------------------------------------------");
for (String result : find(SMILES, threshold, topN)) {
System.out.println(result);
}
System.out.println();
}
}
}
private static List<String> find(final String SMILES, final float threshold, final int topN)
throws SQLException, CDKException {
String url = "jdbc:postgresql://localhost/chemistry?user=postgres&password=";
List<String> res = new LinkedList<String>();
try (Connection conn = DriverManager.getConnection(url);
PreparedStatement query = conn.prepareStatement(
"SELECT id, smiles, cdk.tanimoto(?::bit varying, pubchemfp) AS tanimoto FROM cdk.externalfp WHERE cdk.tanimoto_c(?::bit varying, pubchemfp) >= ? AND cardinality BETWEEN ? AND ? ORDER BY tanimoto DESC FETCH FIRST ? ROWS ONLY",
ResultSet.TYPE_FORWARD_ONLY, ResultSet.CONCUR_READ_ONLY)) {
conn.setAutoCommit(false);
IBitFingerprint fp = fingerprint(SMILES);
//int[] lohi = swamidassBaldiLimitsForTanimoto(fp.cardinality(), threshold);
int[] lohi = {0, Integer.MAX_VALUE};
String bitstring = fingerprint2bitString(fp);
query.setString(1, bitstring);
query.setString(2, bitstring);
query.setFloat(3, threshold);
query.setInt(4, lohi[0]);
query.setInt(5, lohi[1]);
if (topN < 1) {
query.setNull(6, Types.BIGINT);
query.setFetchSize(1000);
} else {
query.setInt(6, topN);
query.setFetchSize(topN);
}
long start = System.currentTimeMillis();
ResultSet result = query.executeQuery();
long end = System.currentTimeMillis();
int pos = 1;
while (result.next()) {
res.add(pos++ + "\t" + String.format("%f", result.getFloat(3)) + "\t" + result.getInt(1) + ":"
+ result.getString(2));
}
res.add("Found in " + (end - start) / 1000f + " s");
result.close();
conn.setAutoCommit(true);
}
return res;
}
private static IBitFingerprint fingerprint(final String SMILES) throws CDKException {
CDKHydrogenAdder adder = CDKHydrogenAdder.getInstance(DefaultChemObjectBuilder.getInstance());
SmilesParser smilesParser = new SmilesParser(DefaultChemObjectBuilder.getInstance());
PubchemFingerprinter fpr = new PubchemFingerprinter(DefaultChemObjectBuilder.getInstance());
IAtomContainer mol = smilesParser.parseSmiles(SMILES);
AtomContainerManipulator.percieveAtomTypesAndConfigureAtoms(mol);
adder.addImplicitHydrogens(mol);
AtomContainerManipulator.convertImplicitToExplicitHydrogens(mol);
Aromaticity.cdkLegacy().apply(mol);
return fpr.getBitFingerprint(mol);
}
/*
* "Bounds and algorithms for fast exact searches of chemical fingerprints in
* linear and sublinear time." Swamidass S.J., Baldi P. J Chem Inf Model. 2007
* Mar-Apr;47(2):302-17. Epub 2007 Feb 28.
* https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2527184/
*/
private static int[] swamidassBaldiLimitsForTanimoto(final int cardinality, final double threshold) {
int[] lohi = { 0, Integer.MAX_VALUE };
lohi[0] = (int) Math.floor(cardinality * threshold);
lohi[1] = (int) Math.ceil(cardinality / threshold);
return lohi;
}
private static String fingerprint2bitString(final IBitFingerprint fp) {
StringBuilder res = new StringBuilder();
BitSet fpBits = fp.asBitSet();
for (long i = 0; i < fp.size(); i++) {
if (true == fpBits.get((int) i)) {
res.append('1');
} else {
res.append('0');
}
}
return res.toString();
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment