Created
April 17, 2019 10:08
-
-
Save ergo70/22472c0225a9721709194f6da8048706 to your computer and use it in GitHub Desktop.
How to screen for chemical similarity using binary fingerprints
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package org.theplateisbad.fp; | |
import java.io.IOException; | |
import java.sql.Connection; | |
import java.sql.DriverManager; | |
import java.sql.PreparedStatement; | |
import java.sql.ResultSet; | |
import java.sql.SQLException; | |
import java.sql.Types; | |
import java.util.BitSet; | |
import java.util.LinkedList; | |
import java.util.Scanner; | |
import org.openscience.cdk.DefaultChemObjectBuilder; | |
import org.openscience.cdk.aromaticity.Aromaticity; | |
import org.openscience.cdk.exception.CDKException; | |
import org.openscience.cdk.fingerprint.IBitFingerprint; | |
import org.openscience.cdk.fingerprint.PubchemFingerprinter; | |
import org.openscience.cdk.interfaces.IAtomContainer; | |
import org.openscience.cdk.smiles.SmilesParser; | |
import org.openscience.cdk.tools.CDKHydrogenAdder; | |
import org.openscience.cdk.tools.manipulator.AtomContainerManipulator; | |
import java.util.List; | |
public class FindBySimilarity { | |
static { | |
try { | |
Class.forName("org.postgresql.Driver"); | |
} catch (ClassNotFoundException e) { | |
e.printStackTrace(); | |
} | |
} | |
public static void main(String[] args) throws CDKException, CloneNotSupportedException, IOException, SQLException { | |
try (Scanner scan = new Scanner(System.in)) { | |
while (true) { | |
System.out.println("Enter SMILES:"); | |
String SMILES = scan.next(); | |
System.out.println("Enter >= threshold:"); | |
float threshold = scan.nextFloat(); | |
System.out.println("Enter top N:"); | |
int topN = scan.nextInt(); | |
System.out.println("\nPos.\tTanimoto\tTarget"); | |
System.out.println("--------------------------------------------------------------------------------"); | |
for (String result : find(SMILES, threshold, topN)) { | |
System.out.println(result); | |
} | |
System.out.println(); | |
} | |
} | |
} | |
private static List<String> find(final String SMILES, final float threshold, final int topN) | |
throws SQLException, CDKException { | |
String url = "jdbc:postgresql://localhost/chemistry?user=postgres&password="; | |
List<String> res = new LinkedList<String>(); | |
try (Connection conn = DriverManager.getConnection(url); | |
PreparedStatement query = conn.prepareStatement( | |
"SELECT id, smiles, cdk.tanimoto(?::bit varying, pubchemfp) AS tanimoto FROM cdk.externalfp WHERE cdk.tanimoto_c(?::bit varying, pubchemfp) >= ? AND cardinality BETWEEN ? AND ? ORDER BY tanimoto DESC FETCH FIRST ? ROWS ONLY", | |
ResultSet.TYPE_FORWARD_ONLY, ResultSet.CONCUR_READ_ONLY)) { | |
conn.setAutoCommit(false); | |
IBitFingerprint fp = fingerprint(SMILES); | |
//int[] lohi = swamidassBaldiLimitsForTanimoto(fp.cardinality(), threshold); | |
int[] lohi = {0, Integer.MAX_VALUE}; | |
String bitstring = fingerprint2bitString(fp); | |
query.setString(1, bitstring); | |
query.setString(2, bitstring); | |
query.setFloat(3, threshold); | |
query.setInt(4, lohi[0]); | |
query.setInt(5, lohi[1]); | |
if (topN < 1) { | |
query.setNull(6, Types.BIGINT); | |
query.setFetchSize(1000); | |
} else { | |
query.setInt(6, topN); | |
query.setFetchSize(topN); | |
} | |
long start = System.currentTimeMillis(); | |
ResultSet result = query.executeQuery(); | |
long end = System.currentTimeMillis(); | |
int pos = 1; | |
while (result.next()) { | |
res.add(pos++ + "\t" + String.format("%f", result.getFloat(3)) + "\t" + result.getInt(1) + ":" | |
+ result.getString(2)); | |
} | |
res.add("Found in " + (end - start) / 1000f + " s"); | |
result.close(); | |
conn.setAutoCommit(true); | |
} | |
return res; | |
} | |
private static IBitFingerprint fingerprint(final String SMILES) throws CDKException { | |
CDKHydrogenAdder adder = CDKHydrogenAdder.getInstance(DefaultChemObjectBuilder.getInstance()); | |
SmilesParser smilesParser = new SmilesParser(DefaultChemObjectBuilder.getInstance()); | |
PubchemFingerprinter fpr = new PubchemFingerprinter(DefaultChemObjectBuilder.getInstance()); | |
IAtomContainer mol = smilesParser.parseSmiles(SMILES); | |
AtomContainerManipulator.percieveAtomTypesAndConfigureAtoms(mol); | |
adder.addImplicitHydrogens(mol); | |
AtomContainerManipulator.convertImplicitToExplicitHydrogens(mol); | |
Aromaticity.cdkLegacy().apply(mol); | |
return fpr.getBitFingerprint(mol); | |
} | |
/* | |
* "Bounds and algorithms for fast exact searches of chemical fingerprints in | |
* linear and sublinear time." Swamidass S.J., Baldi P. J Chem Inf Model. 2007 | |
* Mar-Apr;47(2):302-17. Epub 2007 Feb 28. | |
* https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2527184/ | |
*/ | |
private static int[] swamidassBaldiLimitsForTanimoto(final int cardinality, final double threshold) { | |
int[] lohi = { 0, Integer.MAX_VALUE }; | |
lohi[0] = (int) Math.floor(cardinality * threshold); | |
lohi[1] = (int) Math.ceil(cardinality / threshold); | |
return lohi; | |
} | |
private static String fingerprint2bitString(final IBitFingerprint fp) { | |
StringBuilder res = new StringBuilder(); | |
BitSet fpBits = fp.asBitSet(); | |
for (long i = 0; i < fp.size(); i++) { | |
if (true == fpBits.get((int) i)) { | |
res.append('1'); | |
} else { | |
res.append('0'); | |
} | |
} | |
return res.toString(); | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment