Skip to content

Instantly share code, notes, and snippets.

@sshojiro
Created October 29, 2018 11:53
Show Gist options
  • Save sshojiro/8b4d796bf11d9f13f51e4fdc66615255 to your computer and use it in GitHub Desktop.
Save sshojiro/8b4d796bf11d9f13f51e4fdc66615255 to your computer and use it in GitHub Desktop.
How hashed Morgan fingerprint works
Display the source blob
Display the rendered blob
Raw
{
"cells": [
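{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Morgan fingerprint\n",
"\n",
"Note that you need to take special care with how hashed Morgan fingerprints are generated.\n",
"\n",
"A hashed Morgan fingerprint assigns each fragment to a bit position by hashing it into a fixed-length vector, rather than through a fixed fragment-to-bit table, as this notebook shows.\n",
"\n",
"Therefore, the fragment behind a given bit depends on the molecule: bit 389 is set by a different fragment in benzene than in ibuprofen."
]
},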
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"# import numpy as np\n",
"from rdkit import Chem\n",
"from rdkit.Chem.AllChem import GetHashedMorganFingerprint, GetMorganFingerprint\n",
"from IPython.display import SVG\n",
"from rdkit.Chem import rdDepictor\n",
"from rdkit.Chem.Draw import rdMolDraw2D"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# http://rdkit.blogspot.com/2016/02/morgan-fingerprint-bit-statistics.html\n",
"\n",
"def _prepareMol(mol,kekulize):\n",
" mc = Chem.Mol(mol.ToBinary())\n",
" if kekulize:\n",
" try:\n",
" Chem.Kekulize(mc)\n",
" except:\n",
" mc = Chem.Mol(mol.ToBinary())\n",
" if not mc.GetNumConformers():\n",
" rdDepictor.Compute2DCoords(mc)\n",
" return mc\n",
"def moltosvg(mol,molSize=(450,200),kekulize=True,drawer=None,**kwargs):\n",
" mc = _prepareMol(mol,kekulize)\n",
" if drawer is None:\n",
" drawer = rdMolDraw2D.MolDraw2DSVG(molSize[0],molSize[1])\n",
" drawer.DrawMolecule(mc,**kwargs)\n",
" drawer.FinishDrawing()\n",
" svg = drawer.GetDrawingText()\n",
" # It seems that the svg renderer used doesn't quite hit the spec.\n",
" # Here are some fixes to make it work in the notebook, although I think\n",
" # the underlying issue needs to be resolved at the generation step\n",
" return SVG(svg.replace('svg:',''))\n",
"def getSubstructDepiction(mol,atomID,radius,molSize=(450,200)):\n",
" if radius>0:\n",
" env = Chem.FindAtomEnvironmentOfRadiusN(mol,radius,atomID)\n",
" atomsToUse=[]\n",
" for b in env:\n",
" atomsToUse.append(mol.GetBondWithIdx(b).GetBeginAtomIdx())\n",
" atomsToUse.append(mol.GetBondWithIdx(b).GetEndAtomIdx())\n",
" atomsToUse = list(set(atomsToUse))\n",
" else:\n",
" atomsToUse = [atomID]\n",
" env=None\n",
" return moltosvg(mol,molSize=molSize,highlightAtoms=atomsToUse,highlightAtomColors={atomID:(0.3,0.3,1)})\n",
"\n",
"def get_envs_from_hashed_fp(m, radius=3, n_bits=2048, show_svg=False):\n",
" bInfo={}\n",
" dict_submols={}\n",
" mfp = GetHashedMorganFingerprint(m, radius, nBits=n_bits, bitInfo=bInfo)\n",
" for index_submol, v in bInfo.items():\n",
" if len(np.unique([c[1] for c in v])) == 1:\n",
" dict_submols.update({index_submol:\n",
" (m, v[0][::-1])\n",
" })\n",
" else:\n",
" list_radius = np.unique([c[1] for c in v])\n",
" if len( list_radius ) > 1:\n",
" for cc in list_radius:\n",
" for c in v:\n",
" if c[1] == cc:\n",
" dict_submols.update({\n",
" '{}-{}'.format(index_submol,c[1]):\n",
" (m, c[::-1])\n",
" })\n",
" break\n",
" return dict_submols\n"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"bitInfo={}\n",
"mols = [Chem.MolFromSmiles('c1ccccc1'),\n",
" Chem.MolFromSmiles('CC(C)CC1=CC=C(C=C1)C(C)C(O)=O')]\n",
"svg = [None] * len(mols)\n",
"bitInfo = [None] * len(mols)\n",
"\n",
"for idx, mol in enumerate(mols):\n",
" bitInfo[idx] = {}\n",
" mhfp = GetHashedMorganFingerprint(mol, 3, 2048, bitInfo=bitInfo[idx])\n",
" svg[idx] = {}\n",
" for k, val in bitInfo[idx].items():\n",
" svg[idx][k] = getSubstructDepiction(mol, *bitInfo[idx][k][0],\n",
" molSize=(300, 300))"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"389-th fragment in benezene\n"
]
},
{
"data": {
"image/svg+xml": [
"<svg baseProfile=\"full\" height=\"300px\" version=\"1.1\" width=\"300px\" xml:space=\"preserve\" xmlns:rdkit=\"http://www.rdkit.org/xml\" xmlns:svg=\"http://www.w3.org/2000/svg\" xmlns:xlink=\"http://www.w3.org/1999/xlink\">\n",
"<rect height=\"300\" style=\"opacity:1.0;fill:#FFFFFF;stroke:none\" width=\"300\" x=\"0\" y=\"0\"> </rect>\n",
"<path d=\"M 286.364,150 218.182,268.094\" style=\"fill:none;fill-rule:evenodd;stroke:#FF7F7F;stroke-width:16px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n",
"<path d=\"M 218.182,268.094 81.8182,268.094\" style=\"fill:none;fill-rule:evenodd;stroke:#FF7F7F;stroke-width:16px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n",
"<path d=\"M 81.8182,268.094 13.6364,150\" style=\"fill:none;fill-rule:evenodd;stroke:#FF7F7F;stroke-width:16px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n",
"<path d=\"M 13.6364,150 81.8182,31.9056\" style=\"fill:none;fill-rule:evenodd;stroke:#FF7F7F;stroke-width:16px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n",
"<ellipse cx=\"286.364\" cy=\"150\" rx=\"36.3636\" ry=\"36.3636\" style=\"fill:#FF7F7F;fill-rule:evenodd;stroke:#FF7F7F;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n",
"<ellipse cx=\"218.182\" cy=\"268.094\" rx=\"36.3636\" ry=\"36.3636\" style=\"fill:#FF7F7F;fill-rule:evenodd;stroke:#FF7F7F;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n",
"<ellipse cx=\"81.8182\" cy=\"268.094\" rx=\"36.3636\" ry=\"36.3636\" style=\"fill:#4C4CFF;fill-rule:evenodd;stroke:#4C4CFF;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n",
"<ellipse cx=\"13.6364\" cy=\"150\" rx=\"36.3636\" ry=\"36.3636\" style=\"fill:#FF7F7F;fill-rule:evenodd;stroke:#FF7F7F;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n",
"<ellipse cx=\"81.8182\" cy=\"31.9056\" rx=\"36.3636\" ry=\"36.3636\" style=\"fill:#FF7F7F;fill-rule:evenodd;stroke:#FF7F7F;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n",
"<path d=\"M 286.364,150 218.182,268.094\" style=\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n",
"<path d=\"M 252.517,154.078 204.79,236.744\" style=\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n",
"<path d=\"M 286.364,150 218.182,31.9056\" style=\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n",
"<path d=\"M 218.182,268.094 81.8182,268.094\" style=\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n",
"<path d=\"M 81.8182,268.094 13.6364,150\" style=\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n",
"<path d=\"M 95.2098,236.744 47.4825,154.078\" style=\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n",
"<path d=\"M 13.6364,150 81.8182,31.9056\" style=\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n",
"<path d=\"M 81.8182,31.9056 218.182,31.9056\" style=\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n",
"<path d=\"M 102.273,59.1784 197.727,59.1784\" style=\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n",
"</svg>"
],
"text/plain": [
"<IPython.core.display.SVG object>"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"print('389-th fragment in benezene')\n",
"svg[0][389]"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"389-th fragment in ibuprofen\n"
]
},
{
"data": {
"image/svg+xml": [
"<svg baseProfile=\"full\" height=\"300px\" version=\"1.1\" width=\"300px\" xml:space=\"preserve\" xmlns:rdkit=\"http://www.rdkit.org/xml\" xmlns:svg=\"http://www.w3.org/2000/svg\" xmlns:xlink=\"http://www.w3.org/1999/xlink\">\n",
"<rect height=\"300\" style=\"opacity:1.0;fill:#FFFFFF;stroke:none\" width=\"300\" x=\"0\" y=\"0\"> </rect>\n",
"<path d=\"M 228.909,130.61 264.787,128.407\" style=\"fill:none;fill-rule:evenodd;stroke:#FF7F7F;stroke-width:15px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n",
"<ellipse cx=\"228.909\" cy=\"130.61\" rx=\"9.58566\" ry=\"9.58566\" style=\"fill:#FF7F7F;fill-rule:evenodd;stroke:#FF7F7F;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n",
"<ellipse cx=\"264.787\" cy=\"128.407\" rx=\"9.58566\" ry=\"9.58566\" style=\"fill:#4C4CFF;fill-rule:evenodd;stroke:#4C4CFF;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n",
"<path d=\"M 13.6364,143.825 49.5151,141.622\" style=\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n",
"<path d=\"M 49.5151,141.622 65.547,109.449\" style=\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n",
"<path d=\"M 49.5151,141.622 69.3618,171.593\" style=\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n",
"<path d=\"M 69.3618,171.593 105.241,169.39\" style=\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n",
"<path d=\"M 105.241,169.39 121.272,137.217\" style=\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n",
"<path d=\"M 114.08,167.771 125.302,145.25\" style=\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n",
"<path d=\"M 105.241,169.39 125.087,199.361\" style=\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n",
"<path d=\"M 121.272,137.217 157.151,135.015\" style=\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n",
"<path d=\"M 157.151,135.015 176.998,164.985\" style=\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n",
"<path d=\"M 154.134,143.48 168.027,164.459\" style=\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n",
"<path d=\"M 176.998,164.985 160.966,197.158\" style=\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n",
"<path d=\"M 176.998,164.985 212.877,162.783\" style=\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n",
"<path d=\"M 160.966,197.158 125.087,199.361\" style=\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n",
"<path d=\"M 155.144,190.313 130.029,191.855\" style=\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n",
"<path d=\"M 212.877,162.783 232.723,192.753\" style=\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n",
"<path d=\"M 212.877,162.783 228.909,130.61\" style=\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n",
"<path d=\"M 228.909,130.61 241.454,129.84\" style=\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n",
"<path d=\"M 241.454,129.84 253.999,129.07\" style=\"fill:none;fill-rule:evenodd;stroke:#FF0000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n",
"<path d=\"M 231.906,128.625 223.966,116.635\" style=\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n",
"<path d=\"M 223.966,116.635 216.026,104.645\" style=\"fill:none;fill-rule:evenodd;stroke:#FF0000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n",
"<path d=\"M 225.912,132.594 217.972,120.605\" style=\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n",
"<path d=\"M 217.972,120.605 210.032,108.615\" style=\"fill:none;fill-rule:evenodd;stroke:#FF0000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n",
"<text style=\"font-size:11px;font-style:normal;font-weight:normal;fill-opacity:1;stroke:none;font-family:sans-serif;text-anchor:start;fill:#FF0000\" x=\"253.999\" y=\"134.398\"><tspan>OH</tspan></text>\n",
"<text style=\"font-size:11px;font-style:normal;font-weight:normal;fill-opacity:1;stroke:none;font-family:sans-serif;text-anchor:start;fill:#FF0000\" x=\"203.466\" y=\"106.63\"><tspan>O</tspan></text>\n",
"</svg>"
],
"text/plain": [
"<IPython.core.display.SVG object>"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"print('389-th fragment in ibuprofen')\n",
"svg[1][389]"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.6"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
@sshojiro
Copy link
Author

sshojiro commented Oct 29, 2018

Results are here:

Benzene-389

Benzene-389

Ibuprofen-389

Ibuprofen-389

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment