Created
October 29, 2018 11:53
-
-
Save sshojiro/8b4d796bf11d9f13f51e4fdc66615255 to your computer and use it in GitHub Desktop.
How hashed Morgan fingerprint works
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Morgan fingerprint\n", | |
"\n", | |
"Note that you need to take special care of how Morgan fingerprints are generated.\n", | |
"\n", | |
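"A hashed Morgan fingerprint folds every atom-environment identifier into a fixed-length vector by hashing, so the mapping from fragments to bit positions can look arbitrary, as this notebook shows.\n", | |
"\n", | |
"Therefore, the fragment assigned to a given bit depends on the molecule: the same bit index can stand for different substructures in different molecules." | |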
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# import numpy as np\n", | |
"from rdkit import Chem\n", | |
"from rdkit.Chem.AllChem import GetHashedMorganFingerprint, GetMorganFingerprint\n", | |
"from IPython.display import SVG\n", | |
"from rdkit.Chem import rdDepictor\n", | |
"from rdkit.Chem.Draw import rdMolDraw2D" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# http://rdkit.blogspot.com/2016/02/morgan-fingerprint-bit-statistics.html\n", | |
"\n", | |
"def _prepareMol(mol, kekulize):\n", | |
"    \"\"\"Return a drawable copy of ``mol``; the input molecule is not modified.\n", | |
"\n", | |
"    The copy is kekulized when ``kekulize`` is true (falling back to a fresh,\n", | |
"    un-kekulized copy if that fails) and gets 2D coordinates computed when it\n", | |
"    has no conformer.\n", | |
"    \"\"\"\n", | |
"    mc = Chem.Mol(mol.ToBinary())\n", | |
"    if kekulize:\n", | |
"        try:\n", | |
"            Chem.Kekulize(mc)\n", | |
"        except Exception:\n", | |
"            # Kekulization failed: discard the copy and start again from the\n", | |
"            # original. Catching `Exception` instead of using a bare `except`\n", | |
"            # lets KeyboardInterrupt/SystemExit propagate.\n", | |
"            mc = Chem.Mol(mol.ToBinary())\n", | |
"    if not mc.GetNumConformers():\n", | |
"        rdDepictor.Compute2DCoords(mc)\n", | |
"    return mc\n", | |
"def moltosvg(mol, molSize=(450, 200), kekulize=True, drawer=None, **kwargs):\n", | |
"    \"\"\"Draw ``mol`` and return the picture as an IPython ``SVG`` object.\n", | |
"\n", | |
"    A ``MolDraw2DSVG`` canvas sized ``molSize[0]`` x ``molSize[1]`` is created\n", | |
"    unless ``drawer`` is supplied. Extra keyword arguments are forwarded to\n", | |
"    ``DrawMolecule``.\n", | |
"    \"\"\"\n", | |
"    prepared = _prepareMol(mol, kekulize)\n", | |
"    canvas = drawer\n", | |
"    if canvas is None:\n", | |
"        canvas = rdMolDraw2D.MolDraw2DSVG(molSize[0], molSize[1])\n", | |
"    canvas.DrawMolecule(prepared, **kwargs)\n", | |
"    canvas.FinishDrawing()\n", | |
"    # Per the original author's note, the generated SVG does not quite match\n", | |
"    # what the notebook renderer expects; dropping the 'svg:' prefix is a\n", | |
"    # workaround until that is resolved at the generation step.\n", | |
"    markup = canvas.GetDrawingText().replace('svg:', '')\n", | |
"    return SVG(markup)\n", | |
"def getSubstructDepiction(mol, atomID, radius, molSize=(450, 200)):\n", | |
"    \"\"\"Depict ``mol`` with the environment of ``atomID`` highlighted.\n", | |
"\n", | |
"    For ``radius > 0`` the highlighted atoms are the end points of every bond\n", | |
"    returned by ``Chem.FindAtomEnvironmentOfRadiusN``; for ``radius == 0`` only\n", | |
"    the centre atom is highlighted. The centre atom is drawn in blue.\n", | |
"    Returns the value produced by ``moltosvg``.\n", | |
"    \"\"\"\n", | |
"    # Seed with the centre atom so it is still highlighted if the environment\n", | |
"    # lookup comes back empty.\n", | |
"    atoms_to_use = {atomID}\n", | |
"    if radius > 0:\n", | |
"        for bond_idx in Chem.FindAtomEnvironmentOfRadiusN(mol, radius, atomID):\n", | |
"            bond = mol.GetBondWithIdx(bond_idx)\n", | |
"            atoms_to_use.add(bond.GetBeginAtomIdx())\n", | |
"            atoms_to_use.add(bond.GetEndAtomIdx())\n", | |
"    return moltosvg(mol, molSize=molSize,\n", | |
"                    highlightAtoms=list(atoms_to_use),\n", | |
"                    highlightAtomColors={atomID: (0.3, 0.3, 1)})\n", | |
"\n", | |
"def get_envs_from_hashed_fp(m, radius=3, n_bits=2048, show_svg=False):\n", | |
"    \"\"\"Pick one representative atom environment per hashed-Morgan bit of ``m``.\n", | |
"\n", | |
"    Parameters\n", | |
"    ----------\n", | |
"    m : molecule passed to ``GetHashedMorganFingerprint``.\n", | |
"    radius : fingerprint radius.\n", | |
"    n_bits : length of the hashed fingerprint.\n", | |
"    show_svg : unused; kept so existing callers keep working.\n", | |
"\n", | |
"    Returns\n", | |
"    -------\n", | |
"    dict\n", | |
"        ``bit -> (m, (radius, atom_idx))`` when every environment recorded on\n", | |
"        the bit has the same radius. When a bit holds environments of several\n", | |
"        radii, one entry per radius keyed ``'<bit>-<radius>'`` is stored, each\n", | |
"        using the first environment recorded with that radius.\n", | |
"    \"\"\"\n", | |
"    bit_info = {}\n", | |
"    # Called only to fill bit_info; the fingerprint itself is not needed.\n", | |
"    GetHashedMorganFingerprint(m, radius, nBits=n_bits, bitInfo=bit_info)\n", | |
"    dict_submols = {}\n", | |
"    for bit, envs in bit_info.items():\n", | |
"        # Each entry is (atom_idx, radius). sorted(set(...)) yields the same\n", | |
"        # ascending unique radii as np.unique without requiring numpy, whose\n", | |
"        # import is commented out in this notebook.\n", | |
"        radii = sorted({env[1] for env in envs})\n", | |
"        if len(radii) == 1:\n", | |
"            dict_submols[bit] = (m, envs[0][::-1])\n", | |
"            continue\n", | |
"        for env_radius in radii:\n", | |
"            first_env = next(env for env in envs if env[1] == env_radius)\n", | |
"            key = '{}-{}'.format(bit, env_radius)\n", | |
"            dict_submols[key] = (m, first_env[::-1])\n", | |
"    return dict_submols\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Benzene and ibuprofen: two molecules that are compared bit by bit below.\n", | |
"mols = [Chem.MolFromSmiles('c1ccccc1'),\n", | |
"        Chem.MolFromSmiles('CC(C)CC1=CC=C(C=C1)C(C)C(O)=O')]\n", | |
"# One dict per molecule: bit index -> environments / bit index -> depiction.\n", | |
"bitInfo = [{} for _ in mols]\n", | |
"svg = [{} for _ in mols]\n", | |
"\n", | |
"for idx, mol in enumerate(mols):\n", | |
"    # The call fills bitInfo[idx] in place.\n", | |
"    mhfp = GetHashedMorganFingerprint(mol, 3, 2048, bitInfo=bitInfo[idx])\n", | |
"    for k, val in bitInfo[idx].items():\n", | |
"        # Depict only the first (atom, radius) environment recorded on the bit.\n", | |
"        svg[idx][k] = getSubstructDepiction(mol, *val[0],\n", | |
"                                            molSize=(300, 300))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"389-th fragment in benezene\n" | |
] | |
}, | |
{ | |
"data": { | |
"image/svg+xml": [ | |
"<svg baseProfile=\"full\" height=\"300px\" version=\"1.1\" width=\"300px\" xml:space=\"preserve\" xmlns:rdkit=\"http://www.rdkit.org/xml\" xmlns:svg=\"http://www.w3.org/2000/svg\" xmlns:xlink=\"http://www.w3.org/1999/xlink\">\n", | |
"<rect height=\"300\" style=\"opacity:1.0;fill:#FFFFFF;stroke:none\" width=\"300\" x=\"0\" y=\"0\"> </rect>\n", | |
"<path d=\"M 286.364,150 218.182,268.094\" style=\"fill:none;fill-rule:evenodd;stroke:#FF7F7F;stroke-width:16px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n", | |
"<path d=\"M 218.182,268.094 81.8182,268.094\" style=\"fill:none;fill-rule:evenodd;stroke:#FF7F7F;stroke-width:16px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n", | |
"<path d=\"M 81.8182,268.094 13.6364,150\" style=\"fill:none;fill-rule:evenodd;stroke:#FF7F7F;stroke-width:16px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n", | |
"<path d=\"M 13.6364,150 81.8182,31.9056\" style=\"fill:none;fill-rule:evenodd;stroke:#FF7F7F;stroke-width:16px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n", | |
"<ellipse cx=\"286.364\" cy=\"150\" rx=\"36.3636\" ry=\"36.3636\" style=\"fill:#FF7F7F;fill-rule:evenodd;stroke:#FF7F7F;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n", | |
"<ellipse cx=\"218.182\" cy=\"268.094\" rx=\"36.3636\" ry=\"36.3636\" style=\"fill:#FF7F7F;fill-rule:evenodd;stroke:#FF7F7F;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n", | |
"<ellipse cx=\"81.8182\" cy=\"268.094\" rx=\"36.3636\" ry=\"36.3636\" style=\"fill:#4C4CFF;fill-rule:evenodd;stroke:#4C4CFF;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n", | |
"<ellipse cx=\"13.6364\" cy=\"150\" rx=\"36.3636\" ry=\"36.3636\" style=\"fill:#FF7F7F;fill-rule:evenodd;stroke:#FF7F7F;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n", | |
"<ellipse cx=\"81.8182\" cy=\"31.9056\" rx=\"36.3636\" ry=\"36.3636\" style=\"fill:#FF7F7F;fill-rule:evenodd;stroke:#FF7F7F;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n", | |
"<path d=\"M 286.364,150 218.182,268.094\" style=\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n", | |
"<path d=\"M 252.517,154.078 204.79,236.744\" style=\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n", | |
"<path d=\"M 286.364,150 218.182,31.9056\" style=\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n", | |
"<path d=\"M 218.182,268.094 81.8182,268.094\" style=\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n", | |
"<path d=\"M 81.8182,268.094 13.6364,150\" style=\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n", | |
"<path d=\"M 95.2098,236.744 47.4825,154.078\" style=\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n", | |
"<path d=\"M 13.6364,150 81.8182,31.9056\" style=\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n", | |
"<path d=\"M 81.8182,31.9056 218.182,31.9056\" style=\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n", | |
"<path d=\"M 102.273,59.1784 197.727,59.1784\" style=\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n", | |
"</svg>" | |
], | |
"text/plain": [ | |
"<IPython.core.display.SVG object>" | |
] | |
}, | |
"execution_count": 4, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Show the depiction stored for bit 389 of the first molecule in `mols`.\n", | |
"print('389-th fragment in benzene')\n", | |
"svg[0][389]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"389-th fragment in ibuprofen\n" | |
] | |
}, | |
{ | |
"data": { | |
"image/svg+xml": [ | |
"<svg baseProfile=\"full\" height=\"300px\" version=\"1.1\" width=\"300px\" xml:space=\"preserve\" xmlns:rdkit=\"http://www.rdkit.org/xml\" xmlns:svg=\"http://www.w3.org/2000/svg\" xmlns:xlink=\"http://www.w3.org/1999/xlink\">\n", | |
"<rect height=\"300\" style=\"opacity:1.0;fill:#FFFFFF;stroke:none\" width=\"300\" x=\"0\" y=\"0\"> </rect>\n", | |
"<path d=\"M 228.909,130.61 264.787,128.407\" style=\"fill:none;fill-rule:evenodd;stroke:#FF7F7F;stroke-width:15px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n", | |
"<ellipse cx=\"228.909\" cy=\"130.61\" rx=\"9.58566\" ry=\"9.58566\" style=\"fill:#FF7F7F;fill-rule:evenodd;stroke:#FF7F7F;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n", | |
"<ellipse cx=\"264.787\" cy=\"128.407\" rx=\"9.58566\" ry=\"9.58566\" style=\"fill:#4C4CFF;fill-rule:evenodd;stroke:#4C4CFF;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n", | |
"<path d=\"M 13.6364,143.825 49.5151,141.622\" style=\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n", | |
"<path d=\"M 49.5151,141.622 65.547,109.449\" style=\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n", | |
"<path d=\"M 49.5151,141.622 69.3618,171.593\" style=\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n", | |
"<path d=\"M 69.3618,171.593 105.241,169.39\" style=\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n", | |
"<path d=\"M 105.241,169.39 121.272,137.217\" style=\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n", | |
"<path d=\"M 114.08,167.771 125.302,145.25\" style=\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n", | |
"<path d=\"M 105.241,169.39 125.087,199.361\" style=\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n", | |
"<path d=\"M 121.272,137.217 157.151,135.015\" style=\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n", | |
"<path d=\"M 157.151,135.015 176.998,164.985\" style=\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n", | |
"<path d=\"M 154.134,143.48 168.027,164.459\" style=\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n", | |
"<path d=\"M 176.998,164.985 160.966,197.158\" style=\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n", | |
"<path d=\"M 176.998,164.985 212.877,162.783\" style=\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n", | |
"<path d=\"M 160.966,197.158 125.087,199.361\" style=\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n", | |
"<path d=\"M 155.144,190.313 130.029,191.855\" style=\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n", | |
"<path d=\"M 212.877,162.783 232.723,192.753\" style=\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n", | |
"<path d=\"M 212.877,162.783 228.909,130.61\" style=\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n", | |
"<path d=\"M 228.909,130.61 241.454,129.84\" style=\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n", | |
"<path d=\"M 241.454,129.84 253.999,129.07\" style=\"fill:none;fill-rule:evenodd;stroke:#FF0000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n", | |
"<path d=\"M 231.906,128.625 223.966,116.635\" style=\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n", | |
"<path d=\"M 223.966,116.635 216.026,104.645\" style=\"fill:none;fill-rule:evenodd;stroke:#FF0000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n", | |
"<path d=\"M 225.912,132.594 217.972,120.605\" style=\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n", | |
"<path d=\"M 217.972,120.605 210.032,108.615\" style=\"fill:none;fill-rule:evenodd;stroke:#FF0000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\"/>\n", | |
"<text style=\"font-size:11px;font-style:normal;font-weight:normal;fill-opacity:1;stroke:none;font-family:sans-serif;text-anchor:start;fill:#FF0000\" x=\"253.999\" y=\"134.398\"><tspan>OH</tspan></text>\n", | |
"<text style=\"font-size:11px;font-style:normal;font-weight:normal;fill-opacity:1;stroke:none;font-family:sans-serif;text-anchor:start;fill:#FF0000\" x=\"203.466\" y=\"106.63\"><tspan>O</tspan></text>\n", | |
"</svg>" | |
], | |
"text/plain": [ | |
"<IPython.core.display.SVG object>" | |
] | |
}, | |
"execution_count": 5, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Show the depiction stored for bit 389 of the second molecule in `mols`.\n", | |
"print('389-th fragment in ibuprofen')\n", | |
"svg[1][389]" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.6.6" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Results are here:
Benzene-389
Ibuprofen-389