Last active
September 30, 2020 23:35
-
-
Save wutobias/3499867692f2c7d8c4c2232327227882 to your computer and use it in GitHub Desktop.
Example for making a diverse subset selection from QC molecules with common substructures
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### What is this about?\n", | |
"Here we want to get a set of molecules from QCArchive that is diverse with respect to specific chemical environments. First we download molecules from QCArchive and then compute a fingerprint for a set of substructure matches. Then we cluster the fingerprints using Tanimoto similarity.\n", | |
"\n", | |
"Note, you need packages qcportal and rdkit" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import qcportal as ptl\n", | |
"from rdkit.Chem import AllChem as Chem\n", | |
"from rdkit.Chem import Draw\n", | |
"from rdkit.Chem.Fingerprints import FingerprintMols\n", | |
"from rdkit import DataStructs\n", | |
"from rdkit.ML.Cluster import Butina" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th>tagline</th>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>collection</th>\n", | |
" <th>name</th>\n", | |
" <th></th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th rowspan=\"22\" valign=\"top\">OptimizationDataset</th>\n", | |
" <th>FDA Optimization Dataset 1</th>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>JGI Metabolite Set 1</th>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>Kinase Inhibitors: WBO Distributions</th>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>OpenFF Discrepancy Benchmark 1</th>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>OpenFF Ehrman Informative Optimization v0.1</th>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>OpenFF Ehrman Informative Optimization v0.2</th>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>OpenFF Full Optimization Benchmark 1</th>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>OpenFF Gen 2 Opt Set 1 Roche</th>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>OpenFF Gen 2 Opt Set 2 Coverage</th>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>OpenFF Gen 2 Opt Set 3 Pfizer Discrepancy</th>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>OpenFF Gen 2 Opt Set 4 eMolecules Discrepancy</th>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>OpenFF Gen 2 Opt Set 5 Bayer</th>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>OpenFF NCI250K Boron 1</th>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>OpenFF Optimization Set 1</th>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>OpenFF Primary Optimization Benchmark 1</th>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>OpenFF Protein Fragments v1.0</th>\n", | |
" <td>Constrained optimization of various protein fr...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>OpenFF Protein Fragments v2.0</th>\n", | |
" <td>Constrained optimization of various protein fr...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>OpenFF VEHICLe Set 1</th>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>PEI</th>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>Pfizer Discrepancy Optimization Dataset 1</th>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>QM8-T</th>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>SMIRNOFF Coverage Set 1</th>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" tagline\n", | |
"collection name \n", | |
"OptimizationDataset FDA Optimization Dataset 1 None\n", | |
" JGI Metabolite Set 1 None\n", | |
" Kinase Inhibitors: WBO Distributions None\n", | |
" OpenFF Discrepancy Benchmark 1 None\n", | |
" OpenFF Ehrman Informative Optimization v0.1 None\n", | |
" OpenFF Ehrman Informative Optimization v0.2 None\n", | |
" OpenFF Full Optimization Benchmark 1 None\n", | |
" OpenFF Gen 2 Opt Set 1 Roche None\n", | |
" OpenFF Gen 2 Opt Set 2 Coverage None\n", | |
" OpenFF Gen 2 Opt Set 3 Pfizer Discrepancy None\n", | |
" OpenFF Gen 2 Opt Set 4 eMolecules Discrepancy None\n", | |
" OpenFF Gen 2 Opt Set 5 Bayer None\n", | |
" OpenFF NCI250K Boron 1 None\n", | |
" OpenFF Optimization Set 1 None\n", | |
" OpenFF Primary Optimization Benchmark 1 None\n", | |
" OpenFF Protein Fragments v1.0 Constrained optimization of various protein fr...\n", | |
" OpenFF Protein Fragments v2.0 Constrained optimization of various protein fr...\n", | |
" OpenFF VEHICLe Set 1 None\n", | |
" PEI None\n", | |
" Pfizer Discrepancy Optimization Dataset 1 None\n", | |
" QM8-T None\n", | |
" SMIRNOFF Coverage Set 1 None" | |
] | |
}, | |
"execution_count": 2, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"client = ptl.FractalClient(\"https://api.qcarchive.molssi.org:443/\")\n", | |
"client.list_collections(\"OptimizationDataset\")" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Define some high-level parameters\n", | |
"maxHeavyAtoms: Maximum number of heavy numbers in a molecule\n", | |
"\n", | |
"maxLength : Maximum topological distance between atoms in atom centered atom pair fingerprint\n", | |
"\n", | |
"cutoff : maximum distance (i.e. 1-similarity) between chemical environments to be considered in the same cluster\n", | |
"\n", | |
"include_queries : (key,value) pairs with (name of the chemical environment, SMARTS encoding) for specifying the query chemical environments" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"maxHeavyAtoms = 20\n", | |
"maxLength = 2\n", | |
"cutoff = 0.3\n", | |
"\n", | |
"include_queries = {\n", | |
" \"C-N-C\" : Chem.MolFromSmarts(\"[#6:1]-[#7:2]-[#6:3]\"),\n", | |
"# \"N-N-C\" : Chem.MolFromSmarts(\"[#7:1]-[#7:2]-[#6:3]\"),\n", | |
" \"C-O-C\" : Chem.MolFromSmarts(\"[#6:1]-[#8:2]-[#6:3]\"),\n", | |
"# \"O-O-C\" : Chem.MolFromSmarts(\"[#8:1]-[#8:2]-[#6:3]\")\n", | |
"}\n", | |
"\n", | |
"#include_elements = [6, 8, 1]\n", | |
"include_elements = [6, 8, 7, 1]\n", | |
"\n", | |
"### These are all optimization datasets in QCA \n", | |
"query_sets = client.list_collections(\"OptimizationDataset\", aslist=True)\n", | |
"\n", | |
"### These are just the ones for FF optimization\n", | |
"#query_sets = [\n", | |
"# \"FDA Optimization Dataset 1\",\n", | |
"# \"Pfizer Discrepancy Optimization Dataset 1\",\n", | |
"# \"SMIRNOFF Coverage Set 1\",\n", | |
"# \"OpenFF Gen 2 Opt Set 1 Roche\",\n", | |
"# \"OpenFF Gen 2 Opt Set 2 Coverage\",\n", | |
"# \"OpenFF Gen 2 Opt Set 3 Pfizer Discrepancy\",\n", | |
"# \"OpenFF Gen 2 Opt Set 4 eMolecules Discrepancy\",\n", | |
"# \"OpenFF Gen 2 Opt Set 5 Bayer\"] " | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Cluster chemical environments according to their atom center atom pair fingerprint\n", | |
"\n", | |
"Note that molecules may occure more than once if they a given query chemical environment shows up in a molecule multiple times." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"/home/tobias/progs/anaconda3/envs/psi4/lib/python3.7/site-packages/rdkit/Chem/Draw/IPythonConsole.py:188: UserWarning: Truncating the list of molecules to be displayed to 50. Change the maxMols value to display more.\n", | |
" % (maxMols))\n" | |
] | |
} | |
], | |
"source": [ | |
"def get_smi(fwd_index):\n", | |
" \n", | |
" bkw_index = fwd_index[::-1]\n", | |
" smi = \"\"\n", | |
" found_start = False\n", | |
" for s in bkw_index:\n", | |
" if not found_start and s == \"-\":\n", | |
" found_start = True\n", | |
" continue\n", | |
" if found_start:\n", | |
" smi += s\n", | |
" smi = smi[::-1]\n", | |
" return smi\n", | |
"\n", | |
"clustcsv_fopen = open(\"./InterestingMols.csv\", \"w\")\n", | |
"allcsv_fopen = open(\"./AllMols.csv\", \"w\")\n", | |
"\n", | |
"clustcsv_fopen.write(\"#SubstructureQuery;DataSet;Cluster;Index;Smiles\\n\")\n", | |
"allcsv_fopen.write(\"#SubstructureQuery;DataSet;Cluster;Index;Smiles\\n\")\n", | |
"\n", | |
"for substruc_name in include_queries.keys():\n", | |
" smi_list = list()\n", | |
" mol_list = list()\n", | |
" ds_list = list()\n", | |
" index_list = list()\n", | |
" fp_list = list()\n", | |
" dist_list = list()\n", | |
" for ds_name in query_sets:\n", | |
" if ds_name.startswith(\"OpenFF Protein Fragments\"):\n", | |
" continue\n", | |
" ds = client.get_collection('OptimizationDataset', ds_name)\n", | |
" for index in ds.df.index:\n", | |
" smi = get_smi(index)\n", | |
" mol = Chem.MolFromSmiles(smi)\n", | |
" if mol == None:\n", | |
" continue\n", | |
" if mol.GetNumHeavyAtoms() > maxHeavyAtoms:\n", | |
" continue\n", | |
" if not mol.HasSubstructMatch(include_queries[substruc_name]):\n", | |
" continue\n", | |
" can_smi = Chem.MolToSmiles(mol, True)\n", | |
" elements_ok = True\n", | |
" for atm in mol.GetAtoms():\n", | |
" if not atm.GetAtomicNum() in include_elements:\n", | |
" elements_ok = False\n", | |
" break\n", | |
" if not elements_ok:\n", | |
" continue\n", | |
" if can_smi not in smi_list:\n", | |
" matches = mol.GetSubstructMatches(include_queries[substruc_name])\n", | |
" for match in matches:\n", | |
" fp = Chem.GetHashedAtomPairFingerprintAsBitVect(mol, \n", | |
" maxLength=maxLength, \n", | |
" fromAtoms=list(match))\n", | |
" smi_list.append(can_smi)\n", | |
" mol_list.append(mol)\n", | |
" index_list.append(index)\n", | |
" ds_list.append(ds_name)\n", | |
" fp_list.append(fp)\n", | |
" for i in range(1,len(fp_list)):\n", | |
" sims = DataStructs.BulkTanimotoSimilarity(fp_list[i],fp_list[:i])\n", | |
" dist_list.extend([1.-x for x in sims])\n", | |
" cs = Butina.ClusterData(data=dist_list,\n", | |
" nPts=len(fp_list),\n", | |
" distThresh=cutoff,\n", | |
" isDistData=True)\n", | |
" draw_list = list()\n", | |
" legend_list = list()\n", | |
" smi_list_cluster = list()\n", | |
" for cluster_idx, cluster in enumerate(cs):\n", | |
" # Some entries may be duplicate. So here we try\n", | |
" # to get the centroids (first entry in cluster), but\n", | |
" # if it already is in our list of molecules, then\n", | |
" # choose the next available one that is not already in\n", | |
" # the list.\n", | |
" for mol_idx in cluster:\n", | |
" if not smi_list[mol_idx] in smi_list_cluster:\n", | |
" smi_list_cluster.append(smi_list[mol_idx])\n", | |
" draw_list.append(mol_list[mol_idx])\n", | |
" clustcsv_fopen.write(f\"{substruc_name};{ds_list[mol_idx]};{cluster_idx};{index_list[mol_idx]};{smi_list[mol_idx]}\\n\")\n", | |
" legend_list.append(f\"{ds_list[mol_idx]};{cluster_idx}\")\n", | |
" break\n", | |
" for mol_idx in cluster:\n", | |
" allcsv_fopen.write(f\"{substruc_name};{ds_list[mol_idx]};{cluster_idx};{index_list[mol_idx]};{smi_list[mol_idx]}\\n\")\n", | |
" img=Draw.MolsToGridImage(draw_list, \n", | |
" molsPerRow=12,\n", | |
" subImgSize=(500,500),\n", | |
" legends=legend_list)\n", | |
" img.save(f\"{substruc_name}_clustercentroids.png\")\n", | |
"\n", | |
"clustcsv_fopen.close()\n", | |
"allcsv_fopen.close()" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python [conda env:psi4]", | |
"language": "python", | |
"name": "conda-env-psi4-py" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.7.8" | |
}, | |
"latex_envs": { | |
"LaTeX_envs_menu_present": true, | |
"autoclose": false, | |
"autocomplete": true, | |
"bibliofile": "biblio.bib", | |
"cite_by": "apalike", | |
"current_citInitial": 1, | |
"eqLabelWithNumbers": true, | |
"eqNumInitial": 1, | |
"hotkeys": { | |
"equation": "Ctrl-E", | |
"itemize": "Ctrl-I" | |
}, | |
"labels_anchors": false, | |
"latex_user_defs": false, | |
"report_style_numbering": false, | |
"user_envs_cfg": false | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 4 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment