wutobias/DiverseQCA.ipynb

## DiverseQCA.ipynb
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### What is this about?\n",
    "Here we want to get a set of molecules from QCArchive that is diverse with respect to specific chemical environments. First we download molecules from QCArchive and then compute a fingerprint for a set of substructure matches. Then we cluster the fingerprints using Tanimoto similarity.\n",
    "\n",
    "Note, you need packages qcportal and rdkit"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import qcportal as ptl\n",
    "from rdkit.Chem import AllChem as Chem\n",
    "from rdkit.Chem import Draw\n",
    "from rdkit.Chem.Fingerprints import FingerprintMols\n",
    "from rdkit import DataStructs\n",
    "from rdkit.ML.Cluster import Butina"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th>tagline</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>collection</th>\n",
       "      <th>name</th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th rowspan=\"22\" valign=\"top\">OptimizationDataset</th>\n",
       "      <th>FDA Optimization Dataset 1</th>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>JGI Metabolite Set 1</th>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Kinase Inhibitors: WBO Distributions</th>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>OpenFF Discrepancy Benchmark 1</th>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>OpenFF Ehrman Informative Optimization v0.1</th>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>OpenFF Ehrman Informative Optimization v0.2</th>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>OpenFF Full Optimization Benchmark 1</th>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>OpenFF Gen 2 Opt Set 1 Roche</th>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>OpenFF Gen 2 Opt Set 2 Coverage</th>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>OpenFF Gen 2 Opt Set 3 Pfizer Discrepancy</th>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>OpenFF Gen 2 Opt Set 4 eMolecules Discrepancy</th>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>OpenFF Gen 2 Opt Set 5 Bayer</th>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>OpenFF NCI250K Boron 1</th>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>OpenFF Optimization Set 1</th>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>OpenFF Primary Optimization Benchmark 1</th>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>OpenFF Protein Fragments v1.0</th>\n",
       "      <td>Constrained optimization of various protein fr...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>OpenFF Protein Fragments v2.0</th>\n",
       "      <td>Constrained optimization of various protein fr...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>OpenFF VEHICLe Set 1</th>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>PEI</th>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Pfizer Discrepancy Optimization Dataset 1</th>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>QM8-T</th>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>SMIRNOFF Coverage Set 1</th>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                                                                             tagline\n",
       "collection          name                                                                                            \n",
       "OptimizationDataset FDA Optimization Dataset 1                                                                  None\n",
       "                    JGI Metabolite Set 1                                                                        None\n",
       "                    Kinase Inhibitors: WBO Distributions                                                        None\n",
       "                    OpenFF Discrepancy Benchmark 1                                                              None\n",
       "                    OpenFF Ehrman Informative Optimization v0.1                                                 None\n",
       "                    OpenFF Ehrman Informative Optimization v0.2                                                 None\n",
       "                    OpenFF Full Optimization Benchmark 1                                                        None\n",
       "                    OpenFF Gen 2 Opt Set 1 Roche                                                                None\n",
       "                    OpenFF Gen 2 Opt Set 2 Coverage                                                             None\n",
       "                    OpenFF Gen 2 Opt Set 3 Pfizer Discrepancy                                                   None\n",
       "                    OpenFF Gen 2 Opt Set 4 eMolecules Discrepancy                                               None\n",
       "                    OpenFF Gen 2 Opt Set 5 Bayer                                                                None\n",
       "                    OpenFF NCI250K Boron 1                                                                      None\n",
       "                    OpenFF Optimization Set 1                                                                   None\n",
       "                    OpenFF Primary Optimization Benchmark 1                                                     None\n",
       "                    OpenFF Protein Fragments v1.0                  Constrained optimization of various protein fr...\n",
       "                    OpenFF Protein Fragments v2.0                  Constrained optimization of various protein fr...\n",
       "                    OpenFF VEHICLe Set 1                                                                        None\n",
       "                    PEI                                                                                         None\n",
       "                    Pfizer Discrepancy Optimization Dataset 1                                                   None\n",
       "                    QM8-T                                                                                       None\n",
       "                    SMIRNOFF Coverage Set 1                                                                     None"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "client = ptl.FractalClient(\"https://api.qcarchive.molssi.org:443/\")\n",
    "client.list_collections(\"OptimizationDataset\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Define some high-level parameters\n",
    "maxHeavyAtoms: Maximum number of heavy numbers in a molecule\n",
    "\n",
    "maxLength    : Maximum topological distance between atoms in atom centered atom pair fingerprint\n",
    "\n",
    "cutoff       : maximum distance (i.e. 1-similarity) between chemical environments to be considered in the same cluster\n",
    "\n",
    "include_queries : (key,value) pairs with (name of the chemical environment, SMARTS encoding) for specifying the query chemical environments"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "maxHeavyAtoms = 20\n",
    "maxLength     = 2\n",
    "cutoff        = 0.3\n",
    "\n",
    "include_queries = {\n",
    "    \"C-N-C\" : Chem.MolFromSmarts(\"[#6:1]-[#7:2]-[#6:3]\"),\n",
    "#    \"N-N-C\" : Chem.MolFromSmarts(\"[#7:1]-[#7:2]-[#6:3]\"),\n",
    "    \"C-O-C\" : Chem.MolFromSmarts(\"[#6:1]-[#8:2]-[#6:3]\"),\n",
    "#    \"O-O-C\" : Chem.MolFromSmarts(\"[#8:1]-[#8:2]-[#6:3]\")\n",
    "}\n",
    "\n",
    "#include_elements = [6, 8, 1]\n",
    "include_elements = [6, 8, 7, 1]\n",
    "\n",
    "### These are all optimization datasets in QCA \n",
    "query_sets = client.list_collections(\"OptimizationDataset\", aslist=True)\n",
    "\n",
    "### These are just the ones for FF optimization\n",
    "#query_sets = [\n",
    "#    \"FDA Optimization Dataset 1\",\n",
    "#    \"Pfizer Discrepancy Optimization Dataset 1\",\n",
    "#    \"SMIRNOFF Coverage Set 1\",\n",
    "#    \"OpenFF Gen 2 Opt Set 1 Roche\",\n",
    "#    \"OpenFF Gen 2 Opt Set 2 Coverage\",\n",
    "#    \"OpenFF Gen 2 Opt Set 3 Pfizer Discrepancy\",\n",
    "#    \"OpenFF Gen 2 Opt Set 4 eMolecules Discrepancy\",\n",
    "#    \"OpenFF Gen 2 Opt Set 5 Bayer\"]    "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Cluster chemical environments according to their atom center atom pair fingerprint\n",
    "\n",
    "Note that molecules may occure more than once if they a given query chemical environment shows up in a molecule multiple times."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/tobias/progs/anaconda3/envs/psi4/lib/python3.7/site-packages/rdkit/Chem/Draw/IPythonConsole.py:188: UserWarning: Truncating the list of molecules to be displayed to 50. Change the maxMols value to display more.\n",
      "  % (maxMols))\n"
     ]
    }
   ],
   "source": [
    "def get_smi(fwd_index):\n",
    "    \n",
    "    bkw_index = fwd_index[::-1]\n",
    "    smi = \"\"\n",
    "    found_start = False\n",
    "    for s in bkw_index:\n",
    "        if not found_start and s == \"-\":\n",
    "            found_start = True\n",
    "            continue\n",
    "        if found_start:\n",
    "            smi += s\n",
    "    smi = smi[::-1]\n",
    "    return smi\n",
    "\n",
    "clustcsv_fopen = open(\"./InterestingMols.csv\", \"w\")\n",
    "allcsv_fopen   = open(\"./AllMols.csv\", \"w\")\n",
    "\n",
    "clustcsv_fopen.write(\"#SubstructureQuery;DataSet;Cluster;Index;Smiles\\n\")\n",
    "allcsv_fopen.write(\"#SubstructureQuery;DataSet;Cluster;Index;Smiles\\n\")\n",
    "\n",
    "for substruc_name in include_queries.keys():\n",
    "    smi_list   = list()\n",
    "    mol_list   = list()\n",
    "    ds_list    = list()\n",
    "    index_list = list()\n",
    "    fp_list    = list()\n",
    "    dist_list  = list()\n",
    "    for ds_name in query_sets:\n",
    "        if ds_name.startswith(\"OpenFF Protein Fragments\"):\n",
    "            continue\n",
    "        ds = client.get_collection('OptimizationDataset', ds_name)\n",
    "        for index in ds.df.index:\n",
    "            smi = get_smi(index)\n",
    "            mol = Chem.MolFromSmiles(smi)\n",
    "            if mol == None:\n",
    "                continue\n",
    "            if mol.GetNumHeavyAtoms() > maxHeavyAtoms:\n",
    "                continue\n",
    "            if not mol.HasSubstructMatch(include_queries[substruc_name]):\n",
    "                continue\n",
    "            can_smi = Chem.MolToSmiles(mol, True)\n",
    "            elements_ok = True\n",
    "            for atm in mol.GetAtoms():\n",
    "                if not atm.GetAtomicNum() in include_elements:\n",
    "                    elements_ok = False\n",
    "                    break\n",
    "            if not elements_ok:\n",
    "                continue\n",
    "            if can_smi not in smi_list:\n",
    "                matches = mol.GetSubstructMatches(include_queries[substruc_name])\n",
    "                for match in matches:\n",
    "                    fp      = Chem.GetHashedAtomPairFingerprintAsBitVect(mol, \n",
    "                                                                   maxLength=maxLength, \n",
    "                                                                   fromAtoms=list(match))\n",
    "                    smi_list.append(can_smi)\n",
    "                    mol_list.append(mol)\n",
    "                    index_list.append(index)\n",
    "                    ds_list.append(ds_name)\n",
    "                    fp_list.append(fp)\n",
    "    for i in range(1,len(fp_list)):\n",
    "        sims = DataStructs.BulkTanimotoSimilarity(fp_list[i],fp_list[:i])\n",
    "        dist_list.extend([1.-x for x in sims])\n",
    "    cs = Butina.ClusterData(data=dist_list,\n",
    "                            nPts=len(fp_list),\n",
    "                            distThresh=cutoff,\n",
    "                            isDistData=True)\n",
    "    draw_list        = list()\n",
    "    legend_list      = list()\n",
    "    smi_list_cluster = list()\n",
    "    for cluster_idx, cluster in enumerate(cs):\n",
    "        # Some entries may be duplicate. So here we try\n",
    "        # to get the centroids (first entry in cluster), but\n",
    "        # if it already is in our list of molecules, then\n",
    "        # choose the next available one that is not already in\n",
    "        # the list.\n",
    "        for mol_idx in cluster:\n",
    "            if not smi_list[mol_idx] in smi_list_cluster:\n",
    "                smi_list_cluster.append(smi_list[mol_idx])\n",
    "                draw_list.append(mol_list[mol_idx])\n",
    "                clustcsv_fopen.write(f\"{substruc_name};{ds_list[mol_idx]};{cluster_idx};{index_list[mol_idx]};{smi_list[mol_idx]}\\n\")\n",
    "                legend_list.append(f\"{ds_list[mol_idx]};{cluster_idx}\")\n",
    "                break\n",
    "        for mol_idx in cluster:\n",
    "            allcsv_fopen.write(f\"{substruc_name};{ds_list[mol_idx]};{cluster_idx};{index_list[mol_idx]};{smi_list[mol_idx]}\\n\")\n",
    "    img=Draw.MolsToGridImage(draw_list, \n",
    "                             molsPerRow=12,\n",
    "                             subImgSize=(500,500),\n",
    "                             legends=legend_list)\n",
    "    img.save(f\"{substruc_name}_clustercentroids.png\")\n",
    "\n",
    "clustcsv_fopen.close()\n",
    "allcsv_fopen.close()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [conda env:psi4]",
   "language": "python",
   "name": "conda-env-psi4-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.8"
  },
  "latex_envs": {
   "LaTeX_envs_menu_present": true,
   "autoclose": false,
   "autocomplete": true,
   "bibliofile": "biblio.bib",
   "cite_by": "apalike",
   "current_citInitial": 1,
   "eqLabelWithNumbers": true,
   "eqNumInitial": 1,
   "hotkeys": {
    "equation": "Ctrl-E",
    "itemize": "Ctrl-I"
   },
   "labels_anchors": false,
   "latex_user_defs": false,
   "report_style_numbering": false,
   "user_envs_cfg": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
	{
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### What is this about?\n",
	"Here we want to get a set of molecules from QCArchive that is diverse with respect to specific chemical environments. First we download molecules from QCArchive and then compute a fingerprint for a set of substructure matches. Then we cluster the fingerprints using Tanimoto similarity.\n",
	"\n",
	"Note, you need packages qcportal and rdkit"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 1,
	"metadata": {},
	"outputs": [],
	"source": [
	"import qcportal as ptl\n",
	"from rdkit.Chem import AllChem as Chem\n",
	"from rdkit.Chem import Draw\n",
	"from rdkit.Chem.Fingerprints import FingerprintMols\n",
	"from rdkit import DataStructs\n",
	"from rdkit.ML.Cluster import Butina"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/html": [
	"<div>\n",
	"<style scoped>\n",
	" .dataframe tbody tr th:only-of-type {\n",
	" vertical-align: middle;\n",
	" }\n",
	"\n",
	" .dataframe tbody tr th {\n",
	" vertical-align: top;\n",
	" }\n",
	"\n",
	" .dataframe thead th {\n",
	" text-align: right;\n",
	" }\n",
	"</style>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th></th>\n",
	" <th>tagline</th>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>collection</th>\n",
	" <th>name</th>\n",
	" <th></th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th rowspan=\"22\" valign=\"top\">OptimizationDataset</th>\n",
	" <th>FDA Optimization Dataset 1</th>\n",
	" <td>None</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>JGI Metabolite Set 1</th>\n",
	" <td>None</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>Kinase Inhibitors: WBO Distributions</th>\n",
	" <td>None</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>OpenFF Discrepancy Benchmark 1</th>\n",
	" <td>None</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>OpenFF Ehrman Informative Optimization v0.1</th>\n",
	" <td>None</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>OpenFF Ehrman Informative Optimization v0.2</th>\n",
	" <td>None</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>OpenFF Full Optimization Benchmark 1</th>\n",
	" <td>None</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>OpenFF Gen 2 Opt Set 1 Roche</th>\n",
	" <td>None</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>OpenFF Gen 2 Opt Set 2 Coverage</th>\n",
	" <td>None</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>OpenFF Gen 2 Opt Set 3 Pfizer Discrepancy</th>\n",
	" <td>None</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>OpenFF Gen 2 Opt Set 4 eMolecules Discrepancy</th>\n",
	" <td>None</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>OpenFF Gen 2 Opt Set 5 Bayer</th>\n",
	" <td>None</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>OpenFF NCI250K Boron 1</th>\n",
	" <td>None</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>OpenFF Optimization Set 1</th>\n",
	" <td>None</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>OpenFF Primary Optimization Benchmark 1</th>\n",
	" <td>None</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>OpenFF Protein Fragments v1.0</th>\n",
	" <td>Constrained optimization of various protein fr...</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>OpenFF Protein Fragments v2.0</th>\n",
	" <td>Constrained optimization of various protein fr...</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>OpenFF VEHICLe Set 1</th>\n",
	" <td>None</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>PEI</th>\n",
	" <td>None</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>Pfizer Discrepancy Optimization Dataset 1</th>\n",
	" <td>None</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>QM8-T</th>\n",
	" <td>None</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>SMIRNOFF Coverage Set 1</th>\n",
	" <td>None</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"</div>"
	],
	"text/plain": [
	" tagline\n",
	"collection name \n",
	"OptimizationDataset FDA Optimization Dataset 1 None\n",
	" JGI Metabolite Set 1 None\n",
	" Kinase Inhibitors: WBO Distributions None\n",
	" OpenFF Discrepancy Benchmark 1 None\n",
	" OpenFF Ehrman Informative Optimization v0.1 None\n",
	" OpenFF Ehrman Informative Optimization v0.2 None\n",
	" OpenFF Full Optimization Benchmark 1 None\n",
	" OpenFF Gen 2 Opt Set 1 Roche None\n",
	" OpenFF Gen 2 Opt Set 2 Coverage None\n",
	" OpenFF Gen 2 Opt Set 3 Pfizer Discrepancy None\n",
	" OpenFF Gen 2 Opt Set 4 eMolecules Discrepancy None\n",
	" OpenFF Gen 2 Opt Set 5 Bayer None\n",
	" OpenFF NCI250K Boron 1 None\n",
	" OpenFF Optimization Set 1 None\n",
	" OpenFF Primary Optimization Benchmark 1 None\n",
	" OpenFF Protein Fragments v1.0 Constrained optimization of various protein fr...\n",
	" OpenFF Protein Fragments v2.0 Constrained optimization of various protein fr...\n",
	" OpenFF VEHICLe Set 1 None\n",
	" PEI None\n",
	" Pfizer Discrepancy Optimization Dataset 1 None\n",
	" QM8-T None\n",
	" SMIRNOFF Coverage Set 1 None"
	]
	},
	"execution_count": 2,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"client = ptl.FractalClient(\"https://api.qcarchive.molssi.org:443/\")\n",
	"client.list_collections(\"OptimizationDataset\")"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### Define some high-level parameters\n",
	"maxHeavyAtoms: Maximum number of heavy numbers in a molecule\n",
	"\n",
	"maxLength : Maximum topological distance between atoms in atom centered atom pair fingerprint\n",
	"\n",
	"cutoff : maximum distance (i.e. 1-similarity) between chemical environments to be considered in the same cluster\n",
	"\n",
	"include_queries : (key,value) pairs with (name of the chemical environment, SMARTS encoding) for specifying the query chemical environments"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"metadata": {},
	"outputs": [],
	"source": [
	"maxHeavyAtoms = 20\n",
	"maxLength = 2\n",
	"cutoff = 0.3\n",
	"\n",
	"include_queries = {\n",
	" \"C-N-C\" : Chem.MolFromSmarts(\"[#6:1]-[#7:2]-[#6:3]\"),\n",
	"# \"N-N-C\" : Chem.MolFromSmarts(\"[#7:1]-[#7:2]-[#6:3]\"),\n",
	" \"C-O-C\" : Chem.MolFromSmarts(\"[#6:1]-[#8:2]-[#6:3]\"),\n",
	"# \"O-O-C\" : Chem.MolFromSmarts(\"[#8:1]-[#8:2]-[#6:3]\")\n",
	"}\n",
	"\n",
	"#include_elements = [6, 8, 1]\n",
	"include_elements = [6, 8, 7, 1]\n",
	"\n",
	"### These are all optimization datasets in QCA \n",
	"query_sets = client.list_collections(\"OptimizationDataset\", aslist=True)\n",
	"\n",
	"### These are just the ones for FF optimization\n",
	"#query_sets = [\n",
	"# \"FDA Optimization Dataset 1\",\n",
	"# \"Pfizer Discrepancy Optimization Dataset 1\",\n",
	"# \"SMIRNOFF Coverage Set 1\",\n",
	"# \"OpenFF Gen 2 Opt Set 1 Roche\",\n",
	"# \"OpenFF Gen 2 Opt Set 2 Coverage\",\n",
	"# \"OpenFF Gen 2 Opt Set 3 Pfizer Discrepancy\",\n",
	"# \"OpenFF Gen 2 Opt Set 4 eMolecules Discrepancy\",\n",
	"# \"OpenFF Gen 2 Opt Set 5 Bayer\"] "
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### Cluster chemical environments according to their atom center atom pair fingerprint\n",
	"\n",
	"Note that molecules may occure more than once if they a given query chemical environment shows up in a molecule multiple times."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"metadata": {},
	"outputs": [
	{
	"name": "stderr",
	"output_type": "stream",
	"text": [
	"/home/tobias/progs/anaconda3/envs/psi4/lib/python3.7/site-packages/rdkit/Chem/Draw/IPythonConsole.py:188: UserWarning: Truncating the list of molecules to be displayed to 50. Change the maxMols value to display more.\n",
	" % (maxMols))\n"
	]
	}
	],
	"source": [
	"def get_smi(fwd_index):\n",
	" \n",
	" bkw_index = fwd_index[::-1]\n",
	" smi = \"\"\n",
	" found_start = False\n",
	" for s in bkw_index:\n",
	" if not found_start and s == \"-\":\n",
	" found_start = True\n",
	" continue\n",
	" if found_start:\n",
	" smi += s\n",
	" smi = smi[::-1]\n",
	" return smi\n",
	"\n",
	"clustcsv_fopen = open(\"./InterestingMols.csv\", \"w\")\n",
	"allcsv_fopen = open(\"./AllMols.csv\", \"w\")\n",
	"\n",
	"clustcsv_fopen.write(\"#SubstructureQuery;DataSet;Cluster;Index;Smiles\\n\")\n",
	"allcsv_fopen.write(\"#SubstructureQuery;DataSet;Cluster;Index;Smiles\\n\")\n",
	"\n",
	"for substruc_name in include_queries.keys():\n",
	" smi_list = list()\n",
	" mol_list = list()\n",
	" ds_list = list()\n",
	" index_list = list()\n",
	" fp_list = list()\n",
	" dist_list = list()\n",
	" for ds_name in query_sets:\n",
	" if ds_name.startswith(\"OpenFF Protein Fragments\"):\n",
	" continue\n",
	" ds = client.get_collection('OptimizationDataset', ds_name)\n",
	" for index in ds.df.index:\n",
	" smi = get_smi(index)\n",
	" mol = Chem.MolFromSmiles(smi)\n",
	" if mol == None:\n",
	" continue\n",
	" if mol.GetNumHeavyAtoms() > maxHeavyAtoms:\n",
	" continue\n",
	" if not mol.HasSubstructMatch(include_queries[substruc_name]):\n",
	" continue\n",
	" can_smi = Chem.MolToSmiles(mol, True)\n",
	" elements_ok = True\n",
	" for atm in mol.GetAtoms():\n",
	" if not atm.GetAtomicNum() in include_elements:\n",
	" elements_ok = False\n",
	" break\n",
	" if not elements_ok:\n",
	" continue\n",
	" if can_smi not in smi_list:\n",
	" matches = mol.GetSubstructMatches(include_queries[substruc_name])\n",
	" for match in matches:\n",
	" fp = Chem.GetHashedAtomPairFingerprintAsBitVect(mol, \n",
	" maxLength=maxLength, \n",
	" fromAtoms=list(match))\n",
	" smi_list.append(can_smi)\n",
	" mol_list.append(mol)\n",
	" index_list.append(index)\n",
	" ds_list.append(ds_name)\n",
	" fp_list.append(fp)\n",
	" for i in range(1,len(fp_list)):\n",
	" sims = DataStructs.BulkTanimotoSimilarity(fp_list[i],fp_list[:i])\n",
	" dist_list.extend([1.-x for x in sims])\n",
	" cs = Butina.ClusterData(data=dist_list,\n",
	" nPts=len(fp_list),\n",
	" distThresh=cutoff,\n",
	" isDistData=True)\n",
	" draw_list = list()\n",
	" legend_list = list()\n",
	" smi_list_cluster = list()\n",
	" for cluster_idx, cluster in enumerate(cs):\n",
	" # Some entries may be duplicate. So here we try\n",
	" # to get the centroids (first entry in cluster), but\n",
	" # if it already is in our list of molecules, then\n",
	" # choose the next available one that is not already in\n",
	" # the list.\n",
	" for mol_idx in cluster:\n",
	" if not smi_list[mol_idx] in smi_list_cluster:\n",
	" smi_list_cluster.append(smi_list[mol_idx])\n",
	" draw_list.append(mol_list[mol_idx])\n",
	" clustcsv_fopen.write(f\"{substruc_name};{ds_list[mol_idx]};{cluster_idx};{index_list[mol_idx]};{smi_list[mol_idx]}\\n\")\n",
	" legend_list.append(f\"{ds_list[mol_idx]};{cluster_idx}\")\n",
	" break\n",
	" for mol_idx in cluster:\n",
	" allcsv_fopen.write(f\"{substruc_name};{ds_list[mol_idx]};{cluster_idx};{index_list[mol_idx]};{smi_list[mol_idx]}\\n\")\n",
	" img=Draw.MolsToGridImage(draw_list, \n",
	" molsPerRow=12,\n",
	" subImgSize=(500,500),\n",
	" legends=legend_list)\n",
	" img.save(f\"{substruc_name}_clustercentroids.png\")\n",
	"\n",
	"clustcsv_fopen.close()\n",
	"allcsv_fopen.close()"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python [conda env:psi4]",
	"language": "python",
	"name": "conda-env-psi4-py"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.7.8"
	},
	"latex_envs": {
	"LaTeX_envs_menu_present": true,
	"autoclose": false,
	"autocomplete": true,
	"bibliofile": "biblio.bib",
	"cite_by": "apalike",
	"current_citInitial": 1,
	"eqLabelWithNumbers": true,
	"eqNumInitial": 1,
	"hotkeys": {
	"equation": "Ctrl-E",
	"itemize": "Ctrl-I"
	},
	"labels_anchors": false,
	"latex_user_defs": false,
	"report_style_numbering": false,
	"user_envs_cfg": false
	}
	},
	"nbformat": 4,
	"nbformat_minor": 4
	}