sshojiro/extract-linear-fragments.ipynb

## extract-linear-fragments.ipynb
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Extract of Linear Fragments\n",
    "\n",
    "Functions defined in this notebook is:\n",
    "\n",
    "- bond_indices_to_nodes(mol, bond_indices)\n",
    "  - input: **mol** Chem.Mol\n",
    "  - input: **bond_indices** possible pairs of bond indices, that form linear fragments\n",
    "  - generate pairs of bond indices that form linear fragments\n",
    "- list_end_points_indices(G)\n",
    "  - input: **G** networkx.Graph\n",
    "- sort_atoms_in_indices(ix_start,g)\n",
    "    - input: **ix_start** index of the endpoint\n",
    "    - input: **g** networkx.Graph\n",
    "- generate_linear_fragments(mol, n_len=3)\n",
    "    - input: **mol** Chem.Mol\n",
    "    - input: **n_len** number of length of the fragment\n",
    "    - generate all possible linear fragments in SMILES-like format with explicit single bond"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from rdkit.Chem import rdmolops\n",
    "from rdkit import Chem\n",
    "from rdkit.Chem.Draw import IPythonConsole\n",
    "from rdkit.Chem.Draw import DrawingOptions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "smi='CC(C)CC1=CC=C(C=C1)C(C)C(=O)O'\n",
    "mol=Chem.MolFromSmiles(smi)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Warning: unable to load font metrics from dir c:\\users\\shojiro_shibayama\\miniconda3\\envs\\cheminfo\\lib\\site-packages\\rdkit\\sping\\PIL\\pilfonts\n"
     ]
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAASwAAAEsCAIAAAD2HxkiAAAGqklEQVR4nO3d0XabVhCGUejq+7+yekHjqDJWiST455zZ+yrximOM9DGAEFpvt9sC5PyVXgDoToQQJkIIEyGEiRDCRAhhIoQwEUKYCCFMhBAmQggTIYSJEMJECGEihDARQpgIIUyEECZCCBMhhIkQwkQIYSKEMBFCmAghTIQQJkIIEyGEiRDCRAhhIoQwEUKYCCFMhBAmQggTIYSJEMJECGEihDARQpgIIUyEECZCCBMhhIkQwkQIYSKEMBFCmAghTIQQJkIIEyGEiRDCRAhhIoQwEUKYCCFMhBAmQggTIYSJEMJECGEihDARQpgIIUyEECZCCBMhhIkQwkQIYSKEMBFCmAghTIQQJkIIEyGEiRDCRAhhIoQwEUKYCCFMhBAmQggTIYSJEMJECGEihDARQpgIIUyEECZCCBMhhIkQwkQIYSKEMBFCmAghTIQQJkIusa5Hv9iPCDmfAp8SIee73R6/sq47X+xKhBAmQi5nDP6XCElY13+PCR0ZLsvf6QWgn68xaCQuy2ISQtx6a7MpWn/t+fT5lRlCl93Rdf29ubn/M8R12R1VHWV1iRDK6rI7utkOC01FSmkUoUNBauqyO6rAYfR7+b7XJPz6syDrut26vYjfJULVUVaX3VFGsg3DNkQIYSKkpE7DUIRUdbutPToUIXXdenTYKMIODycjahQhI+owDLtE6IqZcU3fYZcIGdqJG9CHvLeb31zbfIsrZoxB9n0vMHH/G5OQuh72QtdfPvYDHjILbannn4TG4KC+F/iBG5Rs/+fz7z3ybz5q/ggZ1MP5mBeq+8+3//pfjvzg7Zsv63DyCI3BzkZ56CePkM00t3s89wYlofcxzhyhMbiZ5naPpy/8/VXjjgn7eDj9cHYhCnxU4ATptBEOtMm/ZjknuNPcrDcoGeaZOqsLNhb3+Q20bXrfztnR338vtBKmnYQDueCsSZ/w7o3yWw8f4eib9vuFf+13eXIFydBrpo+xI5z74vp775R2/6q3LAs6N8KzH/sJ3uRycPq9uQK1V9mJEU7z8tSpjCnG3h2dg/aOmHg77q1MEFZ3Ej4/2Jt1o8iuicfgEo/Q6XUIR/h+aVp9X/E5U3zx3ndihM77jWJ7pDxGKedOQo/rEQJ4osPKcXaUZZnisodxiTCsw5b+ZU1Wjgg72h16hmGKCJNSW/ohemsyBpe5I6z/PNsVXOwh4pzPzBEWf0rtbukv2/yPuHJmNXOEPLfbYfE4pzR5hGWfUtkxWF+r9TB5hEvhDiuIDMPTP+ZlNPNHuNTrsP4YPG9Jdj/mZVPqMbpS3bcycY2fLhw96a7EnWP7SZcI61yjXHAM7v7oh9vAXbg47bTYHd3YBr+myMZrYo0iLKLaGCSuV4QXD8ODP6tygTYQF+hyTPjlsqdU213fn64O3/3H3vm9tP1AmLMf+G2AjD5Gjiz/7suMpy3RnNpNwsVdiT/K2ntfr2PCzQUfRTbHU3OO36K+SZ4uLzjvQzMv/vBdRtc3ws2pU2uakcipOh4TvsmtwfmsjhG+dtLvy/HMBMkRTfeXvDY1hCb78y1+ScbVocOOL1FwUNuLfi4mQvYVGUEd3vsiQggTITuKjMHN9MNQhAFzP6XOMHeHIgwo/pQqNQY7ECFjKL7leocIM8o+pYzB64kwpmyHZc26xkTIb/XHYPHFe40Ik2bdtPNHRBhWp8PiY/D7Wiqy3t4nwrw6HZY1cYGLCPlSeQx+X7bKS/unRFiCYdhZx3fW1/SRTfv/3njXu5kLEuGQfhqbz9Nyw9WaRFjI8TGln5mIsIoLxpQPfqhJhI0MvTs68QlSZ0fLcZq0G5Owiq99xXdervjI7VK5mAgLOdLJO/f/dkxYkwirOHiQ9mY82itosKPzuRlTPYkQwpwdhTARQpgIIUyEECZCCBMhhIkQwkQIYSKEMBFCmAghTIQQJkII
EyGEiRDCRAhhIoQwEUKYCCFMhBAmQggTIYSJEMJECGEihDARQpgIIUyEECZCCBMhhIkQwkQIYSKEMBFCmAghTIQQJkIIEyGEiRDCRAhhIoQwEUKYCCFMhBAmQggTIYSJEMJECGEihDARQpgIIUyEECZCCBMhhIkQwkQIYSKEMBFCmAghTIQQJkIIEyGEiRDCRAhhIoQwEUKYCCFMhBAmQggTIYSJEMJECGEihDARQpgIIUyEECZCCBMhhIkQwkQIYSKEMBFCmAghTIQQJkIIEyGEiRDCRAhhIoQwEUKYCCFMhBAmQggTIYSJEMJECGEihDARQpgIIUyEECZCCBMhhP0DDi2aYUEqz6EAAAAASUVORK5CYII=\n",
      "text/plain": [
       "<PIL.Image.Image image mode=RGB size=300x300 at 0x1BA6B5ABDD8>"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def show_mol(m):\n",
    "    DrawingOptions.includeAtomNumbers=True\n",
    "    view= Chem.Draw.MolToImage(m)\n",
    "    DrawingOptions.includeAtomNumbers=False\n",
    "    return view\n",
    "show_mol(mol)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Irrelavant method\n",
    "\n",
    "`rdmolops.FindAllSubgraphsOfLengthN` is not useful this time, because we want to extract linear fragment, not branched fragments."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{7: {10: {}}, 10: {7: {}, 12: {}}, 12: {10: {}, 14: {}}, 14: {12: {}}}\n",
      "<dict_itemiterator object at 0x000001BA6C083C28>\n"
     ]
    }
   ],
   "source": [
    "import networkx as nx\n",
    "\n",
    "\n",
    "n_len=3 # 3-lengths paths\n",
    "lst_paths = rdmolops.FindAllPathsOfLengthN(mol,n_len,useBonds=True,useHs=True)\n",
    "\n",
    "G = nx.Graph()\n",
    "lst=[]\n",
    "for bond_ix in lst_paths[16]:# 1, 16\n",
    "    bond=mol.GetBondWithIdx(bond_ix)\n",
    "    lst+=[(bond.GetBeginAtomIdx(),bond.GetEndAtomIdx())]\n",
    "G.add_edges_from(lst)\n",
    "print(G.adj)\n",
    "print(G.adjacency())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[11, 14]\n",
      "[11, 10, 12, 14] [14, 12, 10, 11]\n"
     ]
    }
   ],
   "source": [
    "def bond_indices_to_nodes(m, bond_indices):\n",
    "    G = nx.Graph()\n",
    "    lst=[]\n",
    "    for bond_ix in bond_indices:\n",
    "        bond= m.GetBondWithIdx(bond_ix)\n",
    "        lst+=[(bond.GetBeginAtomIdx(),bond.GetEndAtomIdx())]\n",
    "    G.add_edges_from(lst)\n",
    "    return G\n",
    "def list_end_points_indices(g):\n",
    "    return [k for k,v in g.adjacency()if len(v)==1]\n",
    "def sort_atoms_in_indices(ix_start,g):\n",
    "    indices_sorted = []\n",
    "    atom_indices = set(g.nodes)\n",
    "    atom_indices -= {ix_start}\n",
    "    next_ix = ix_start\n",
    "    indices_sorted += [next_ix]\n",
    "    while any(k in atom_indices for k in list(g.adj[next_ix].keys())):\n",
    "        if list(g.adj[next_ix].keys())[0] in atom_indices:\n",
    "            next_ix = list(g.adj[next_ix].keys())[0]\n",
    "        elif list(g.adj[next_ix].keys())[1] in atom_indices:\n",
    "            next_ix = list(g.adj[next_ix].keys())[1]\n",
    "        indices_sorted += [next_ix]\n",
    "        atom_indices -= {next_ix}\n",
    "    return indices_sorted\n",
    "\n",
    "gg = bond_indices_to_nodes(mol,list(lst_paths[-1]))\n",
    "ind = list_end_points_indices(gg)\n",
    "print(ind)\n",
    "print(sort_atoms_in_indices(ind[0], gg),sort_atoms_in_indices(ind[1], gg))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "==========\n",
      "[(0, 1), (1, 3), (3, 4)]\n",
      "start and end [0, 4]\n",
      "function output [0, 1, 3, 4]\n",
      "==========\n",
      "[(1, 3), (3, 4), (4, 5)]\n",
      "start and end [1, 5]\n",
      "function output [1, 3, 4, 5]\n",
      "==========\n",
      "[(1, 3), (3, 4), (9, 4)]\n",
      "start and end [1, 9]\n",
      "function output [1, 3, 4, 9]\n",
      "==========\n",
      "[(1, 2), (1, 3), (3, 4)]\n",
      "start and end [2, 4]\n",
      "function output [2, 1, 3, 4]\n",
      "==========\n",
      "[(3, 4), (4, 5), (5, 6)]\n",
      "start and end [3, 6]\n",
      "function output [3, 4, 5, 6]\n",
      "==========\n",
      "[(3, 4), (9, 4), (8, 9)]\n",
      "start and end [3, 8]\n",
      "function output [3, 4, 9, 8]\n",
      "==========\n",
      "[(4, 5), (5, 6), (6, 7)]\n",
      "start and end [4, 7]\n",
      "function output [4, 5, 6, 7]\n",
      "==========\n",
      "[(9, 4), (8, 9), (7, 8)]\n",
      "start and end [4, 7]\n",
      "function output [4, 9, 8, 7]\n",
      "==========\n",
      "[(4, 5), (9, 4), (8, 9)]\n",
      "start and end [5, 8]\n",
      "function output [5, 4, 9, 8]\n",
      "==========\n",
      "[(5, 6), (6, 7), (7, 8)]\n",
      "start and end [5, 8]\n",
      "function output [5, 6, 7, 8]\n",
      "==========\n",
      "[(5, 6), (6, 7), (7, 10)]\n",
      "start and end [5, 10]\n",
      "function output [5, 6, 7, 10]\n",
      "==========\n",
      "[(5, 6), (4, 5), (9, 4)]\n",
      "start and end [6, 9]\n",
      "function output [6, 5, 4, 9]\n",
      "==========\n",
      "[(6, 7), (7, 8), (8, 9)]\n",
      "start and end [6, 9]\n",
      "function output [6, 7, 8, 9]\n",
      "==========\n",
      "[(6, 7), (7, 10), (10, 11)]\n",
      "start and end [6, 11]\n",
      "function output [6, 7, 10, 11]\n",
      "==========\n",
      "[(6, 7), (7, 10), (10, 12)]\n",
      "start and end [6, 12]\n",
      "function output [6, 7, 10, 12]\n",
      "==========\n",
      "[(7, 10), (10, 12), (12, 13)]\n",
      "start and end [7, 13]\n",
      "function output [7, 10, 12, 13]\n",
      "==========\n",
      "[(7, 10), (10, 12), (12, 14)]\n",
      "start and end [7, 14]\n",
      "function output [7, 10, 12, 14]\n",
      "==========\n",
      "[(7, 8), (7, 10), (10, 11)]\n",
      "start and end [8, 11]\n",
      "function output [8, 7, 10, 11]\n",
      "==========\n",
      "[(7, 8), (7, 10), (10, 12)]\n",
      "start and end [8, 12]\n",
      "function output [8, 7, 10, 12]\n",
      "==========\n",
      "[(8, 9), (7, 8), (7, 10)]\n",
      "start and end [9, 10]\n",
      "function output [9, 8, 7, 10]\n",
      "==========\n",
      "[(10, 11), (10, 12), (12, 13)]\n",
      "start and end [11, 13]\n",
      "function output [11, 10, 12, 13]\n",
      "==========\n",
      "[(10, 11), (10, 12), (12, 14)]\n",
      "start and end [11, 14]\n",
      "function output [11, 10, 12, 14]\n"
     ]
    }
   ],
   "source": [
    "for p in lst_paths:\n",
    "    print('='*10)\n",
    "    lst=[]\n",
    "    for bond_ix in p:\n",
    "        bond = mol.GetBondWithIdx(bond_ix)\n",
    "        lst+=[(bond.GetBeginAtomIdx(),bond.GetEndAtomIdx())]\n",
    "    print(lst)\n",
    "    gg = bond_indices_to_nodes(mol, list(p))\n",
    "    ind = list_end_points_indices(gg)\n",
    "    print('start and end', ind)\n",
    "    print('function output',sort_atoms_in_indices(ind[0],gg))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "def generate_linear_fragments(mol, n_len=3):\n",
    "    lst_paths = rdmolops.FindAllPathsOfLengthN(mol,n_len,useBonds=True,useHs=True)\n",
    "    smiles = []\n",
    "    lst_indices = []\n",
    "    for p in lst_paths:\n",
    "        lst=[]\n",
    "        for bond_ix in p:\n",
    "            bond = mol.GetBondWithIdx(bond_ix)\n",
    "            lst+=[(bond.GetBeginAtomIdx(),bond.GetEndAtomIdx())]\n",
    "        gg = bond_indices_to_nodes(mol, list(p))\n",
    "        ind = list_end_points_indices(gg)\n",
    "        atom_indices = list(sort_atoms_in_indices(ind[0],gg))\n",
    "        lst_indices += [atom_indices]\n",
    "        smi=''\n",
    "        for ix in range(n_len):\n",
    "            smi += mol.GetAtomWithIdx(atom_indices[ix]).GetSymbol()\n",
    "            bond = mol.GetBondBetweenAtoms(atom_indices[ix],atom_indices[ix+1])        \n",
    "            if bond.GetBondType() is Chem.BondType.SINGLE:\n",
    "                smi += '-'# explicitly shown, compared to aromatic bond.\n",
    "            elif bond.GetBondType() is Chem.BondType.DOUBLE:\n",
    "                smi += '='\n",
    "            elif bond.GetBondType() is Chem.BondType.TRIPLE:\n",
    "                smi += '#'\n",
    "            elif bond.GetBondType() is Chem.BondType.QUADRUPLE:\n",
    "                smi += '$'\n",
    "        else:\n",
    "            smi += mol.GetAtomWithIdx(atom_indices[ix+1]).GetSymbol()\n",
    "        smiles += [smi]\n",
    "    return smiles, lst_indices"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[0, 1, 3, 4, 5, 6],\n",
      " [0, 1, 3, 4, 9, 8],\n",
      " [1, 3, 4, 5, 6, 7],\n",
      " [1, 3, 4, 9, 8, 7],\n",
      " [2, 1, 3, 4, 5, 6],\n",
      " [2, 1, 3, 4, 9, 8],\n",
      " [3, 4, 5, 6, 7, 8],\n",
      " [3, 4, 5, 6, 7, 10],\n",
      " [3, 4, 9, 8, 7, 6],\n",
      " [3, 4, 9, 8, 7, 10],\n",
      " [4, 5, 6, 7, 8, 9],\n",
      " [4, 5, 6, 7, 10, 11],\n",
      " [4, 5, 6, 7, 10, 12],\n",
      " [4, 9, 8, 7, 6, 5],\n",
      " [4, 9, 8, 7, 10, 11],\n",
      " [4, 9, 8, 7, 10, 12],\n",
      " [5, 4, 9, 8, 7, 6],\n",
      " [5, 4, 9, 8, 7, 10],\n",
      " [5, 6, 7, 10, 12, 13],\n",
      " [5, 6, 7, 10, 12, 14],\n",
      " [6, 5, 4, 9, 8, 7],\n",
      " [7, 6, 5, 4, 9, 8],\n",
      " [8, 7, 6, 5, 4, 9],\n",
      " [9, 4, 5, 6, 7, 10],\n",
      " [9, 8, 7, 10, 12, 13],\n",
      " [9, 8, 7, 10, 12, 14]]\n"
     ]
    }
   ],
   "source": [
    "from pprint import pprint\n",
    "for l in range(1, 6):\n",
    "    smis, atom_ixs = generate_linear_fragments(mol, l)\n",
    "else:\n",
    "    pprint(atom_ixs)\n",
    "#     print('length items:', l,len(smis))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['C-C-C-CCC', 'C-C-C-CCC', 'C-C-CCCC', 'C-C-CCCC', 'C-C-C-CCC']"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "smis, atom_ixs = generate_linear_fragments(mol, l)\n",
    "smis[:5]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "==========\n",
      "SINGLE\n",
      "SINGLE\n",
      "SINGLE\n",
      "CCCC\n",
      "==========\n",
      "SINGLE\n",
      "SINGLE\n",
      "AROMATIC\n",
      "CCCC\n",
      "==========\n",
      "SINGLE\n",
      "SINGLE\n",
      "AROMATIC\n",
      "CCCC\n",
      "==========\n",
      "SINGLE\n",
      "SINGLE\n",
      "SINGLE\n",
      "CCCC\n",
      "==========\n",
      "SINGLE\n",
      "AROMATIC\n",
      "AROMATIC\n",
      "CCCC\n",
      "==========\n",
      "SINGLE\n",
      "AROMATIC\n",
      "AROMATIC\n",
      "CCCC\n",
      "==========\n",
      "AROMATIC\n",
      "AROMATIC\n",
      "AROMATIC\n",
      "CCCC\n",
      "==========\n",
      "AROMATIC\n",
      "AROMATIC\n",
      "AROMATIC\n",
      "CCCC\n",
      "==========\n",
      "AROMATIC\n",
      "AROMATIC\n",
      "AROMATIC\n",
      "CCCC\n",
      "==========\n",
      "AROMATIC\n",
      "AROMATIC\n",
      "AROMATIC\n",
      "CCCC\n",
      "==========\n",
      "AROMATIC\n",
      "AROMATIC\n",
      "SINGLE\n",
      "CCCC\n",
      "==========\n",
      "AROMATIC\n",
      "AROMATIC\n",
      "AROMATIC\n",
      "CCCC\n",
      "==========\n",
      "AROMATIC\n",
      "AROMATIC\n",
      "AROMATIC\n",
      "CCCC\n",
      "==========\n",
      "AROMATIC\n",
      "SINGLE\n",
      "SINGLE\n",
      "CCCC\n",
      "==========\n",
      "AROMATIC\n",
      "SINGLE\n",
      "SINGLE\n",
      "CCCC\n",
      "==========\n",
      "SINGLE\n",
      "SINGLE\n",
      "CCC=O\n",
      "==========\n",
      "SINGLE\n",
      "SINGLE\n",
      "SINGLE\n",
      "CCCO\n",
      "==========\n",
      "AROMATIC\n",
      "SINGLE\n",
      "SINGLE\n",
      "CCCC\n",
      "==========\n",
      "AROMATIC\n",
      "SINGLE\n",
      "SINGLE\n",
      "CCCC\n",
      "==========\n",
      "AROMATIC\n",
      "AROMATIC\n",
      "SINGLE\n",
      "CCCC\n",
      "==========\n",
      "SINGLE\n",
      "SINGLE\n",
      "CCC=O\n",
      "==========\n",
      "SINGLE\n",
      "SINGLE\n",
      "SINGLE\n",
      "CCCO\n"
     ]
    }
   ],
   "source": [
    "n_len=3 # 3-lengths paths\n",
    "lst_paths = rdmolops.FindAllPathsOfLengthN(mol,n_len,useBonds=True,useHs=True)\n",
    "for p in lst_paths:\n",
    "    print('='*10)\n",
    "    lst=[]\n",
    "    for bond_ix in p:\n",
    "        bond = mol.GetBondWithIdx(bond_ix)\n",
    "        lst+=[(bond.GetBeginAtomIdx(),bond.GetEndAtomIdx())]\n",
    "    gg = bond_indices_to_nodes(mol, list(p))\n",
    "    ind = list_end_points_indices(gg)\n",
    "    atom_indices = list(sort_atoms_in_indices(ind[0],gg))\n",
    "    \n",
    "    smi=''\n",
    "    for ix in range(n_len):\n",
    "        smi += mol.GetAtomWithIdx(atom_indices[ix]).GetSymbol()\n",
    "        bond = mol.GetBondBetweenAtoms(atom_indices[ix],atom_indices[ix+1])        \n",
    "        if bond.GetBondType() is Chem.BondType.DOUBLE:\n",
    "            smi += '='\n",
    "        elif bond.GetBondType() is Chem.BondType.TRIPLE:\n",
    "            smi += '#'\n",
    "        else:\n",
    "            print(bond.GetBondType())\n",
    "    else:\n",
    "        smi += mol.GetAtomWithIdx(atom_indices[ix+1]).GetSymbol()\n",
    "    print(smi)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Proof that GetShortestPath is not the best in this case\n",
    "\n",
    "The code below using `FindAllPathsOfLengthN` is also not the best, because atom indices in `tup_atom_inter_ix` are randomly aligned."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0 5\n",
      "0 1 3 4 5\n",
      "0 1 3 4 5\n",
      "0 9\n",
      "0 1 3 4 9\n",
      "0 1 3 4 9\n",
      "1 6\n",
      "1 3 4 5 6\n",
      "1 3 4 5 6\n",
      "1 8\n",
      "1 9 3 4 8\n",
      "1 3 4 9 8\n",
      "2 5\n",
      "2 1 3 4 5\n",
      "2 1 3 4 5\n",
      "2 9\n",
      "2 1 3 4 9\n",
      "2 1 3 4 9\n",
      "3 7\n",
      "3 4 5 6 7\n",
      "3 4 5 6 7\n",
      "3 7\n",
      "3 8 9 4 7\n",
      "3 4 5 6 7\n",
      "4 8\n",
      "4 5 6 7 8\n",
      "4 9 8\n",
      "4 10\n",
      "4 5 6 7 10\n",
      "4 5 6 7 10\n",
      "4 6\n",
      "4 8 9 7 6\n",
      "4 5 6\n",
      "4 10\n",
      "4 8 9 7 10\n",
      "4 5 6 7 10\n",
      "5 7\n",
      "5 8 9 4 7\n",
      "5 6 7\n",
      "5 9\n",
      "5 8 6 7 9\n",
      "5 4 9\n",
      "5 11\n",
      "5 10 6 7 11\n",
      "5 6 7 10 11\n",
      "5 12\n",
      "5 10 6 7 12\n",
      "5 6 7 10 12\n",
      "6 8\n",
      "6 9 4 5 8\n",
      "6 7 8\n",
      "6 13\n",
      "6 10 12 7 13\n",
      "6 7 10 12 13\n",
      "6 14\n",
      "6 10 12 7 14\n",
      "6 7 10 12 14\n",
      "7 9\n",
      "7 4 5 6 9\n",
      "7 8 9\n",
      "8 13\n",
      "8 10 12 7 13\n",
      "8 7 10 12 13\n",
      "8 14\n",
      "8 10 12 7 14\n",
      "8 7 10 12 14\n",
      "9 11\n",
      "9 8 10 7 11\n",
      "9 8 7 10 11\n",
      "9 12\n",
      "9 8 10 7 12\n",
      "9 8 7 10 12\n"
     ]
    }
   ],
   "source": [
    "n_len=4\n",
    "lst_paths = rdmolops.FindAllPathsOfLengthN(mol,n_len,\n",
    "                                           useBonds=True,\n",
    "                                           useHs=True)\n",
    "for pathset in lst_paths:\n",
    "    tup_atom_inter_ix = {mol.GetBondWithIdx(bond_ix).GetBeginAtomIdx()\n",
    "    for bond_ix in pathset[1:-1]} | {\n",
    "        mol.GetBondWithIdx(bond_ix).GetEndAtomIdx()\n",
    "    for bond_ix in pathset[1:-1]}\n",
    "    bond_ix = pathset[0]\n",
    "    atom_ix_start={mol.GetBondWithIdx(bond_ix).GetBeginAtomIdx(),\n",
    "     mol.GetBondWithIdx(bond_ix).GetEndAtomIdx()}-tup_atom_inter_ix\n",
    "\n",
    "    bond_ix = pathset[-1]\n",
    "    atom_ix_end={mol.GetBondWithIdx(bond_ix).GetBeginAtomIdx(),\n",
    "     mol.GetBondWithIdx(bond_ix).GetEndAtomIdx()}-tup_atom_inter_ix\n",
    "    print(list(atom_ix_start)[0],list(atom_ix_end)[0])\n",
    "    print(list(atom_ix_start)[0],*list(tup_atom_inter_ix),list(atom_ix_end)[0])\n",
    "    print(*rdmolops.GetShortestPath(mol,\n",
    "                             list(atom_ix_start)[0],list(atom_ix_end)[0]))\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
	{
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"# Extract of Linear Fragments\n",
	"\n",
	"Functions defined in this notebook is:\n",
	"\n",
	"- bond_indices_to_nodes(mol, bond_indices)\n",
	" - input: mol Chem.Mol\n",
	" - input: bond_indices possible pairs of bond indices, that form linear fragments\n",
	" - generate pairs of bond indices that form linear fragments\n",
	"- list_end_points_indices(G)\n",
	" - input: G networkx.Graph\n",
	"- sort_atoms_in_indices(ix_start,g)\n",
	" - input: ix_start index of the endpoint\n",
	" - input: g networkx.Graph\n",
	"- generate_linear_fragments(mol, n_len=3)\n",
	" - input: mol Chem.Mol\n",
	" - input: n_len number of length of the fragment\n",
	" - generate all possible linear fragments in SMILES-like format with explicit single bond"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 1,
	"metadata": {},
	"outputs": [],
	"source": [
	"from rdkit.Chem import rdmolops\n",
	"from rdkit import Chem\n",
	"from rdkit.Chem.Draw import IPythonConsole\n",
	"from rdkit.Chem.Draw import DrawingOptions"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"metadata": {},
	"outputs": [],
	"source": [
	"smi='CC(C)CC1=CC=C(C=C1)C(C)C(=O)O'\n",
	"mol=Chem.MolFromSmiles(smi)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Warning: unable to load font metrics from dir c:\\users\\shojiro_shibayama\\miniconda3\\envs\\cheminfo\\lib\\site-packages\\rdkit\\sping\\PIL\\pilfonts\n"
	]
	},
	{
	"data": {
	"image/png": "iVBORw0KGgoAAAANSUhEUgAAASwAAAEsCAIAAAD2HxkiAAAGqklEQVR4nO3d0XabVhCGUejq+7+yekHjqDJWiST455zZ+yrximOM9DGAEFpvt9sC5PyVXgDoToQQJkIIEyGEiRDCRAhhIoQwEUKYCCFMhBAmQggTIYSJEMJECGEihDARQpgIIUyEECZCCBMhhIkQwkQIYSKEMBFCmAghTIQQJkIIEyGEiRDCRAhhIoQwEUKYCCFMhBAmQggTIYSJEMJECGEihDARQpgIIUyEECZCCBMhhIkQwkQIYSKEMBFCmAghTIQQJkIIEyGEiRDCRAhhIoQwEUKYCCFMhBAmQggTIYSJEMJECGEihDARQpgIIUyEECZCCBMhhIkQwkQIYSKEMBFCmAghTIQQJkIIEyGEiRDCRAhhIoQwEUKYCCFMhBAmQggTIYSJEMJECGEihDARQpgIIUyEECZCCBMhhIkQwkQIYSKEMBFCmAghTIQQJkIusa5Hv9iPCDmfAp8SIee73R6/sq47X+xKhBAmQi5nDP6XCElY13+PCR0ZLsvf6QWgn68xaCQuy2ISQtx6a7MpWn/t+fT5lRlCl93Rdf29ubn/M8R12R1VHWV1iRDK6rI7utkOC01FSmkUoUNBauqyO6rAYfR7+b7XJPz6syDrut26vYjfJULVUVaX3VFGsg3DNkQIYSKkpE7DUIRUdbutPToUIXXdenTYKMIODycjahQhI+owDLtE6IqZcU3fYZcIGdqJG9CHvLeb31zbfIsrZoxB9n0vMHH/G5OQuh72QtdfPvYDHjILbannn4TG4KC+F/iBG5Rs/+fz7z3ybz5q/ggZ1MP5mBeq+8+3//pfjvzg7Zsv63DyCI3BzkZ56CePkM00t3s89wYlofcxzhyhMbiZ5naPpy/8/VXjjgn7eDj9cHYhCnxU4ATptBEOtMm/ZjknuNPcrDcoGeaZOqsLNhb3+Q20bXrfztnR338vtBKmnYQDueCsSZ/w7o3yWw8f4eib9vuFf+13eXIFydBrpo+xI5z74vp775R2/6q3LAs6N8KzH/sJ3uRycPq9uQK1V9mJEU7z8tSpjCnG3h2dg/aOmHg77q1MEFZ3Ej4/2Jt1o8iuicfgEo/Q6XUIR/h+aVp9X/E5U3zx3ndihM77jWJ7pDxGKedOQo/rEQJ4osPKcXaUZZnisodxiTCsw5b+ZU1Wjgg72h16hmGKCJNSW/ohemsyBpe5I6z/PNsVXOwh4pzPzBEWf0rtbukv2/yPuHJmNXOEPLfbYfE4pzR5hGWfUtkxWF+r9TB5hEvhDiuIDMPTP+ZlNPNHuNTrsP4YPG9Jdj/mZVPqMbpS3bcycY2fLhw96a7EnWP7SZcI61yjXHAM7v7oh9vAXbg47bTYHd3YBr+myMZrYo0iLKLaGCSuV4QXD8ODP6tygTYQF+hyTPjlsqdU213fn64O3/3H3vm9tP1AmLMf+G2AjD5Gjiz/7suMpy3RnNpNwsVdiT/K2ntfr2PCzQUfRTbHU3OO36K+SZ4uLzjvQzMv/vBdRtc3ws2pU2uakcipOh4TvsmtwfmsjhG+dtLvy/HMBMkRTfeXvDY1hCb78y1+ScbVocOOL1FwUNuLfi4mQvYVGUEd3vsiQggTITuKjMHN9MNQhAFzP6XOMHeHIgwo/pQqNQY7ECFjKL7leocIM8o+pYzB64kwpmyHZc26xkTIb/XHYPHFe40Ik2bdtPNHRBhWp8PiY/D7Wiqy3t4nwrw6HZY1cYGLCPlSeQx+X7bKS/unRFiCYdhZx3fW1/SRTfv/3njXu5kLEuGQfhqbz9Nyw9WaRFjI8TGln5mIsIoLxpQPfqhJhI0MvTs68QlSZ0fLcZq0G5Owiq99xXdervjI7VK5mAgLOdLJO/f/dkxYkwirOHiQ9mY82itosKPzuRlTPYkQwpwdhTARQpgIIUyEECZCCBMhhIkQwkQIYSKEMBFCmAghTIQQJkIIEyGEi
RDCRAhhIoQwEUKYCCFMhBAmQggTIYSJEMJECGEihDARQpgIIUyEECZCCBMhhIkQwkQIYSKEMBFCmAghTIQQJkIIEyGEiRDCRAhhIoQwEUKYCCFMhBAmQggTIYSJEMJECGEihDARQpgIIUyEECZCCBMhhIkQwkQIYSKEMBFCmAghTIQQJkIIEyGEiRDCRAhhIoQwEUKYCCFMhBAmQggTIYSJEMJECGEihDARQpgIIUyEECZCCBMhhIkQwkQIYSKEMBFCmAghTIQQJkIIEyGEiRDCRAhhIoQwEUKYCCFMhBAmQggTIYSJEMJECGEihDARQpgIIUyEECZCCBMhhP0DDi2aYUEqz6EAAAAASUVORK5CYII=\n",
	"text/plain": [
	"<PIL.Image.Image image mode=RGB size=300x300 at 0x1BA6B5ABDD8>"
	]
	},
	"execution_count": 3,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"def show_mol(m):\n",
	" DrawingOptions.includeAtomNumbers=True\n",
	" view= Chem.Draw.MolToImage(m)\n",
	" DrawingOptions.includeAtomNumbers=False\n",
	" return view\n",
	"show_mol(mol)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Irrelavant method\n",
	"\n",
	"`rdmolops.FindAllSubgraphsOfLengthN` is not useful this time, because we want to extract linear fragment, not branched fragments."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"{7: {10: {}}, 10: {7: {}, 12: {}}, 12: {10: {}, 14: {}}, 14: {12: {}}}\n",
	"<dict_itemiterator object at 0x000001BA6C083C28>\n"
	]
	}
	],
	"source": [
	"import networkx as nx\n",
	"\n",
	"\n",
	"n_len=3 # 3-lengths paths\n",
	"lst_paths = rdmolops.FindAllPathsOfLengthN(mol,n_len,useBonds=True,useHs=True)\n",
	"\n",
	"G = nx.Graph()\n",
	"lst=[]\n",
	"for bond_ix in lst_paths[16]:# 1, 16\n",
	" bond=mol.GetBondWithIdx(bond_ix)\n",
	" lst+=[(bond.GetBeginAtomIdx(),bond.GetEndAtomIdx())]\n",
	"G.add_edges_from(lst)\n",
	"print(G.adj)\n",
	"print(G.adjacency())"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 5,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"[11, 14]\n",
	"[11, 10, 12, 14] [14, 12, 10, 11]\n"
	]
	}
	],
	"source": [
	"def bond_indices_to_nodes(m, bond_indices):\n",
	" G = nx.Graph()\n",
	" lst=[]\n",
	" for bond_ix in bond_indices:\n",
	" bond= m.GetBondWithIdx(bond_ix)\n",
	" lst+=[(bond.GetBeginAtomIdx(),bond.GetEndAtomIdx())]\n",
	" G.add_edges_from(lst)\n",
	" return G\n",
	"def list_end_points_indices(g):\n",
	" return [k for k,v in g.adjacency()if len(v)==1]\n",
	"def sort_atoms_in_indices(ix_start,g):\n",
	" indices_sorted = []\n",
	" atom_indices = set(g.nodes)\n",
	" atom_indices -= {ix_start}\n",
	" next_ix = ix_start\n",
	" indices_sorted += [next_ix]\n",
	" while any(k in atom_indices for k in list(g.adj[next_ix].keys())):\n",
	" if list(g.adj[next_ix].keys())[0] in atom_indices:\n",
	" next_ix = list(g.adj[next_ix].keys())[0]\n",
	" elif list(g.adj[next_ix].keys())[1] in atom_indices:\n",
	" next_ix = list(g.adj[next_ix].keys())[1]\n",
	" indices_sorted += [next_ix]\n",
	" atom_indices -= {next_ix}\n",
	" return indices_sorted\n",
	"\n",
	"gg = bond_indices_to_nodes(mol,list(lst_paths[-1]))\n",
	"ind = list_end_points_indices(gg)\n",
	"print(ind)\n",
	"print(sort_atoms_in_indices(ind[0], gg),sort_atoms_in_indices(ind[1], gg))"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 6,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"==========\n",
	"[(0, 1), (1, 3), (3, 4)]\n",
	"start and end [0, 4]\n",
	"function output [0, 1, 3, 4]\n",
	"==========\n",
	"[(1, 3), (3, 4), (4, 5)]\n",
	"start and end [1, 5]\n",
	"function output [1, 3, 4, 5]\n",
	"==========\n",
	"[(1, 3), (3, 4), (9, 4)]\n",
	"start and end [1, 9]\n",
	"function output [1, 3, 4, 9]\n",
	"==========\n",
	"[(1, 2), (1, 3), (3, 4)]\n",
	"start and end [2, 4]\n",
	"function output [2, 1, 3, 4]\n",
	"==========\n",
	"[(3, 4), (4, 5), (5, 6)]\n",
	"start and end [3, 6]\n",
	"function output [3, 4, 5, 6]\n",
	"==========\n",
	"[(3, 4), (9, 4), (8, 9)]\n",
	"start and end [3, 8]\n",
	"function output [3, 4, 9, 8]\n",
	"==========\n",
	"[(4, 5), (5, 6), (6, 7)]\n",
	"start and end [4, 7]\n",
	"function output [4, 5, 6, 7]\n",
	"==========\n",
	"[(9, 4), (8, 9), (7, 8)]\n",
	"start and end [4, 7]\n",
	"function output [4, 9, 8, 7]\n",
	"==========\n",
	"[(4, 5), (9, 4), (8, 9)]\n",
	"start and end [5, 8]\n",
	"function output [5, 4, 9, 8]\n",
	"==========\n",
	"[(5, 6), (6, 7), (7, 8)]\n",
	"start and end [5, 8]\n",
	"function output [5, 6, 7, 8]\n",
	"==========\n",
	"[(5, 6), (6, 7), (7, 10)]\n",
	"start and end [5, 10]\n",
	"function output [5, 6, 7, 10]\n",
	"==========\n",
	"[(5, 6), (4, 5), (9, 4)]\n",
	"start and end [6, 9]\n",
	"function output [6, 5, 4, 9]\n",
	"==========\n",
	"[(6, 7), (7, 8), (8, 9)]\n",
	"start and end [6, 9]\n",
	"function output [6, 7, 8, 9]\n",
	"==========\n",
	"[(6, 7), (7, 10), (10, 11)]\n",
	"start and end [6, 11]\n",
	"function output [6, 7, 10, 11]\n",
	"==========\n",
	"[(6, 7), (7, 10), (10, 12)]\n",
	"start and end [6, 12]\n",
	"function output [6, 7, 10, 12]\n",
	"==========\n",
	"[(7, 10), (10, 12), (12, 13)]\n",
	"start and end [7, 13]\n",
	"function output [7, 10, 12, 13]\n",
	"==========\n",
	"[(7, 10), (10, 12), (12, 14)]\n",
	"start and end [7, 14]\n",
	"function output [7, 10, 12, 14]\n",
	"==========\n",
	"[(7, 8), (7, 10), (10, 11)]\n",
	"start and end [8, 11]\n",
	"function output [8, 7, 10, 11]\n",
	"==========\n",
	"[(7, 8), (7, 10), (10, 12)]\n",
	"start and end [8, 12]\n",
	"function output [8, 7, 10, 12]\n",
	"==========\n",
	"[(8, 9), (7, 8), (7, 10)]\n",
	"start and end [9, 10]\n",
	"function output [9, 8, 7, 10]\n",
	"==========\n",
	"[(10, 11), (10, 12), (12, 13)]\n",
	"start and end [11, 13]\n",
	"function output [11, 10, 12, 13]\n",
	"==========\n",
	"[(10, 11), (10, 12), (12, 14)]\n",
	"start and end [11, 14]\n",
	"function output [11, 10, 12, 14]\n"
	]
	}
	],
	"source": [
	"for p in lst_paths:\n",
	" print('='*10)\n",
	" lst=[]\n",
	" for bond_ix in p:\n",
	" bond = mol.GetBondWithIdx(bond_ix)\n",
	" lst+=[(bond.GetBeginAtomIdx(),bond.GetEndAtomIdx())]\n",
	" print(lst)\n",
	" gg = bond_indices_to_nodes(mol, list(p))\n",
	" ind = list_end_points_indices(gg)\n",
	" print('start and end', ind)\n",
	" print('function output',sort_atoms_in_indices(ind[0],gg))"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 7,
	"metadata": {},
	"outputs": [],
	"source": [
	"def generate_linear_fragments(mol, n_len=3):\n",
	"    \"\"\"Enumerate the linear fragments of ``mol`` that consist of ``n_len`` bonds.\n",
	"\n",
	"    Each path returned by ``rdmolops.FindAllPathsOfLengthN`` is turned into\n",
	"    an ordered list of atom indices and a SMILES-like string that contains\n",
	"    ``n_len + 1`` atom symbols.\n",
	"\n",
	"    Parameters\n",
	"    ----------\n",
	"    mol : Chem.Mol\n",
	"        Molecule to fragment.\n",
	"    n_len : int, default 3\n",
	"        Number of bonds in each path.\n",
	"\n",
	"    Returns\n",
	"    -------\n",
	"    smiles : list of str\n",
	"        One string per path. Single, double, triple and quadruple bonds are\n",
	"        written as '-', '=', '#' and '$'; any other bond type (e.g. aromatic)\n",
	"        adds no symbol between the two atom symbols.\n",
	"    lst_indices : list of list of int\n",
	"        Atom indices of each path, as ordered by ``sort_atoms_in_indices``\n",
	"        starting from the first end point.\n",
	"    \"\"\"\n",
	"    # Lookup table instead of an if/elif chain with identity comparisons.\n",
	"    bond_symbols = {\n",
	"        Chem.BondType.SINGLE: '-',  # explicitly shown, compared to aromatic bond.\n",
	"        Chem.BondType.DOUBLE: '=',\n",
	"        Chem.BondType.TRIPLE: '#',\n",
	"        Chem.BondType.QUADRUPLE: '$',\n",
	"    }\n",
	"    lst_paths = rdmolops.FindAllPathsOfLengthN(mol, n_len, useBonds=True, useHs=True)\n",
	"    smiles = []\n",
	"    lst_indices = []\n",
	"    for p in lst_paths:\n",
	"        gg = bond_indices_to_nodes(mol, list(p))\n",
	"        ind = list_end_points_indices(gg)\n",
	"        atom_indices = list(sort_atoms_in_indices(ind[0], gg))\n",
	"        lst_indices.append(atom_indices)\n",
	"\n",
	"        parts = []\n",
	"        for ix in range(n_len):\n",
	"            parts.append(mol.GetAtomWithIdx(atom_indices[ix]).GetSymbol())\n",
	"            bond = mol.GetBondBetweenAtoms(atom_indices[ix], atom_indices[ix + 1])\n",
	"            parts.append(bond_symbols.get(bond.GetBondType(), ''))\n",
	"        # Close the fragment with the symbol of the last atom on the path.\n",
	"        parts.append(mol.GetAtomWithIdx(atom_indices[n_len]).GetSymbol())\n",
	"        smiles.append(''.join(parts))\n",
	"    return smiles, lst_indices"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 8,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"[[0, 1, 3, 4, 5, 6],\n",
	" [0, 1, 3, 4, 9, 8],\n",
	" [1, 3, 4, 5, 6, 7],\n",
	" [1, 3, 4, 9, 8, 7],\n",
	" [2, 1, 3, 4, 5, 6],\n",
	" [2, 1, 3, 4, 9, 8],\n",
	" [3, 4, 5, 6, 7, 8],\n",
	" [3, 4, 5, 6, 7, 10],\n",
	" [3, 4, 9, 8, 7, 6],\n",
	" [3, 4, 9, 8, 7, 10],\n",
	" [4, 5, 6, 7, 8, 9],\n",
	" [4, 5, 6, 7, 10, 11],\n",
	" [4, 5, 6, 7, 10, 12],\n",
	" [4, 9, 8, 7, 6, 5],\n",
	" [4, 9, 8, 7, 10, 11],\n",
	" [4, 9, 8, 7, 10, 12],\n",
	" [5, 4, 9, 8, 7, 6],\n",
	" [5, 4, 9, 8, 7, 10],\n",
	" [5, 6, 7, 10, 12, 13],\n",
	" [5, 6, 7, 10, 12, 14],\n",
	" [6, 5, 4, 9, 8, 7],\n",
	" [7, 6, 5, 4, 9, 8],\n",
	" [8, 7, 6, 5, 4, 9],\n",
	" [9, 4, 5, 6, 7, 10],\n",
	" [9, 8, 7, 10, 12, 13],\n",
	" [9, 8, 7, 10, 12, 14]]\n"
	]
	}
	],
	"source": [
	"from pprint import pprint\n",
	"\n",
	"# Generate fragments for path lengths 1..5. After the loop `l` is 5 and\n",
	"# `smis` / `atom_ixs` hold the result of that last (longest) run.\n",
	"for l in range(1, 6):\n",
	"    smis, atom_ixs = generate_linear_fragments(mol, l)\n",
	"\n",
	"# Only the atom indices of the last run are displayed.\n",
	"pprint(atom_ixs)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 9,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"['C-C-C-CCC', 'C-C-C-CCC', 'C-C-CCCC', 'C-C-CCCC', 'C-C-C-CCC']"
	]
	},
	"execution_count": 9,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# Pass the path length explicitly (5 bonds) rather than relying on a loop\n",
	"# variable left over in the kernel, so this cell does not depend on hidden state.\n",
	"smis, atom_ixs = generate_linear_fragments(mol, n_len=5)\n",
	"smis[:5]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 10,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"==========\n",
	"SINGLE\n",
	"SINGLE\n",
	"SINGLE\n",
	"CCCC\n",
	"==========\n",
	"SINGLE\n",
	"SINGLE\n",
	"AROMATIC\n",
	"CCCC\n",
	"==========\n",
	"SINGLE\n",
	"SINGLE\n",
	"AROMATIC\n",
	"CCCC\n",
	"==========\n",
	"SINGLE\n",
	"SINGLE\n",
	"SINGLE\n",
	"CCCC\n",
	"==========\n",
	"SINGLE\n",
	"AROMATIC\n",
	"AROMATIC\n",
	"CCCC\n",
	"==========\n",
	"SINGLE\n",
	"AROMATIC\n",
	"AROMATIC\n",
	"CCCC\n",
	"==========\n",
	"AROMATIC\n",
	"AROMATIC\n",
	"AROMATIC\n",
	"CCCC\n",
	"==========\n",
	"AROMATIC\n",
	"AROMATIC\n",
	"AROMATIC\n",
	"CCCC\n",
	"==========\n",
	"AROMATIC\n",
	"AROMATIC\n",
	"AROMATIC\n",
	"CCCC\n",
	"==========\n",
	"AROMATIC\n",
	"AROMATIC\n",
	"AROMATIC\n",
	"CCCC\n",
	"==========\n",
	"AROMATIC\n",
	"AROMATIC\n",
	"SINGLE\n",
	"CCCC\n",
	"==========\n",
	"AROMATIC\n",
	"AROMATIC\n",
	"AROMATIC\n",
	"CCCC\n",
	"==========\n",
	"AROMATIC\n",
	"AROMATIC\n",
	"AROMATIC\n",
	"CCCC\n",
	"==========\n",
	"AROMATIC\n",
	"SINGLE\n",
	"SINGLE\n",
	"CCCC\n",
	"==========\n",
	"AROMATIC\n",
	"SINGLE\n",
	"SINGLE\n",
	"CCCC\n",
	"==========\n",
	"SINGLE\n",
	"SINGLE\n",
	"CCC=O\n",
	"==========\n",
	"SINGLE\n",
	"SINGLE\n",
	"SINGLE\n",
	"CCCO\n",
	"==========\n",
	"AROMATIC\n",
	"SINGLE\n",
	"SINGLE\n",
	"CCCC\n",
	"==========\n",
	"AROMATIC\n",
	"SINGLE\n",
	"SINGLE\n",
	"CCCC\n",
	"==========\n",
	"AROMATIC\n",
	"AROMATIC\n",
	"SINGLE\n",
	"CCCC\n",
	"==========\n",
	"SINGLE\n",
	"SINGLE\n",
	"CCC=O\n",
	"==========\n",
	"SINGLE\n",
	"SINGLE\n",
	"SINGLE\n",
	"CCCO\n"
	]
	}
	],
	"source": [
	"n_len = 3  # 3-lengths paths\n",
	"lst_paths = rdmolops.FindAllPathsOfLengthN(mol, n_len, useBonds=True, useHs=True)\n",
	"for p in lst_paths:\n",
	"    print('=' * 10)\n",
	"    # (begin atom, end atom) pair of every bond on the path; not used further in this cell.\n",
	"    lst = [\n",
	"        (mol.GetBondWithIdx(bond_ix).GetBeginAtomIdx(),\n",
	"         mol.GetBondWithIdx(bond_ix).GetEndAtomIdx())\n",
	"        for bond_ix in p\n",
	"    ]\n",
	"    gg = bond_indices_to_nodes(mol, list(p))\n",
	"    ind = list_end_points_indices(gg)\n",
	"    atom_indices = list(sort_atoms_in_indices(ind[0], gg))\n",
	"\n",
	"    smi = ''\n",
	"    for ix in range(n_len):\n",
	"        smi += mol.GetAtomWithIdx(atom_indices[ix]).GetSymbol()\n",
	"        bond = mol.GetBondBetweenAtoms(atom_indices[ix], atom_indices[ix + 1])\n",
	"        bond_type = bond.GetBondType()\n",
	"        if bond_type is Chem.BondType.DOUBLE:\n",
	"            smi += '='\n",
	"        elif bond_type is Chem.BondType.TRIPLE:\n",
	"            smi += '#'\n",
	"        else:\n",
	"            # Every other bond type is only reported, not written into the string.\n",
	"            print(bond_type)\n",
	"    # The loop never breaks, so the last atom's symbol is always appended.\n",
	"    smi += mol.GetAtomWithIdx(atom_indices[n_len]).GetSymbol()\n",
	"    print(smi)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Proof that GetShortestPath is not the best in this case\n",
	"\n",
	"The approach below, which collects the atom indices of each path found by `FindAllPathsOfLengthN` into the set `tup_atom_inter_ix`, is not ideal either, because a set does not keep the atoms in their order along the path."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 11,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"0 5\n",
	"0 1 3 4 5\n",
	"0 1 3 4 5\n",
	"0 9\n",
	"0 1 3 4 9\n",
	"0 1 3 4 9\n",
	"1 6\n",
	"1 3 4 5 6\n",
	"1 3 4 5 6\n",
	"1 8\n",
	"1 9 3 4 8\n",
	"1 3 4 9 8\n",
	"2 5\n",
	"2 1 3 4 5\n",
	"2 1 3 4 5\n",
	"2 9\n",
	"2 1 3 4 9\n",
	"2 1 3 4 9\n",
	"3 7\n",
	"3 4 5 6 7\n",
	"3 4 5 6 7\n",
	"3 7\n",
	"3 8 9 4 7\n",
	"3 4 5 6 7\n",
	"4 8\n",
	"4 5 6 7 8\n",
	"4 9 8\n",
	"4 10\n",
	"4 5 6 7 10\n",
	"4 5 6 7 10\n",
	"4 6\n",
	"4 8 9 7 6\n",
	"4 5 6\n",
	"4 10\n",
	"4 8 9 7 10\n",
	"4 5 6 7 10\n",
	"5 7\n",
	"5 8 9 4 7\n",
	"5 6 7\n",
	"5 9\n",
	"5 8 6 7 9\n",
	"5 4 9\n",
	"5 11\n",
	"5 10 6 7 11\n",
	"5 6 7 10 11\n",
	"5 12\n",
	"5 10 6 7 12\n",
	"5 6 7 10 12\n",
	"6 8\n",
	"6 9 4 5 8\n",
	"6 7 8\n",
	"6 13\n",
	"6 10 12 7 13\n",
	"6 7 10 12 13\n",
	"6 14\n",
	"6 10 12 7 14\n",
	"6 7 10 12 14\n",
	"7 9\n",
	"7 4 5 6 9\n",
	"7 8 9\n",
	"8 13\n",
	"8 10 12 7 13\n",
	"8 7 10 12 13\n",
	"8 14\n",
	"8 10 12 7 14\n",
	"8 7 10 12 14\n",
	"9 11\n",
	"9 8 10 7 11\n",
	"9 8 7 10 11\n",
	"9 12\n",
	"9 8 10 7 12\n",
	"9 8 7 10 12\n"
	]
	}
	],
	"source": [
	"n_len = 4\n",
	"lst_paths = rdmolops.FindAllPathsOfLengthN(mol, n_len, useBonds=True, useHs=True)\n",
	"for pathset in lst_paths:\n",
	"    # Atoms touched by the interior bonds (all bonds except the first and last).\n",
	"    # They are gathered in a set, so their order along the path is not kept.\n",
	"    inner_bonds = [mol.GetBondWithIdx(bond_ix) for bond_ix in pathset[1:-1]]\n",
	"    tup_atom_inter_ix = ({b.GetBeginAtomIdx() for b in inner_bonds}\n",
	"                         | {b.GetEndAtomIdx() for b in inner_bonds})\n",
	"\n",
	"    # An end point is the atom of the first/last bond that no interior bond touches.\n",
	"    first_bond = mol.GetBondWithIdx(pathset[0])\n",
	"    atom_ix_start = {first_bond.GetBeginAtomIdx(),\n",
	"                     first_bond.GetEndAtomIdx()} - tup_atom_inter_ix\n",
	"    last_bond = mol.GetBondWithIdx(pathset[-1])\n",
	"    atom_ix_end = {last_bond.GetBeginAtomIdx(),\n",
	"                   last_bond.GetEndAtomIdx()} - tup_atom_inter_ix\n",
	"    start_ix = list(atom_ix_start)[0]\n",
	"    end_ix = list(atom_ix_end)[0]\n",
	"\n",
	"    print(start_ix, end_ix)\n",
	"    print(start_ix, *tup_atom_inter_ix, end_ix)\n",
	"    print(*rdmolops.GetShortestPath(mol, start_ix, end_ix))\n"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.6.6"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 2
	}