hypnopump/chemnlp_rhea_x_uniprot_parsing.ipynb

## chemnlp_rhea_x_uniprot_parsing.ipynb
{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "1b9a470e-06a7-4081-b6f0-baad4aa27f4a",
   "metadata": {
    "tags": []
   },
   "source": [
    "# OpenBioML - ChemNLP Rhea x Uniprot Parsing\n",
    "\n",
    "by [@hypnopump](https://github.com/hypnopump)\n",
    "\n",
    "This script contains a fair degree of string hacking and non-standard integration of different sources in order to achieve the smallest error rate possible. Sources for all downloads are specified."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "52484b2e-0a7c-4514-ae9e-9ab8301e55a2",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "from pathlib import Path\n",
    "from tqdm.notebook import tqdm\n",
    "\n",
    "import subprocess as sp\n",
    "from rdkit import Chem\n",
    "from rdkit import RDLogger\n",
    "\n",
    "import requests\n",
    "from typing import Optional, Union\n",
    "from bs4 import BeautifulSoup\n",
    "\n",
    "from functools import lru_cache\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "068db5a6-568b-4b4a-a1f9-332fc0dc5387",
   "metadata": {},
   "outputs": [],
   "source": [
    "RDLogger.DisableLog(\"rdApp.*\")\n",
    "\n",
    "BASE_PATH = Path(\"Downloads\")\n",
    "HTML_PATH = BASE_PATH / \"rhea_html\"\n",
    "JSON_PATH = BASE_PATH / \"rhea_json\""
   ]
  },
  {
   "cell_type": "markdown",
   "id": "79f1b1d3-8ff1-40be-9c8f-80997eeb6e7a",
   "metadata": {
    "tags": []
   },
   "source": [
    "### ChEBI ID to molecules \n",
    "* This covers many molecules and avoid extra HTTP requests. \n",
    "* Seems it still fails sometimes, especially for secondary IDs, fragments, etc"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f154a4ed-27ce-48fb-9273-2d79b5256291",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# only contains ChEBI ids with valid InChis. \n",
    "# !wget https://ftp.ebi.ac.uk/pub/databases/chebi/Flat_file_tab_delimited/chebiId_inchi.tsv\n",
    "chebi_inchis = pd.read_csv(BASE_PATH / \"chebiId_inchi.tsv\", sep=\"\\t\")\n",
    "chebi_inchis[\"smiles\"] = [Chem.MolToSmiles(Chem.MolFromInchi(inchi, sanitize=False)) for inchi in tqdm(chebi_inchis.InChI)]\n",
    "\n",
    "ID2SMILES = {row[\"CHEBI_ID\"]: row[\"smiles\"] for i,row in chebi_inchis.iterrows()}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2f6f08af-ceff-42a6-9f0d-6aaa714f9208",
   "metadata": {},
   "outputs": [],
   "source": [
    "# from https://ftp.expasy.org/databases/rhea/rdf/rhea.rdf.gz\n",
    "filename = BASE_PATH / \"rhea.rdf\"\n",
    "\n",
    "reacts = {\n",
    "    \"rhea_id\": [],\n",
    "    \"equation\": [],\n",
    "}\n",
    "\n",
    "# name, ChEBI_id\n",
    "compounds_name2chebi = {} \n",
    "\n",
    "with open(filename, \"r\") as f: \n",
    "    lines = f.readlines()\n",
    "    \n",
    "for i, line in enumerate(lines): \n",
    "    if \"<rh:equation>\" in line: \n",
    "        # i-1 = label, i-2: id\n",
    "        id_ = lines[i-2].split(\"<rh:accession>\")[-1].split(\"</rh:accession>\")[0].split(\":\")[-1]\n",
    "        eq = line.split(\"<rh:equation>\")[-1].split(\"</rh:equation>\")[0]\n",
    "        eq = eq.replace(\"&gt;\", \">\").replace(\"&lt;\", \"<\")\n",
    "        reacts[\"rhea_id\"].append(id_)  \n",
    "        reacts[\"equation\"].append(eq)\n",
    "        \n",
    "    if \"<rh:accession>CHEBI\" in line: \n",
    "        chem_name = lines[i+1].split(\"<rh:name>\")[1].split(\"</rh:name>\")[0]\n",
    "        chebi_id = line.split(\"<rh:accession>CHEBI:\")[1].split(\"</rh:accession>\")[0]\n",
    "        compounds_name2chebi[chem_name] = int(chebi_id)\n",
    "\n",
    "compounds_chebi2name = {v:k for k,v in compounds_name2chebi.items()} "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d768b6da-4e23-437b-b775-03527878281f",
   "metadata": {},
   "outputs": [],
   "source": [
    "react_ids = pd.DataFrame(reacts)\n",
    "\n",
    "react_ids[\"num_reacts\"] = [eq.split(\"=\")[0].count(\" + \") + 1 for eq in react_ids.equation]\n",
    "react_ids[\"num_prods\"] = [eq.split(\"=\")[-1].count(\" + \") + 1 for eq in react_ids.equation]\n",
    "\n",
    "react_ids.max(); # 11 is max num prod/reacts\n",
    "max_num = max(react_ids.max()[[\"num_reacts\", \"num_prods\"]])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0b9456e4-950f-4476-b384-e649168d811b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# separate reactants and products\n",
    "reacts_prods = {\n",
    "    **{f\"react_{i}\": [] for i in range(max_num)},\n",
    "    **{f\"prod_{i}\": [] for i in range(max_num)},\n",
    "    **{f\"react_{i}_stech_coef\": [] for i in range(max_num)},\n",
    "    **{f\"prod_{i}_stech_coef\": [] for i in range(max_num)},\n",
    "}\n",
    "splitters = (\" = \", \" => \", \" <=> \",  \" <= \")\n",
    "\n",
    "for i, row in react_ids.iterrows():\n",
    "    splitted = False\n",
    "    for splitter in splitters: \n",
    "        if splitter in row.equation: \n",
    "            reacts, prods = row.equation.split(splitter)\n",
    "            splitted = True\n",
    "            break\n",
    "    if not splitted: \n",
    "        print(i, row.equation)\n",
    "            \n",
    "    reacts = reacts.split(\" + \")\n",
    "    prods = prods.split(\" + \")\n",
    "    for n in range(max_num):\n",
    "        for side, name in zip((reacts, prods), (\"react\", \"prod\")):\n",
    "            if len(side) <= n:\n",
    "                stech, code = \"\", \"\"\n",
    "            else: \n",
    "                # try to get the stechiometric coefficient\n",
    "                if \" \" in side[n]: \n",
    "                    stech = side[n].split(\" \")[0]\n",
    "                    code = \" \".join(side[n].split(\" \")[1:])\n",
    "                    try:\n",
    "                        stech = float(stech)\n",
    "                    except ValueError: \n",
    "                        stech, code = \"1\", side[n]\n",
    "                else: \n",
    "                    stech, code = \"1\", side[n]\n",
    "\n",
    "                # remove (in, out) marks from transporters\n",
    "                code = code.replace(\"(in)\", \"\").replace(\"(out)\", \"\")\n",
    "                \n",
    "            if code == \"2 dGDP\": \n",
    "                \"a\"+9\n",
    "            reacts_prods[f\"{name}_{n}\"].append(code)\n",
    "            reacts_prods[f\"{name}_{n}_stech_coef\"].append(stech)  "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5ff37be4-7954-405a-b4e9-711843162859",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# not all chebi ids are in the chebi downloaded file with names; query manually\n",
    "failures = set(compounds_chebi2name.keys()) - set(chebi_inchis[\"CHEBI_ID\"])\n",
    "fail_rate = len(failures) / len(compounds_name2chebi)\n",
    "f\"Names whose smiles are not found: {len(failures)} / {len(compounds_name2chebi)} = {fail_rate * 100:.3f}%\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9db83249-c548-42c0-a2d4-d0c1d20cce90",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "new_mols = {\"CHEBI_ID\": [], \"InChI\": [], \"smiles\": []}\n",
    "for f, failure in enumerate(failures): \n",
    "    if failure: # not empty str\n",
    "        mol = save_chebiid_mol(failure)\n",
    "        if mol is not None: \n",
    "            new_mols[\"CHEBI_ID\"].append(failure)\n",
    "            new_mols[\"smiles\"].append(Chem.MolToSmiles(mol))\n",
    "            # probs does not have an inchi key (fragment, etc)\n",
    "            new_mols[\"InChI\"].append(Chem.inchi.MolToInchi(mol))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3d47cd7f-3fe8-4f92-b82d-0e3d1b3ea5d6",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "new_chebi_inchis = pd.concat((chebi_inchis, pd.DataFrame(new_mols)), axis=\"rows\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b4e50707-d91f-4470-af87-414068d7f2e4",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "new_chebi_inchis[\"name\"] = [compounds_chebi2name.get(id_, \"\") for id_ in new_chebi_inchis.CHEBI_ID]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bf81997f-8634-4123-acf3-06ef5cb18440",
   "metadata": {},
   "outputs": [],
   "source": [
    "# read smiles and ChEBI ids.\n",
    "# https://ftp.expasy.org/databases/rhea/tsv/chebiId_name.tsv\n",
    "chebi2name = pd.read_csv(BASE_PATH / \"chebiId_name.tsv\", sep=\"\\t\", header=None, names=[\"chebi\", \"name\"])\n",
    "# All Rhea TSV files (except UniProtKB) can be downloaded in a single archive: rhea-tsv.tar.gz\n",
    "# See https://www.rhea-db.org/help/download\n",
    "chebi2smiles = pd.read_csv(BASE_PATH / \"rhea-chebi-smiles.tsv\", sep=\"\\t\", header=None, names=[\"chebi\", \"smiles\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4910d110-6936-4f04-8f91-e0f57d423617",
   "metadata": {},
   "outputs": [],
   "source": [
    "chebi2info = pd.merge(chebi2name, chebi2smiles, on=\"chebi\")\n",
    "chebi2info.name = chebi2info.name.apply(lambda x: x.rstrip(\" \").lstrip(\" \"))\n",
    "chebi2info.smiles = chebi2info.smiles.apply(lambda x: x.rstrip(\" \").lstrip(\" \"))\n",
    "# some points missed!\n",
    "chebi2info.shape, chebi2name.shape, chebi2smiles.shape;"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "475c3da0-0040-4bb3-ada3-cf041d02bbbd",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "chebi2info[\"chebi\"] = [int(chebi_id.split(\"CHEBI:\")[-1]) for chebi_id in chebi2info[\"chebi\"]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c56cd81d-9bcd-4c67-abfc-c50021ce26ea",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "new_from_rhea = chebi2info[~chebi2info.chebi.isin(set(new_chebi_inchis.CHEBI_ID))].rename(columns={\"chebi\": \"CHEBI_ID\"})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cd6cd17f-aca0-48da-aedd-bc48f7cd1bde",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "new_from_rhea[\"InChI\"] = None\n",
    "new_chebi_inchis = pd.concat((new_chebi_inchis, new_from_rhea), axis=\"rows\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "12f8cf7a-31ed-49f6-9632-6804be7f3994",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "react_ids = pd.DataFrame(reacts)\n",
    "\n",
    "react_ids[\"num_reacts\"] = [eq.split(\"=\")[0].count(\" + \") + 1 for eq in react_ids.equation]\n",
    "react_ids[\"num_prods\"] = [eq.split(\"=\")[-1].count(\" + \") + 1 for eq in react_ids.equation]\n",
    "\n",
    "react_ids.max(); # 11 is max num prod/reacts\n",
    "max_num = max(react_ids.max()[[\"num_reacts\", \"num_prods\"]])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fca8708d-a667-4991-93a1-0d4f9da82d71",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# separate reactants and products\n",
    "reacts_prods = {\n",
    "    **{f\"react_{i}\": [] for i in range(max_num)},\n",
    "    **{f\"prod_{i}\": [] for i in range(max_num)},\n",
    "    **{f\"react_{i}_stech_coef\": [] for i in range(max_num)},\n",
    "    **{f\"prod_{i}_stech_coef\": [] for i in range(max_num)},\n",
    "}\n",
    "splitters = (\" = \", \" => \", \" <=> \",  \" <= \")\n",
    "\n",
    "for i, row in react_ids.iterrows():\n",
    "    splitted = False\n",
    "    for splitter in splitters: \n",
    "        if splitter in row.equation: \n",
    "            reacts, prods = row.equation.split(splitter)\n",
    "            splitted = True\n",
    "            break\n",
    "    if not splitted: \n",
    "        print(i, row.equation)\n",
    "            \n",
    "    reacts = reacts.split(\" + \")\n",
    "    prods = prods.split(\" + \")\n",
    "    for n in range(max_num):\n",
    "        for side, name in zip((reacts, prods), (\"react\", \"prod\")):\n",
    "            if len(side) <= n:\n",
    "                stech, code = \"\", \"\"\n",
    "            else: \n",
    "                # try to get the stechiometric coefficient\n",
    "                if \" \" in side[n]: \n",
    "                    stech = side[n].split(\" \")[0]\n",
    "                    code = \" \".join(side[n].split(\" \")[1:])\n",
    "                    try:\n",
    "                        stech = float(stech)\n",
    "                    except ValueError: \n",
    "                        stech, code = \"1\", side[n]\n",
    "                else: \n",
    "                    stech, code = \"1\", side[n]\n",
    "\n",
    "                # remove (in, out) marks from transporters\n",
    "                code = code.replace(\"(in)\", \"\").replace(\"(out)\", \"\")\n",
    "                \n",
    "            if code == \"2 dGDP\": \n",
    "                \"a\"+9\n",
    "            reacts_prods[f\"{name}_{n}\"].append(code)\n",
    "            reacts_prods[f\"{name}_{n}_stech_coef\"].append(stech)  "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "420bf316-2f10-4177-8531-3006416c3eaf",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "react_data = pd.DataFrame(reacts_prods)\n",
    "merged = pd.concat((react_ids.iloc[:react_data.shape[0]], react_data), axis=\"columns\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "eef898ff-0413-4fc6-a29f-a61f115fef6d",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "res = set(merged[[f\"react_{i}\" for i in range(max_num)]].values.flatten().tolist())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9d8e4bb8-0025-4b9d-b409-80f83850a472",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "fails = res - set(compounds_chebi2name.values()) - set(compounds_generic2name.values()) - {\"\"}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3d7b00b2-81e9-47c7-adc6-9d13e5c9dfe7",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "f\"Names whose smiles are not found: {len(fails)} / {len(res)} = {len(fails) / len(res) * 100:.3f}%\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "471f4601-3b00-44a3-9070-053364495f97",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "(merged[[f\"react_{i}\" for i in range(max_num)]] == \"(1->3)-alpha-D-glucosyl-[(1->6)-alpha-D-glucosyl](n)\").sum(axis=1).argmax()\n",
    "merged.iloc[44902]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9a1d6921-b32d-4594-9707-a5a73887c47b",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# will have to manually scrape the remaining 5%: does not appear in files. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2a2aa7ff-8b32-4364-a8b0-e00ab972ff6c",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "has_failure = merged[[f\"react_{i}\" for i in range(max_num)]].isin(fails).sum(axis=1)\n",
    "failed_reacts = merged[has_failure.values.astype(bool)].rhea_id.values.astype(int).tolist()\n",
    "f\"Failed reactions {failed_reacts.shape[0]} / {merged.shape[0]} = {failed_reacts.shape[0] / merged.shape[0] * 100:.3f}% \""
   ]
  },
  {
   "cell_type": "markdown",
   "id": "123e24fa-d98e-4356-b162-7cac475fd457",
   "metadata": {},
   "source": [
    "### Rhea reaction parsing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9f385bfb-d021-4cd3-b770-78223573030d",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "@lru_cache\n",
    "def save_chebiid_smiles(chebi_id: str) -> Chem.Mol: \n",
    "    \"\"\" Tries to get a molecule object from a chebi id \"\"\"\n",
    "    smiles = ID2SMILES.get(chebi_id, None)\n",
    "    if smiles is None:\n",
    "        with urllib.request.urlopen(f\"https://www.ebi.ac.uk/chebi/saveStructure.do?sdf=true&chebiId={chebi_id}&imageId=0\") as f:\n",
    "            molblock = f.read().decode(\"latin-1\")\n",
    "        molblock = molblock[1:] if molblock.startswith(\"\\n\\n\") else molblock\n",
    "        mol = Chem.MolFromMolBlock(molblock, sanitize=False, removeHs=False)\n",
    "        if mol is not None: \n",
    "            smiles = Chem.MolToSmiles(mol)\n",
    "        else: \n",
    "            print(f\"ID: {chebi_id} failed mol processing. Trying scrapping\")\n",
    "            \n",
    "            # retrieval from mol failed, try pure scraping\n",
    "            try: \n",
    "                with urllib.request.urlopen(f\"https://www.ebi.ac.uk/chebi/searchId.do?chebiId={chebi_id}\") as f:\n",
    "                    html = f.read().decode(\"latin-1\").split(\"\\n\")\n",
    "                    for i, line in enumerate(html): \n",
    "                        if '<td class=\"chebiDataHeader\" style=\"width: 150px; height: 15px;\">SMILES</td>' in line: \n",
    "                            line = html[i+1].lstrip(\" \").lstrip(\"\\t\").lstrip(\" \").lstrip(\"\\t\")\n",
    "                            soup = BeautifulSoup(line, 'html.parser')\n",
    "                            smiles = soup.text\n",
    "                \n",
    "            except Exception as e:\n",
    "                print(f\"Error while scrapping: {e}. Setting smiles to empty for CHEBI:{id_}\")\n",
    "                smiles = \"\" \n",
    "        \n",
    "    return smiles\n",
    "\n",
    "def parse_rhea_id(id_: int, filepath: Optional[Union[Path, str]] = None, encoding: str = \"latin-1\") -> tuple[str, dict[int, str]]: \n",
    "    \"\"\" Loads a reaction page from Rhea and parses its reactants and products. \n",
    "    # !wget https://www.rhea-db.org/rhea/57036 -O downloads/57036.txt\n",
    "    Inputs: \n",
    "    * id_: int. rhea identifier. \n",
    "    * filepath: Path or str. Path to the Rhea HTML for a given id.  \n",
    "    Outputs: equation, {chebi_id: mol_name}\n",
    "    \"\"\"\n",
    "    if filepath is not None: \n",
    "        with open(filepath, \"r\", encoding=encoding) as f: \n",
    "            html = f.read()\n",
    "        \n",
    "        # bad encoding sometimes\n",
    "        if not 'href=\"http://www.ebi.ac.uk/' in html: \n",
    "            print(f\"Loaded content from filepath is useless. {filepath}\")\n",
    "            filepath = None\n",
    "            \n",
    "    if filepath is None: \n",
    "        try: \n",
    "            with urllib.request.urlopen(f\"https://www.rhea-db.org/rhea/{id_}\") as f:\n",
    "                html = f.read().decode(encoding)\n",
    "        except urllib.error.HTTPError as e:\n",
    "            print(f\"Rhea id failed url access: {id_}\")\n",
    "            return \"\", {}\n",
    "                    \n",
    "    lines = html.split(\"\\n\")\n",
    "    # remove trashy line starts and empty ones\n",
    "    # lines = list(map(lambda x: x.lstrip(\" \").lstrip(\"\\t\").lstrip(\" \").lstrip(\"\\t\").lstrip(\" \").lstrip(\"\\t\").lstrip(\" \"), lines))\n",
    "    lines = list(filter(len, lines))\n",
    "\n",
    "    chebi2mol = {}\n",
    "    eq_filled = False\n",
    "    for i, line in enumerate(lines):\n",
    "        # get chebi id\n",
    "        if 'href=\"http://www.ebi.ac.uk/' in line: \n",
    "            chebi_id = line.split('CHEBI:')[-1].split(\"<\")[0]\n",
    "            chebi2mol[int(chebi_id)] = {\"mol_name\": mol_name}\n",
    "        # get mol names\n",
    "        if 'onclick=\"window.attachToolTip(this)\" class=\"molName\" data-molid=' in line: \n",
    "            soup = BeautifulSoup(line, 'html.parser')\n",
    "            mol_name = soup.a.text\n",
    "        \n",
    "        # get equation string\n",
    "        if '<textarea readonly style=\"transform:scale(0,0)\" id=\"equationtext\">' in line: \n",
    "            if not eq_filled: \n",
    "                soup = BeautifulSoup(line, 'html.parser')\n",
    "                eq = soup.text.lstrip(\" \").rstip(\" \")\n",
    "                eq_filled = True \n",
    "                \n",
    "    return eq, chebi2mol\n",
    "\n",
    "\n",
    "def parse_rhea_id_with_smiles(rhea_id: int, filepath: Optional[Union[Path, str]] = None,  **kwargs) -> tuple[int, str, dict[int, tuple[str, str]]]: \n",
    "    \"\"\" Parses a full Rhea reaction to get the Rhea ID, the reaction \n",
    "        equation and compounds. \n",
    "        Inputs: \n",
    "        * rhea_id: int. id for a reaction\n",
    "        * filepath: Path or str. Path to the Rhea HTML for a given id. \n",
    "        Outputs: rhea_id, reaction_equation, {chebi_id: (cpd_name,smiles)} \n",
    "    \"\"\"\n",
    "    try: \n",
    "        eq_, cpds = parse_rhea_id(rhea_id, filepath=filepath, **kwargs)\n",
    "    except Exception as e: \n",
    "        raise ValueError(f\"ID: {rhea_id} failed scrapping: {e}\")\n",
    "        \n",
    "    new_cpds = {}\n",
    "    for id_, cpd in cpds.items(): \n",
    "        # id for a photon, electron\n",
    "        if id_ in {30212, 10545}: \n",
    "            smiles = \"\"\n",
    "        else: \n",
    "            smiles = save_chebiid_smiles(id_)\n",
    "        new_cpds[id_] = (cpd, smiles)\n",
    "    return rhea_id, eq_, new_cpds"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5d72bfd0-d12d-494a-84a6-96eaa681b936",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# single test\n",
    "id_ = 29283\n",
    "pre_react = parse_rhea_id(id_=id_)\n",
    "full_react = parse_rhea_id_with_smiles(rhea_id=id_)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c96bb9e3-4715-41f0-ab90-f3f549cf6ae2",
   "metadata": {},
   "source": [
    "### Prepare full download"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fd136aff-0ee6-4226-b90f-977681fc555f",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "all_ids = merged.rhea_id.values.astype(int).tolist() # [:100]\n",
    "all_urls = [f\"https://www.rhea-db.org/rhea/{id_}\" for id_ in all_ids]\n",
    "# download_multiple(all_urls, route=HTML_PATH, max_concurrent=32)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d4751f41-9213-4672-8d84-e248c53b0a71",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "print(f\"Prev length: {len(all_ids)}\")\n",
    "all_ids = list(filter(lambda x: Path(f\"{HTML_PATH}/{x}\").is_file(), all_ids))\n",
    "print(f\"Post length: {len(all_ids)}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "686cb2ea-ca49-4b58-944e-6385bfde71c1",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "step = 2000\n",
    "total_tac = time.time()\n",
    "for i in range(58000, len(all_ids), step): \n",
    "    tac = time.time()\n",
    "    # results = Parallel(n_jobs=-1)(delayed(parse_rhea_id_with_smiles)(id_, ) for id_ in tqdm(all_ids[i:i+step]))\n",
    "    # results = list(results)\n",
    "    results = [parse_rhea_id_with_smiles(id_, filepath=Path(f\"Downloads/rhea_html/{id_}\")) for id_ in tqdm(all_ids[i:i+step])]\n",
    "    results = [{\"rhea_id\": x[0], \"equation\": x[1], \"compounds\": x[2]} for x in results]\n",
    "    tic = time.time()\n",
    "    with open(f\"Downloads/rhea_json/rhea_{i}_{step+i}.json\", \"w\") as f:\n",
    "        json.dump(results, f)\n",
    "    print(i, f\"took: {tic - tac:.3f} seconds for {step} queries. So far: {tic - total_tac:.3f}s: {len(all_ids) - i} remaining\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d50a1444-2cfd-41f6-8a08-5ec90a1593f2",
   "metadata": {},
   "source": [
    "### Save to json"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3a8b7df0-6e6e-427d-9c29-db4196298426",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "jsons = []\n",
    "for json_file in (BASE_PATH/\"rhea_json\").iterdir(): \n",
    "    content = json.loads(open(json_file, \"r\", encoding=\"latin-1\").read())\n",
    "    jsons.extend(content)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6caeb66d-1f55-4bfa-8735-a4003710ef86",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "with open(BASE_PATH / \"rhea_json\" / \"parsed_rhea.json\", \"w\") as f:\n",
    "    for item in jsons:\n",
    "        item[\"equation\"] = item[\"equation\"]\n",
    "        json.dump(item, f)\n",
    "        f.write(\"\\n\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "dea37bd2-58e2-46e8-8c4f-39a8fcb9d7fe",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "assistant",
   "language": "python",
   "name": "assistant"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}