Skip to content

Instantly share code, notes, and snippets.

@PatWalters
Created April 7, 2024 17:24
Show Gist options
  • Save PatWalters/655d2ce569585984feae6820d27d4631 to your computer and use it in GitHub Desktop.
Save PatWalters/655d2ce569585984feae6820d27d4631 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"source": [
"A simple notebook demonstrating how to calculate a 95% confidence interval for an AUC "
],
"metadata": {
"collapsed": false
},
"id": "7129e9ea5e9a73ee"
},
{
"cell_type": "code",
"execution_count": 1,
"id": "ce245b04",
"metadata": {
"ExecuteTime": {
"end_time": "2024-04-07T17:12:09.977098Z",
"start_time": "2024-04-07T17:12:09.189458Z"
}
},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"import seaborn as sns\n",
"import useful_rdkit_utils as uru\n",
"from lightgbm import LGBMClassifier\n",
"from rdkit import Chem\n",
"from sklearn.metrics import roc_auc_score\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.utils import resample"
]
},
{
"cell_type": "markdown",
"id": "c91dc491",
"metadata": {},
"source": [
"A couple of convenience functions"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "a94bef8a",
"metadata": {
"ExecuteTime": {
"end_time": "2024-04-07T17:19:22.603515Z",
"start_time": "2024-04-07T17:19:22.595846Z"
}
},
"outputs": [],
"source": [
"def smiles_to_molecular_weight(smiles):\n",
"    \"\"\"Return the molecular weight of the molecule described by a SMILES string.\n",
"\n",
"    :param smiles: SMILES string to parse with RDKit\n",
"    :return: the value of RDKit's Descriptors.MolWt for the parsed molecule\n",
"    :raises ValueError: if RDKit cannot parse the SMILES\n",
"    \"\"\"\n",
"    # `from rdkit import Chem` does not load the Descriptors submodule, so import it\n",
"    # explicitly instead of relying on another package having imported it already.\n",
"    from rdkit.Chem import Descriptors\n",
"\n",
"    mol = Chem.MolFromSmiles(smiles)\n",
"    # MolFromSmiles reports a parse failure by returning None rather than raising\n",
"    if mol is None:\n",
"        raise ValueError(f\"Could not parse SMILES: {smiles!r}\")\n",
"    return Descriptors.MolWt(mol)\n",
"\n",
"\n",
"def log_ug_ml_to_logS(ug, molecular_weight):\n",
"    \"\"\"Convert a log10 solubility in ug/mL into logS, the log10 of the molar solubility.\n",
"\n",
"    :param ug: log10 of the solubility expressed in ug/mL\n",
"    :param molecular_weight: molecular weight in g/mol\n",
"    :return: log10 of the solubility in mol/L\n",
"    \"\"\"\n",
"    grams_per_ml = 1e-6 * 10**ug  # undo the log, then ug -> g\n",
"    mol_per_litre = 1000 * (grams_per_ml / molecular_weight)  # mol/mL -> mol/L\n",
"    return np.log10(mol_per_litre)"
]
},
{
"cell_type": "markdown",
"id": "2706abc5",
"metadata": {},
"source": [
"Read the data from [Prospective Validation of Machine Learning Algorithms for Absorption, Distribution, Metabolism, and Excretion Prediction: An Industrial Perspective](http://dx.doi.org/10.1021/acs.jcim.3c00160)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "e8c0220a",
"metadata": {
"ExecuteTime": {
"end_time": "2024-04-07T17:19:29.917269Z",
"start_time": "2024-04-07T17:19:29.735452Z"
}
},
"outputs": [],
"source": [
"df = pd.read_csv(\"https://raw.githubusercontent.com/molecularinformatics/Computational-ADME/main/ADME_public_set_3521.csv\")"
]
},
{
"cell_type": "markdown",
"id": "cb8d2db7",
"metadata": {},
"source": [
"Extract the solubility data"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "b9bd37ac",
"metadata": {
"ExecuteTime": {
"end_time": "2024-04-07T17:19:31.478655Z",
"start_time": "2024-04-07T17:19:31.474273Z"
}
},
"outputs": [],
"source": [
"sol_col = \"LOG SOLUBILITY PH 6.8 (ug/mL)\"\n",
"# Drop rows lacking a solubility value, then keep and rename the three columns used below\n",
"sol_df = (\n",
"    df.dropna(subset=sol_col)\n",
"    .loc[:, [\"SMILES\", \"Internal ID\", sol_col]]\n",
"    .rename(columns={\"Internal ID\": \"Name\", sol_col: \"log_sol_ug_ml\"})\n",
")"
]
},
{
"cell_type": "markdown",
"id": "2dd63e56",
"metadata": {},
"source": [
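"Remove data without measured values: missing entries were dropped in the previous cell, and here we also discard rows whose log solubility is not above 0 (i.e. at or below 1 ug/mL)"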
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "77cad2fa",
"metadata": {
"ExecuteTime": {
"end_time": "2024-04-07T17:19:33.192093Z",
"start_time": "2024-04-07T17:19:33.187201Z"
}
},
"outputs": [],
"source": [
"# Keep rows whose log solubility is above 0; .copy() gives an independent frame to add columns to\n",
"sol_df = sol_df.loc[sol_df.log_sol_ug_ml > 0].copy()"
]
},
{
"cell_type": "markdown",
"id": "64731b76",
"metadata": {},
"source": [
"I don't relate to data in ug/ml, so I'm going to convert to the log of molar solubility (logS)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "78412674",
"metadata": {
"ExecuteTime": {
"end_time": "2024-04-07T17:19:45.909771Z",
"start_time": "2024-04-07T17:19:45.769494Z"
}
},
"outputs": [],
"source": [
"sol_df['mw'] = sol_df.SMILES.apply(smiles_to_molecular_weight)\n",
"# log_ug_ml_to_logS only uses elementwise arithmetic and np.log10, so pass whole columns\n",
"# instead of looping over rows in Python\n",
"sol_df['logS'] = log_ug_ml_to_logS(sol_df.log_sol_ug_ml, sol_df.mw)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "b229d06a",
"metadata": {
"ExecuteTime": {
"end_time": "2024-04-07T17:19:47.822466Z",
"start_time": "2024-04-07T17:19:47.816954Z"
}
},
"outputs": [
{
"data": {
"text/plain": " SMILES Name log_sol_ug_ml \\\n0 CNc1cc(Nc2cccn(-c3ccccn3)c2=O)nn2c(C(=O)N[C@@H... Mol1 0.089905 \n1 CCOc1cc2nn(CCC(C)(C)O)cc2cc1NC(=O)c1cccc(C(F)F)n1 Mol2 0.550228 \n3 CC(C)(Oc1ccc(-c2cnc(N)c(-c3ccc(Cl)cc3)c2)cc1)C... Mol4 1.657056 \n5 CC#CC(=O)N[C@H]1CCCN(c2c(F)cc(C(N)=O)c3[nH]c(C... Mol6 1.033424 \n8 C=CC(=O)N1CCC[C@@H](n2nc(-c3ccc(Oc4ccccc4)cc3)... Mol9 0.933990 \n\n mw logS \n0 434.435 -5.548020 \n1 418.444 -5.071409 \n3 382.847 -3.925969 \n5 370.428 -4.535280 \n8 440.507 -4.709963 ",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>SMILES</th>\n <th>Name</th>\n <th>log_sol_ug_ml</th>\n <th>mw</th>\n <th>logS</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>CNc1cc(Nc2cccn(-c3ccccn3)c2=O)nn2c(C(=O)N[C@@H...</td>\n <td>Mol1</td>\n <td>0.089905</td>\n <td>434.435</td>\n <td>-5.548020</td>\n </tr>\n <tr>\n <th>1</th>\n <td>CCOc1cc2nn(CCC(C)(C)O)cc2cc1NC(=O)c1cccc(C(F)F)n1</td>\n <td>Mol2</td>\n <td>0.550228</td>\n <td>418.444</td>\n <td>-5.071409</td>\n </tr>\n <tr>\n <th>3</th>\n <td>CC(C)(Oc1ccc(-c2cnc(N)c(-c3ccc(Cl)cc3)c2)cc1)C...</td>\n <td>Mol4</td>\n <td>1.657056</td>\n <td>382.847</td>\n <td>-3.925969</td>\n </tr>\n <tr>\n <th>5</th>\n <td>CC#CC(=O)N[C@H]1CCCN(c2c(F)cc(C(N)=O)c3[nH]c(C...</td>\n <td>Mol6</td>\n <td>1.033424</td>\n <td>370.428</td>\n <td>-4.535280</td>\n </tr>\n <tr>\n <th>8</th>\n <td>C=CC(=O)N1CCC[C@@H](n2nc(-c3ccc(Oc4ccccc4)cc3)...</td>\n <td>Mol9</td>\n <td>0.933990</td>\n <td>440.507</td>\n <td>-4.709963</td>\n </tr>\n </tbody>\n</table>\n</div>"
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sol_df.head()"
]
},
{
"cell_type": "markdown",
"id": "538c6a65",
"metadata": {},
"source": [
"We'll classify everything with logS > -4 as soluble"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "a5a7110b",
"metadata": {
"ExecuteTime": {
"end_time": "2024-04-07T17:19:50.963633Z",
"start_time": "2024-04-07T17:19:50.960569Z"
}
},
"outputs": [],
"source": [
"# 1 = soluble (logS above the -4 cutoff), 0 = not soluble\n",
"sol_df['is_sol'] = sol_df.logS.gt(-4).astype(int)"
]
},
{
"cell_type": "markdown",
"id": "0feba54e",
"metadata": {},
"source": [
"Plot the distribution of logS, colored by the soluble (1) / not soluble (0) label"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "0e7b4c46",
"metadata": {
"ExecuteTime": {
"end_time": "2024-04-07T17:19:53.536502Z",
"start_time": "2024-04-07T17:19:53.317517Z"
}
},
"outputs": [
{
"data": {
"text/plain": "<Figure size 557.75x500 with 1 Axes>",
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAisAAAHqCAYAAADF8g6kAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjcuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/bCgiHAAAACXBIWXMAAA9hAAAPYQGoP6dpAAAtTklEQVR4nO3de1iXdZ7/8df3CypfBAcQ0nFyawpQG3NFXRTtshkdatJRPA41rqlJjmbO6qyYzphQVjq503ZxlU5Duaxpq4t5GErNctvZpjxFrJo/Mehq80BrcZA4g3L//nCjUDnc8j18gOfjuri6uD+f+77f94cP9uI+fR2WZVkCAAAwlNPXBQAAADSHsAIAAIxGWAEAAEYjrAAAAKMRVgAAgNEIKwAAwGiEFQAAYDTCCgAAMBphBQAAGM3f1wX4SmFhmUx8d29YWHcVF1f4uox2hTGzjzGzjzGzjzG7IiIi2NcltHucWTGIwyH5+TnlcPi6kvaDMbOPMbOPMbOPMYM7EVYAAIDRCCsAAMBohBUAAGA0wgoAADAaYQUAABiNsAIAAIxGWAEAAEYjrAAAAKMRVgAAgNEIKwAAwGiEFQAAYDTCCgAAMBphBQAAGI2wAgAAjEZYAQAARiOsAAAAoxFWAACA0QgrAADAaIQVAABgNH9fFwAA+FZIkJ+cDqvZPvWWQxfLL3upIsD3CCsAYBCnw1L1gXXN9gkYm+ylagAzcBkIAAAYjbACAACMRlgBAABGI6wAAACjEVYAAIDReBoIALyopUeTnfwJCVyDsAIAXtTSo8mB8TyWDFyNDA8AAIxGWAEAAEbzSVjJzc3VnDlzFBsbq1GjRmnZsmUqLi6WJKWkpGjgwIGKiYlp+Nq2bVvDujt37lR8fLwGDx6sKVOmKCcnxxeHAAAAvMTrYaW6ulpJSUmKiYnRX//6V73xxhu6ePGifvvb30qSTpw4odWrVysnJ6fhKzExUZJ0+PBhrV69WmvXrtXRo0c1ceJELViwQFVVVd4+DAAA4CVeDysFBQXq37+/Fi5cqK5duyo0NFSJiYk6evSoamtr9cknn2jgwIHXXTczM1Pjx4/X0KFD1aVLF82ePVuhoaHas2ePl48CAAB4i9efBrrtttv08ssvN1r21ltv6Uc/+pFyc3N16dIlpaWlKTs7W8HBwZo6daqSkpLkdDqVn5+vqVOnNlo3MjJSubm5tutwONp0GB7xTU0m1mYqxsw+xsw+d49ZS5tpzW5M//kxz+BOPn102bIsPf/883r33Xe1efNmFRYWKjY2VjNnztRzzz2nU6dOaeHChXI6nUpKSlJFRYVcLlejbQQEBKiystL2vnv2DHbXYbidybWZijGzjzGzzy1jVlepwMBuzXZpqV1+DoWHt4+fH/MM7uCzsFJeXq4VK1bo5MmT2rx5s/r166d+/fpp1KhRDX0GDRqkWbNmac+ePUpKSpLL5VJ1dXWj7VRXVys0NNT2/ouKymQ1/V4mn3A4rvxim1ibqRgz+xgz+9w5ZqFBTtVU1jTZ7pJU1Uy7JHW7bKmktKxthXgY8+xb7SVYmswnYeXMmTN6+OGH1adPH23fvl1hYWGSpHfeeUeFhYW6//77G/rW1tYqICBAkhQVFaW8vLxG28rPz9fo0aNt12BZMvYXyOTaTMWY2ceY2eeuMWtpE63ZRXv52THP4A5ev8G2tLRUs2bN0pAhQ/TKK680BBXpymWhNWvW6ODBg7IsSzk5Odq0aVPD00DTpk1TVlaWDh06pLq6OmVkZKioqEjx8fHePgwAAOAlXj+zsmPHDhUUFGjv3r3at29fo7acnBytWLFCqampunDhgsLDw7Vo0SIlJCRIkuLi4pSSktLQHhkZqfT0dIWEhHj7MAAAgJc4LKtznqArLDTvOqrDceXapom1mYoxs48xs8+dYxYW7Gzxs4Eq3266XZICxiaruKy+bYV4GPPsWxER3LPSVrxuHwAAGI2wAgAAjEZYAQAARi
OsAAAAoxFWAACA0QgrAADAaIQVAABgNMIKAAAwGmEFAAAYjbACAACMRlgBAABGI6wAAACjEVYAAIDRCCsAAMBohBUAAGA0wgoAADAaYQUAABiNsAIAAIxGWAEAAEYjrAAAAKMRVgAAgNEIKwAAwGiEFQAAYDTCCgAAMBphBQAAGI2wAgAAjEZYAQAARiOsAAAAoxFWAACA0QgrAADAaIQVAABgNMIKAAAwGmEFAAAYjbACAACMRlgBAABGI6wAAACjEVYAAIDRCCsAAMBohBUAAGA0wgoAADAaYQUAABiNsAIAAIxGWAEAAEYjrAAAAKMRVgAAgNEIKwAAwGiEFQAAYDTCCgAAMBphBQAAGI2wAgAAjEZYAQAARiOsAAAAoxFWAACA0QgrAADAaIQVAABgNMIKAAAwGmEFAAAYjbACAACMRlgBAABGI6wAAACjEVYAAIDRCCsAAMBohBUAAGA0wgoAADAaYQUAABiNsAIAAIxGWAEAAEYjrAAAAKMRVgAAgNF8ElZyc3M1Z84cxcbGatSoUVq2bJmKi4slSceOHdP06dMVExOjMWPGKDMzs9G6O3fuVHx8vAYPHqwpU6YoJyfHF4cAAAC8xOthpbq6WklJSYqJidFf//pXvfHGG7p48aJ++9vfqrS0VPPmzdOkSZN09OhRPf3001qzZo2OHz8uSTp8+LBWr16ttWvX6ujRo5o4caIWLFigqqoqbx8GAPiM0+lQWLCzya+QID9flwi4lb+3d1hQUKD+/ftr4cKF8vPzU9euXZWYmKhly5Zp//79CgkJ0YwZMyRJcXFxmjBhgrZs2aJBgwYpMzNT48eP19ChQyVJs2fP1rZt27Rnzx5NnTrV24cCAD5iqfrAuiZbA8Yme7EWwPO8HlZuu+02vfzyy42WvfXWW/rRj36kvLw8RUdHN2qLjIzU9u3bJUn5+fnXhJLIyEjl5ubarsPhsL2Kx31Tk4m1mYoxs48xs8/dY9bSZlqzmxa34eOfL/MM7uT1sPJdlmXp+eef17vvvqvNmzdr06ZNcrlcjfoEBASosrJSklRRUdFsux09ewbfeOEeZnJtpmLM7GPM7HPLmNVVKjCwW7NdWmpvsY+fQ+HhZvx8mWdwB5+FlfLycq1YsUInT57U5s2b1a9fP7lcLpWVlTXqV11dre7du0uSXC6Xqqurr2kPDQ21vf+iojJZ1o3X7wkOx5VfbBNrMxVjZh9jZp87xyw0yKmaypom212Sqpppb02fbpctlZSWNdnuDcyzb5kSHNszn4SVM2fO6OGHH1afPn20fft2hYWFSZKio6P1/vvvN+qbn5+vqKgoSVJUVJTy8vKuaR89erTtGixLxv4CmVybqRgz+xgz+9w1Zi1tojW7aHEbhvxsmWdwB68/DVRaWqpZs2ZpyJAheuWVVxqCiiTFx8ersLBQGRkZqqur06FDh5SVldVwn8q0adOUlZWlQ4cOqa6uThkZGSoqKlJ8fLy3DwMAAHiJ18+s7NixQwUFBdq7d6/27dvXqC0nJ0cbN27U008/rbS0NIWFhWnlypUaMWKEpCtPB6WkpCg1NVUXLlxQZGSk0tPTFRIS4u3DAAAAXuL1sDJnzhzNmTOnyfY777xTW7dubbI9ISFBCQkJnigNAAAYiNftAwAAoxFWAACA0QgrAADAaIQVAABgNMIKAAAwGmEFAAAYjbACAACMRlgBAABGI6wAAACjEVYAAIDRCCsAAMBohBUAAGA0wgoAADAaYQUAABiNsAIAAIxGWAEAAEYjrAAAAKMRVgAAgNEIKwAAwGiEFQAAYDTCCgAAMJq/rwsAgI4kJMhPTofVZLuTPxEB2wgrAOBGToel6gPrmmwPjE/2YjVAx0DGBwAARiOsAAAAoxFWAACA0QgrAADAaIQVAABgNMIKAAAwGmEFAAAYjbACAACMRlgBAABGI6wAAACjEVYAAIDRCCsAAMBohBUAAGA0wgoAADAaYQUAABiNsAIAAIxGWAEAAEYjrA
AAAKMRVgAAgNEIKwAAwGiEFQAAYDTCCgAAMBphBQAAGI2wAgAAjEZYAQAARiOsAAAAoxFWAACA0QgrAADAaIQVAABgNMIKAAAwGmEFAAAYjbACAACMRlgBAABGI6wAAACjEVYAAIDRCCsAAMBohBUAAGA0wgoAADAaYQUAABiNsAIAAIxGWAEAAEYjrAAAAKMRVgAAgNEIKwAAwGiEFQAAYDTCCgAAMJpPw0pxcbHi4+N1+PDhhmUpKSkaOHCgYmJiGr62bdvW0L5z507Fx8dr8ODBmjJlinJycnxROgAA8BJ/X+04Oztby5cv15kzZxotP3HihFavXq3Jkydfs87hw4e1evVqpaena9CgQdqyZYsWLFigd999Vy6Xy1ulAwAAL/LJmZWdO3dq6dKlWrJkSaPltbW1+uSTTzRw4MDrrpeZmanx48dr6NCh6tKli2bPnq3Q0FDt2bPHG2UDAAAf8MmZlbvuuksTJkyQv79/o8CSm5urS5cuKS0tTdnZ2QoODtbUqVOVlJQkp9Op/Px8TZ06tdG2IiMjlZuba7sGh6PNh+F239RkYm2mYszsY8zssztmLXVra3urtuHjny/zDO7kk7ASERFx3eVlZWWKjY3VzJkz9dxzz+nUqVNauHChnE6nkpKSVFFRcc3lnoCAAFVWVtquoWfP4Buq3RtMrs1UjJl9jJl9rRqzukoFBnZrtktb21vs4+dQeLgZP1/mGdzBZ/esXM+oUaM0atSohu8HDRqkWbNmac+ePUpKSpLL5VJ1dXWjdaqrqxUaGmp7X0VFZbKsNpfsVg7HlV9sE2szFWNmH2Nmn50xCw1yqqaypsl2l6SqNrS3pk+3y5ZKSsuaL9TDmGffMiU4tmdGhZV33nlHhYWFuv/++xuW1dbWKiAgQJIUFRWlvLy8Ruvk5+dr9OjRtvdlWTL2F8jk2kzFmNnHmNnX2jFrqUtb21u1DUN+tswzuINR71mxLEtr1qzRwYMHZVmWcnJytGnTJiUmJkqSpk2bpqysLB06dEh1dXXKyMhQUVGR4uPjfVw5AADwFLedWSkvL1dQUFCbthEfH68VK1YoNTVVFy5cUHh4uBYtWqSEhARJUlxcnFJSUhraIyMjlZ6erpCQEDccAQAAMJHtsBIbG6sjR45cs/zHP/6xPvzwQ9sFnD59utH3999/f6PLQFdLSEhoCC8AAKDja1VY+fzzz7Vq1SpZlqXy8nI9+OCDjdrLy8vVo0cPjxQIAAA6t1aFlVtuuUX33HOPSkpK9NFHHyk2NrZRe9euXTVmzBiPFAgAADq3Vl8GmjFjhiTp5ptv1qRJkzxVDwAAQCO271mZNGmSjh8/rs8++0zWVc+jEWIAAIC72Q4rzz33nNLT0xURESF//29XdzgchBUAAOB2tsPK7t279cc//lF33323J+oBAABoxPZL4SorK2/ojbEAAAA3wnZY+fGPf6ysrCxP1AIAAHAN25eBampqtHz5cv3xj39UeHh4o7ZNmza5rTAAAADpBsJKdHS0oqOjPVELAADANWyHlUcffdQTdQAAADcpKCjQ+PHj9eabb6pPnz5e3//MmTMVGxurRYsWuWV7tsPKihUrmmxbs2ZNm4oBAABt16dPH+Xk5Pi6DLexfYPt1UpKSrR3714FBga6ox4AANBG586dU79+/XTu3Dm99tpr+ulPf6phw4ZpwoQJyszMbNU2ysvLtWTJEg0fPlyjRo3S3Llz9emnnza0Z2Zmavz48RoyZIgmTJigP//5z546HPtnVq539uSDDz7Qa6+95paCAACAe1iWpTVr1mj37t267bbb9N5772nhwoW6++67ddNNNzW77saNG1VeXq6//OUvcjqdWrVqlf7pn/5JGzZs0I4dO7R27Vq98MILio2N1ZEjR/Too4/K5XIpPj7e7cfR5jMrkjRy5EgdOnTIHZsCAABu8r//+7+yLEtbt25Vdna24uLi9N///d8tBhVJCggIUG5urnbt2qULFy7omWee0Y
YNGyRJr7/+uhITExUXFyc/Pz/FxcUpMTFRW7du9chxtDmsXLp0Sbt27VJYWJg76gEAAG7y/e9/X6+++qrOnz+v+fPnKzY2Vs8884xqampaXPfhhx/W3LlztX37dt1777267777tH//fklSYWGh+vbt26j/zTffrPPnz3vkOGxfBurfv78cDkejZX5+fvrd737ntqIAAEDbFRcX6/Lly3rxxRdVX1+vjz76SL/+9a/1wx/+UDNmzGh23dOnT2vMmDGaPXu2ysrK9Nprr2nJkiU6dOiQbr75Zp05c6ZR/7NnzyoiIsIjx2E7rFz94jen06lbbrnFYwUCAIAb43A49NBDD+mll15SXFycevXqJUkKDQ1tcd3MzEydPHlSL774osLCwhQUFKTAwEB17dpV06ZN0+9+9zuNHj1asbGxOnr0qLZt26ZVq1Z55Dhsh5XY2FjV19fr448/1rlz53TTTTepZ8+enqgNAAC0QWhoqFatWqXU1FR9+eWXCg4O1i9/+Uvdd999La77m9/8Rk8++aTGjx+vmpoa3XbbbVq/fr26deum++67T+Xl5XrqqadUUFCgXr16admyZZo0aZJHjsN2WPnqq680f/585ebmKiQkRCUlJbr11lu1ceNG9e7d2xM1AgAAG26++WadPn1akjRt2jRNmzbN9ja6d++u3//+9022T58+XdOnT79u26uvvmp7f82xfYPt73//e9166606cuSI3n//fR0+fFgDBgzghXAAAMAjbJ9ZOXTokPbt26fu3btLkoKDg5WamqqxY8e6vTgAAOB+x48f16xZs5ps79Onj958800vVtQ822Glvr7+mqeBHA6HunTp4raiAACA5wwaNKhdvY7f9mWg4cOHKzU1VZWVlZKkiooKpaamKjY21u3FAQAA2D6zkpycrDlz5ig2NlYhISG6ePGibr/9dv3pT3/yRH0AAKCTsxVWLMvSpUuX9Oabb+rDDz9UUVGRzp8/r7lz58rPz89TNQIAgE6s1ZeBKisr9cADD+jZZ5+Vv7+/RowYoREjRuiFF17QzJkzGy4LAQAAuFOrw8qGDRvUpUsXPfHEEw3LevbsqXfffVeXLl3SSy+95JECAQBA59bqsPLWW2/pqaeeuuZttT179tQTTzyhffv2ub04AADQtHrL6pD7ulqr71kpKirSLbfcct22AQMG6KuvvnJbUQBgqpAgPzkdTf+j7WzzZ9kDred0OLT3xBcqrqj16H7CunfVfXd+3/Z6RUVFevzxx3XkyBH5+flp4sSJeuyxx+Tvb+/5nlb3DgoKUklJyXU//OjixYtyuVy2dgwA7ZHTYan6wLom2wPjk71YDSAVV9Tqy7IaX5dxXYsXL1avXr303nvvqbCwUAsWLFBGRoaSkpJsbafVfwPExcVpy5Yt12177bXXNHjwYFs7BgAAHdfnn3+uI0eOKDk5WS6XS3379tUjjzzSZJZoTqvPrPzqV7/SlClTVFJSonHjxikiIkJffvml9u7dq9dff12bN2+2vXMAANAx5eXlKSQkRL169WpYdvvtt6ugoEBff/21evTo0epttTqs/PCHP9Qrr7yilJQUbdmyRQ6HQ5ZlKTo6Wunp6Ro4cKC9owAAAB1WRUXFNbeIfPN9ZWWlZ8KKJA0ZMkRZWVk6e/asiouLFRERoT59+tjZBAAA6AQCAwNVVVXVaNk333/zYcitZft1+5LUt29f9e3b90ZWBQAAnUBUVJQuXryowsJChYeHS5I+/fRT9e7dW8HBwba2xUN2AADA7W699VYNHTpUzzzzjMrLy3X27FmtX79e06ZNs72tGzqzAgAAzBDWvaux+0hLS9OTTz6psWPHyul0atKkSXrkkUdsb4ewAgBAO1VvWTf0srYb3ZfT4bC1Tnh4uNLS0tq8by4DAQDQTtkND+1lX9fs22d7BgAAaAXCCgAAMBphBQAAGI2wAgAAjEZYAQAARiOsAAAAoxFWAACA0QgrAADAaIQVAADaK6u+Y+7rKr
xuHwCA9srhlP7fn6WKQs/up3u4dMfEG169uLhYiYmJeuqppzR8+HDb6xNWAABozyoKpfILvq6iSdnZ2Vq+fLnOnDlzw9vgMhAAAPCInTt3aunSpVqyZEmbtkNYAQAAHnHXXXfp7bff1rhx49q0HS4DAQAAj4iIiHDLdjizAgAAjEZYAQAARiOsAAAAo3HPCgAA7Vn38I6xj2YQVgAAaK+s+ja9rM32vhw3fkHm9OnTN7wul4EAAGiv2hAejN7XVQgrAADAaIQVAABgNMIKAAAwGmEFAAAYjbACAACMRlgBAABGI6wAAACjEVYAAIDRfBpWiouLFR8fr8OHDzcsO3bsmKZPn66YmBiNGTNGmZmZjdbZuXOn4uPjNXjwYE2ZMkU5OTneLhsAAHiRz8JKdna2EhMTdebMmYZlpaWlmjdvniZNmqSjR4/q6aef1po1a3T8+HFJ0uHDh7V69WqtXbtWR48e1cSJE7VgwQJVVVX56jAAAICH+SSs7Ny5U0uXLtWSJUsaLd+/f79CQkI0Y8YM+fv7Ky4uThMmTNCWLVskSZmZmRo/fryGDh2qLl26aPbs2QoNDdWePXt8cRgAAMALfBJW7rrrLr399tsaN25co+V5eXmKjo5utCwyMlK5ubmSpPz8/GbbAQBAx+OTT12OiIi47vKKigq5XK5GywICAlRZWdmqdjscDtureNw3NZlYm6kYM/sYM/uuHrOWhs7T7a3aho9/vswzuJNPwkpTXC6XysrKGi2rrq5W9+7dG9qrq6uvaQ8NDbW9r549g2+8UA8zuTZTMWb2MWb29ewZLNVVKjCwW7P9PN3eYh8/h8LDzfj5Ms/gDkaFlejoaL3//vuNluXn5ysqKkqSFBUVpby8vGvaR48ebXtfRUVlsqwbr9UTHI4rv9gm1mYqxsw+xsy+745ZSHenaiprmuzrklTlwfbW9Ol22VJJaVmT7d7APPuWKcGxPTPqPSvx8fEqLCxURkaG6urqdOjQIWVlZWnq1KmSpGnTpikrK0uHDh1SXV2dMjIyVFRUpPj4eNv7siwzv0yuzdQvxowx8+aYSZLVzJen21u9DYPGrLN/oe2MOrMSGhqqjRs36umnn1ZaWprCwsK0cuVKjRgxQpIUFxenlJQUpaam6sKFC4qMjFR6erpCQkJ8WzgAAPAYn4eV06dPN/r+zjvv1NatW5vsn5CQoISEBE+XBQAADGHUZSAAAICrEVYAAIDRCCsAAMBohBUAAGA0wgoAADAaYQUAABiNsAIAAIxGWAEAAEbz+UvhAADu5XQ6FBbc9N+i9ZZDF8sve7EioG0IKwDQ4ViqPrCuydaAsclerAVoOy4DAQAAoxFWAACA0QgrAADAaIQVAABgNMIKAAAwGmEFAAAYjbACAACMRlgBAABGI6wAAACjEVYAAIDRCCsAAMBohBUAAGA0wgoAADAaYQUAABiNsAIAAIxGWAEAAEYjrAAAAKP5+7oAAPAmV1CAHI6m2y1Lqiqv9l5BAFpEWAHQqTgc0vNv5TbZvvje/l6sBkBrcBkIAAAYjbACAACMRlgBAABGI6wAAACjEVYAAIDRCCsAAMBohBUAAGA03rMCAN/hcDoUGBxwzfKq2styBQWorgP8ied0OhQW3PSB1FsOXSy/7MWKgOYRVgDgu6zrvzQuMLCbKitr9JufdYSXxlmqPrCuydaAsclerAVoWQf4GwEAAHRkhBUAAGA0wgoAADAaYQUAABiNsAIAAIxGWAEAAEYjrAAAAKMRVgAAgNEIKwAAwGiEFQAAYDTCCgAAMBphBQAAGI2wAgAAjEZYAQAARiOsAAAAoxFWAACA0QgrAADAaP6+LgAA3MUVFCCHo/k+jpY6ADAOYQVAh+FwSM+/ldtsnyU/G+ClagC4C5eBAACA0QgrAADAaIQVAABgNMIKAAAwGmEFAAAYjbACAACMRlgBAABGI6wAAACjEVYAAIDRCCsAAMBohBUAAGA0wgoAADCakWFlz549uuOOOx
QTE9PwlZycLEk6duyYpk+frpiYGI0ZM0aZmZk+rhYAAHiSkZ+6fOLECSUkJGjNmjWNlpeWlmrevHn69a9/rcTERB09elQLFy5Uv379NGjQIB9VCwAAPMnIMysnTpzQwIEDr1m+f/9+hYSEaMaMGfL391dcXJwmTJigLVu2+KBKAADgDcaFlfr6ep08eVL/+Z//qZ/85CcaPXq0Hn/8cZWWliovL0/R0dGN+kdGRio3N9dH1QIAAE8z7jJQcXGx7rjjDt17771KS0tTSUmJHnvsMSUnJysiIkIul6tR/4CAAFVWVtrej8Phrord55uaTKzNVIyZfYyZfVePWUtD5+l2r9TQxvnBPIM7GRdWwsPDG13WcblcSk5O1i9+8QtNmTJF1dXVjfpXV1ere/futvfTs2dwm2v1FJNrMxVjZl9HHLOq2ssKDOzWfCeHmu/TTLvL1U1yOFrch6fbPb4PP4fCw90zPzriPIP3GRdWcnNz9cYbb+gf//Ef5fi/SF5bWyun06lBgwbpX//1Xxv1z8/PV1RUlO39FBWVybLcUrLbOBxXfrFNrM1UjJl9HXnMXEEBqqysab6Tpeb7XKfd4bgSVKqqaiTLanZ9l6QqD7Z7Yx/dLlsqKS1rtoaWdOR5Zpe7gl9nZlxYCQkJ0ZYtW/S9731Pc+bM0Zdffql169Zp8uTJuvfee/WHP/xBGRkZmjFjhrKzs5WVlaX169fb3o9lydhfIJNrMxVjZh9j1nrfjFPDf1vq7+F2r9TgprnBPIM7GHeDbe/evfXSSy/pwIEDio2N1dSpU3XnnXdq1apVCg0N1caNG7Vv3z4NHz5cK1eu1MqVKzVixAhflw0AADzEuDMrkhQbG6utW7det+3OO+9ssg0AAHQ8xp1ZAQAA+C7CCgAAMBphBQAAGI2wAgAAjEZYAQAARiOsAAAAoxFWAACA0QgrAADAaIQVAABgNMIKAAAwGmEFAAAYjbACAACMRlgBAABGI6wAAACjEVYAAIDR/H1dAICOwxUUIIej6XbLkqrKq71XEIAOgbACwG0cDun5t3KbbF98b38vVgOgoyCsAGg3Wjpz42iuEUC7RVgB0G60dOZmyc8GeLEaAN5CWAHgNQ6nQ4HBAU22c08LgOshrADwHot7WgDYx6PLAADAaIQVAABgNMIKAAAwGmEFAAAYjRtsARijpaeFvPIeFYfk6nqd/dTXXVnOq1wAryOsADBHC08Lees9Kpc+++CaZV26+OtS3SVJ/bxSA4BvcRkIAAAYjbACAACMRlgBAABGI6wAAACjEVYAAIDRCCsAAMBohBUAAGA0wgoAADAaYQUAABiNsAIAAIzG6/YBAI04nQ6FBTf9t2y95dDF8sterAidHWEFAHAVS9UH1jXZGjA22Yu1AIQVAP/HFRSg5j7U2LKkqvJq7xUEAP+HsAJAkuRwNP+Jx4vv7e/FagDgW9xgCwAAjMaZFcANOsMlFIfTocDggOb7NDcIAHCDCCuAG3SKSyhW88coSUt+NsBLxQDoTAgrQAfR0tkdh9Mhq96SJFXVXpYrqPFZEs6KADAVYQUwRFsvJbV0dmfJzwY0tAcGdlNlZc017QBgIsIKYIhOcSkJAG4AYQVt1hluLgUA+A5hBW3W4uWH+wY0+xSJN8JMWwNVi/eDeOF+j5aexuGeEwAdFWEFntfCUyTeuLzR1kssrbkfxONaGEfuOQHQURFW0CJPn1Vo6YwBl5EAoHMjrKBFHj+rYMCZFwCAuQgraPdq6q59Z8jVWjr74+n7QXj7KzoSp9OhsODmP62l3mI+w30IK2j36t3xZlVP3w/C21/RoViqPrCu2R4BY5O9VAs6Az7IEAAAGI0zK4bjHSYAgM6OsGI43moKAOjsuAwEAACMxpmVdo53lAAAOjrCSnvHO0oAAB0cYQUA4HZOp0Oqq1Ro0PXvNqi3HLpYftnLVaG9Iqx0ci09bST5/mVlLV7qkuXFagC0jiX9NU01lTXX/Q3lPSywg7DSyb
X0tJFkwMvKWnph2328TA0AOjKeBgIAAEYjrAAAAKMRVgAAgNG4Z8XHrr7Btaq28ScIe/rTfn198ywAAC0hrHhYS0/bOBwO/fO+Uw3fBwZ2U2VlTcP3nv60X5/fPAugU3I6HQoLbvrkPo8247sIKx7W0tM2hAUAnZOl6gPrmmzl0WZ8V7sMK0VFRXr88cd15MgR+fn5aeLEiXrsscfk7+/dw2kP7ygBAKC9a5dhZfHixerVq5fee+89FRYWasGCBcrIyFBSUpJX62gX7ygBAKCda3dh5fPPP9eRI0f0X//1X3K5XOrbt68eeeQRrVu3zuthBQDgGyFBfnI6mn57Nfe8dCztLqzk5eUpJCREvXr1alh2++23q6CgQF9//bV69Ojhw+oAAN7gdLTtnpeWwo5E4DFJuwsrFRUVcrlcjZZ9831lZWWrw4rTKVlu+EiZrv4tv6qmpT7ftDscUlc/py53cTaqrbXre6rdhBqaav/mlqBuV42ZN2twV7u3amhqnrlj++6q0ZPtrdpGly7XLOvSxV8OXZlwDv9uza7v6XYTamhNjfLrKnWRmsoELW3D2cKPsi3rOx2Wav6S1uz63e7+dYs1wDscluWO/2V7z9tvv62VK1fq8OHDDctOnz6tiRMn6sMPP1RwcLAPqwMAAO7W7jJjVFSULl68qMLCwoZln376qXr37k1QAQCgA2p3YeXWW2/V0KFD9cwzz6i8vFxnz57V+vXrNW3aNF+XBgAAPKDdXQaSpMLCQj355JM6fPiwnE6nJk2apKVLl8rPz8/XpQEAADdrl2EFAAB0Hu3uMhAAAOhcCCsAAMBohBUAAGA0wgoAADAaYcWHjh07pv79+ysmJqbha8aMGU32T0lJ0cCBAxv137Ztmxcr9j27Y3bs2DFNnz5dMTExGjNmjDIzM71YrXmSk5M1c+bMZvswzxprzZgxz6RTp07pwQcf1NChQzV8+HAlJyerpKSkyf7MM9hiwWdeffVV6+///u9b3X/y5MnWjh07PFiR+eyM2cWLF63Y2Fhr8+bNVl1dnfXBBx9YMTEx1rFjxzxcpZkyMzOt/v37tzh+zLNvtWbMmGeWVVNTY40aNcp64YUXrLq6Oqu0tNSaNWuWtWzZsibXYZ7BDs6s+NCJEyc0cODAVvWtra3VJ5980ur+HZWdMdu/f79CQkI0Y8YM+fv7Ky4uThMmTNCWLVs8XKV58vPztX79ek2fPr3Zfsyzb7V2zJhnUteuXbV//34tWLBA/v7+Ki0tVVVVlcLCwq7bn3kGu9rdBxm2J9XV1bpw4cJ12yIiInTixAmFh4frnnvuUXl5uWJjY7V8+XL17t37mv65ubm6dOmS0tLSlJ2dreDgYE2dOlVJSUlydqBP2nLnmOXl5Sk6OrrRssjISG3fvt0jtftKS2PmdDq1ZMkSpaSk6Pjx4/rss8+a3BbzzP6YMc+ujFlgYKAk6f7771dOTo4iIyM1d+7c6/bvLPMM7kNY8aBjx47pwQcfvG5bWlqabrrpJo0cOVIPPPCA6urqtHr1as2bN087d+685m28ZWVlio2N1cyZM/Xcc8/p1KlTWrhwoZxOp5KSkrxxOF7hzjG73id0BwQEqLKy0mP1+0JzY/biiy/qP/7jPzRq1CjdfffdOn78eLPbYp7ZHzPm2ZUx++lPfypJysjIUE1NjVJTUzVnzhzt2rWr0/57Bjfy9XUofKuoqMiKjo62Tp8+3ar+6enp1uTJkz1cldmaG7PVq1dbixYtarRs06ZNVkJCgpeq873du3dbkydPtmpqaizLsqy0tDRb90lZVuebZ3bHjHl2fYWFhVZ0dLR18uTJVvXvbPMM9nC+zUe++OILrVmzRhUVFQ3LamtrJV35q+xq77zzjrZu3dpoWW1t7XX7dlR2xyw6Olp5eXmNluXn5ysqKsqzhRpk9+7d+uyzzzRy5EgNGzZMf/rTn5Sdna1hw4apoKDgmv7MM/tjxjyTzp07pzFjxujLL79sWPbN7+
b3vve9a/ozz2Cbr9NSZ1VVVWWNGjXKWr16tVVdXW0VFRVZ8+fPt2bNmnXd/vv377cGDRpkffDBB1Z9fb310UcfWcOHD7d27drl3cJ9yO6YFRcXW8OGDbP+5V/+xaqtrbUOHjxoxcTEWAcPHvRu4QZp6SwB8+xaLY0Z88yy6uvrrcmTJ1uLFy+2ysvLraKiIutXv/qVlZSUdN3+zDPYRVjxoVOnTlmzZ8+2hg0bZg0bNsxaunSpVVJS0tA+btw4a8OGDQ3f/9u//Zt1zz33WH/7t39rjR071tq8ebMPqvYtu2N2/PhxKzEx0YqJibHGjh1rvf766z6o2hzX+x8v86x5rRkz5pllffHFF9ajjz5qxcbGWnfddZeVmppqff311w3tzDO0BZ+6DAAAjMY9KwAAwGiEFQAAYDTCCgAAMBphBQAAGI2wAgAAjEZYAQAARiOsAAAAoxFWAACA0QgrQCd07tw59evXT+fOnWvztrKzszVnzhwNGzZMMTEx+vnPf66NGzeK900CcBd/XxcAoP06e/as5syZoyeffFIvvfSS/P39dfz4cS1atEjV1dV65JFHfF0igA6AsAJ0cufPn9e6det0+PBhOZ1OjRgxQo899phuuukmSdIHH3ygZ599VmfOnFF0dLSGDh2q48eP69VXX9WJEyfUpUsXjRs3Tl27dpUkDR48WCtWrHDLWRsAkLgMBHRqly5d0kMPPSQ/Pz/t379fe/fulSTNnz9fly5d0rlz5zR//nw98MADOnLkiJYuXapt27Y1rD98+HC5XC5NnTpVGzZs0MGDB1VeXq5x48Zp3rx5vjosAB0MYQXoxD788EOdPXtWTzzxhIKDg9WjRw898cQTys3N1ccff6ysrCwNGDBAiYmJ8vf317Bhw/SLX/yiYf2ePXtq9+7dGjt2rN5++23NnTtXw4cP18MPP6z/+Z//8d2BAehQCCtAJ1ZUVKTQ0FAFBQU1LAsKClJISIjOnz+vL774Qj/4wQ8ardO3b99G3/fs2VOLFy/Wjh07lJ2drfT0dFVUVOihhx7S5cuXvXIcADo2wgrQicXGxqqkpETl5eUNy8rKylRSUqKIiAj94Ac/UEFBQaN1vvt9cnKy/uEf/qHhe5fLpZEjRyolJUXnz59XaWmp5w8CQIdHWAE6sbCwMEVGRiolJUVlZWUqKytTamqq/uZv/kZDhgxRQkKCTp06pV27duny5cs6duyY/v3f/71h/Z///Oc6cOCANm3apAsXLsiyLBUUFCg9PV1/93d/p7CwMB8eHYCOwmHxMgSg0zl37pzGjh2rAwcOyM/PT2vXrtXRo0dVW1urkSNHavny5erTp48k6S9/+YueffZZFRQUaMCAAfr+97+vkpISbdy4UZL0/vvv6+WXX9bJkydVXV2tsLAwxcfHa9GiRerRo4cvDxNAB0FYAdCkL774QiUlJbrjjjsalq1du1ZfffWV/vCHP/iwMgCdCZeBADSppKREv/zlL/Xxxx9LknJzc/XnP/9ZP/nJT3xcGYDOhDMrAJqVmZmp9PR0ffXVVwoPD9eMGTM0e/ZsX5cFoBMhrAAAAKNxGQgAABiNsAIAAIxGWAEAAEYjrAAAAKMRVgAAgNEIKwAAwGiEFQAAYDTCCgAAMBphBQAAGO3/Ays5IWoIZPcLAAAAAElFTkSuQmCC"
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"sns.displot(x=\"logS\", hue=\"is_sol\", data=sol_df);"
]
},
{
"cell_type": "markdown",
"id": "efdd3793",
"metadata": {},
"source": [
"Add a fingerprint to the dataframe"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "486c972f",
"metadata": {
"ExecuteTime": {
"end_time": "2024-04-07T17:20:05.296505Z",
"start_time": "2024-04-07T17:20:05.062167Z"
}
},
"outputs": [],
"source": [
"sol_df['fp'] = sol_df.SMILES.apply(uru.smi2numpy_fp)"
]
},
{
"cell_type": "markdown",
"id": "14c0b08b",
"metadata": {},
"source": [
"Split into training and test"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "546b4800",
"metadata": {
"ExecuteTime": {
"end_time": "2024-04-07T17:20:13.047936Z",
"start_time": "2024-04-07T17:20:13.040781Z"
}
},
"outputs": [],
"source": [
"# Fix the seed so the split (and every metric computed from it) is the same on each run\n",
"train, test = train_test_split(sol_df, random_state=42)"
]
},
{
"cell_type": "markdown",
"id": "91387aab",
"metadata": {},
"source": [
"Train the model"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "c7e7e75b",
"metadata": {
"ExecuteTime": {
"end_time": "2024-04-07T17:20:15.237473Z",
"start_time": "2024-04-07T17:20:15.235655Z"
}
},
"outputs": [],
"source": [
"cls = LGBMClassifier()"
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "ccbbdef8",
"metadata": {
"ExecuteTime": {
"end_time": "2024-04-07T17:20:19.590310Z",
"start_time": "2024-04-07T17:20:16.670166Z"
}
},
"outputs": [
{
"data": {
"text/plain": "LGBMClassifier()",
"text/html": "<style>#sk-container-id-1 {color: black;}#sk-container-id-1 pre{padding: 0;}#sk-container-id-1 div.sk-toggleable {background-color: white;}#sk-container-id-1 label.sk-toggleable__label {cursor: pointer;display: block;width: 100%;margin-bottom: 0;padding: 0.3em;box-sizing: border-box;text-align: center;}#sk-container-id-1 label.sk-toggleable__label-arrow:before {content: \"▸\";float: left;margin-right: 0.25em;color: #696969;}#sk-container-id-1 label.sk-toggleable__label-arrow:hover:before {color: black;}#sk-container-id-1 div.sk-estimator:hover label.sk-toggleable__label-arrow:before {color: black;}#sk-container-id-1 div.sk-toggleable__content {max-height: 0;max-width: 0;overflow: hidden;text-align: left;background-color: #f0f8ff;}#sk-container-id-1 div.sk-toggleable__content pre {margin: 0.2em;color: black;border-radius: 0.25em;background-color: #f0f8ff;}#sk-container-id-1 input.sk-toggleable__control:checked~div.sk-toggleable__content {max-height: 200px;max-width: 100%;overflow: auto;}#sk-container-id-1 input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {content: \"▾\";}#sk-container-id-1 div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-1 div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-1 input.sk-hidden--visually {border: 0;clip: rect(1px 1px 1px 1px);clip: rect(1px, 1px, 1px, 1px);height: 1px;margin: -1px;overflow: hidden;padding: 0;position: absolute;width: 1px;}#sk-container-id-1 div.sk-estimator {font-family: monospace;background-color: #f0f8ff;border: 1px dotted black;border-radius: 0.25em;box-sizing: border-box;margin-bottom: 0.5em;}#sk-container-id-1 div.sk-estimator:hover {background-color: #d4ebff;}#sk-container-id-1 div.sk-parallel-item::after {content: \"\";width: 100%;border-bottom: 1px solid gray;flex-grow: 1;}#sk-container-id-1 div.sk-label:hover 
label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-1 div.sk-serial::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: 0;}#sk-container-id-1 div.sk-serial {display: flex;flex-direction: column;align-items: center;background-color: white;padding-right: 0.2em;padding-left: 0.2em;position: relative;}#sk-container-id-1 div.sk-item {position: relative;z-index: 1;}#sk-container-id-1 div.sk-parallel {display: flex;align-items: stretch;justify-content: center;background-color: white;position: relative;}#sk-container-id-1 div.sk-item::before, #sk-container-id-1 div.sk-parallel-item::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: -1;}#sk-container-id-1 div.sk-parallel-item {display: flex;flex-direction: column;z-index: 1;position: relative;background-color: white;}#sk-container-id-1 div.sk-parallel-item:first-child::after {align-self: flex-end;width: 50%;}#sk-container-id-1 div.sk-parallel-item:last-child::after {align-self: flex-start;width: 50%;}#sk-container-id-1 div.sk-parallel-item:only-child::after {width: 0;}#sk-container-id-1 div.sk-dashed-wrapped {border: 1px dashed gray;margin: 0 0.4em 0.5em 0.4em;box-sizing: border-box;padding-bottom: 0.4em;background-color: white;}#sk-container-id-1 div.sk-label label {font-family: monospace;font-weight: bold;display: inline-block;line-height: 1.2em;}#sk-container-id-1 div.sk-label-container {text-align: center;}#sk-container-id-1 div.sk-container {/* jupyter's `normalize.less` sets `[hidden] { display: none; }` but bootstrap.min.css set `[hidden] { display: none !important; }` so we also need the `!important` here to be able to override the default hidden behavior on the sphinx rendered scikit-learn.org. 
See: https://github.com/scikit-learn/scikit-learn/issues/21755 */display: inline-block !important;position: relative;}#sk-container-id-1 div.sk-text-repr-fallback {display: none;}</style><div id=\"sk-container-id-1\" class=\"sk-top-container\"><div class=\"sk-text-repr-fallback\"><pre>LGBMClassifier()</pre><b>In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. <br />On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.</b></div><div class=\"sk-container\" hidden><div class=\"sk-item\"><div class=\"sk-estimator sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-1\" type=\"checkbox\" checked><label for=\"sk-estimator-id-1\" class=\"sk-toggleable__label sk-toggleable__label-arrow\">LGBMClassifier</label><div class=\"sk-toggleable__content\"><pre>LGBMClassifier()</pre></div></div></div></div></div>"
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"cls.fit(np.stack(train.fp), train.is_sol)"
]
},
{
"cell_type": "markdown",
"id": "79db6d38",
"metadata": {},
"source": [
"Predict the test set"
]
},
{
"cell_type": "code",
"execution_count": 24,
"id": "dbe15e18",
"metadata": {
"ExecuteTime": {
"end_time": "2024-04-07T17:20:22.583512Z",
"start_time": "2024-04-07T17:20:21.791372Z"
}
},
"outputs": [],
"source": [
"pred_vals = cls.predict_proba(np.stack(test.fp))[:, 1]"
]
},
{
"cell_type": "markdown",
"id": "fd66b359",
"metadata": {},
"source": [
"Calculate the AUC"
]
},
{
"cell_type": "code",
"execution_count": 25,
"id": "14e0bf4e",
"metadata": {
"ExecuteTime": {
"end_time": "2024-04-07T17:20:24.726138Z",
"start_time": "2024-04-07T17:20:24.719026Z"
}
},
"outputs": [
{
"data": {
"text/plain": "0.7719736298478508"
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"roc_auc_score(test.is_sol, pred_vals)"
]
},
{
"cell_type": "markdown",
"id": "30fd3af6",
"metadata": {},
"source": [
"Define a function to bootstrap a 95% confidence interval for the AUC"
]
},
{
"cell_type": "code",
"execution_count": 26,
"id": "3a5c4c57",
"metadata": {
"ExecuteTime": {
"end_time": "2024-04-07T17:20:29.358142Z",
"start_time": "2024-04-07T17:20:29.352910Z"
}
},
"outputs": [],
"source": [
"def bootstrap_AUC(truth, pred, num_iterations=1000, random_state=None):\n",
"    \"\"\" Calculate the 95% confidence interval (CI) for an AUC\n",
"    :param truth: the true values\n",
"    :param pred: the predicted values\n",
"    :param num_iterations: number of bootstrap iterations\n",
"    :param random_state: optional int seed or numpy RandomState for reproducible resampling;\n",
"        None keeps the previous behaviour (numpy's global random state)\n",
"    :return: 95% CI lower bound, AUC, 95% CI upper bound\n",
"    :raises ValueError: if num_iterations < 1 or truth contains fewer than two classes\n",
"    \"\"\"\n",
"    if num_iterations < 1:\n",
"        raise ValueError(\"num_iterations must be at least 1\")\n",
"    # Work on plain arrays so truth and pred are paired by position; a DataFrame built from\n",
"    # two Series with different indexes would align on labels and introduce NaNs.\n",
"    truth_arr = np.asarray(truth)\n",
"    pred_arr = np.asarray(pred)\n",
"    if len(np.unique(truth_arr)) < 2:\n",
"        raise ValueError(\"AUC is undefined when truth contains a single class\")\n",
"    auc_val = roc_auc_score(truth_arr, pred_arr)\n",
"    # Turn a seed into one RandomState that advances across iterations; handing the bare\n",
"    # seed to resample each time would reseed and draw the identical sample every loop.\n",
"    rng = random_state\n",
"    if rng is not None and not isinstance(rng, np.random.RandomState):\n",
"        rng = np.random.RandomState(rng)\n",
"    auc_list = []\n",
"    while len(auc_list) < num_iterations:\n",
"        sample_truth, sample_pred = resample(truth_arr, pred_arr, random_state=rng)\n",
"        # A bootstrap sample can miss one class entirely, leaving the AUC undefined; redraw\n",
"        if len(np.unique(sample_truth)) < 2:\n",
"            continue\n",
"        auc_list.append(roc_auc_score(sample_truth, sample_pred))\n",
"    lower, upper = np.percentile(auc_list, [2.5, 97.5])\n",
"    return lower, auc_val, upper"
]
},
{
"cell_type": "code",
"execution_count": 27,
"id": "43613027",
"metadata": {
"ExecuteTime": {
"end_time": "2024-04-07T17:20:32.390619Z",
"start_time": "2024-04-07T17:20:31.743195Z"
}
},
"outputs": [
{
"data": {
"text/plain": "(0.731250560565505, 0.7719736298478508, 0.8124190589861915)"
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"bootstrap_AUC(test.is_sol, pred_vals)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "93cd6b11",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.3"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment