Created
September 15, 2021 13:27
-
-
Save iwatobipen/4b3781417ed570036fef3a1927c2b827 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"id": "enormous-default", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"%matplotlib inline\n", | |
"import matplotlib.pyplot as plt\n", | |
"import os\n", | |
"import numpy as np\n", | |
"from rdkit import Chem\n", | |
"from rdkit.Chem import DataStructs\n", | |
"from rdkit.Chem import rdFingerprintGenerator\n", | |
"from rdkit.Chem import rdBase\n", | |
"from rdkit import RDConfig\n", | |
"from PRF import prf\n", | |
"from sklearn.ensemble import RandomForestClassifier\n", | |
"from sklearn.model_selection import train_test_split\n", | |
"from sklearn import metrics\n", | |
"from scipy.stats import norm\n", | |
"\n", | |
"# Molecules with SOL > -1 are treated as the high-solubility class, so the class threshold is -1.\n", | |
"def calc_delta_y(value, scale=0.00001):\n", | |
"    \"\"\"Return [p, 1 - p], where p is the zero-mean normal CDF evaluated at `value`.\n", | |
"\n", | |
"    value: number passed to the CDF (the cells below pass SOL - (-1)).\n", | |
"    scale: standard deviation of the normal distribution.\n", | |
"    \"\"\"\n", | |
"    p = norm.cdf(value, loc=0, scale=scale)\n", | |
"    return [p, 1 - p]\n", | |
"\n", | |
"def mol2fp(mol, fpgen):\n", | |
"    \"\"\"Return the fingerprint that `fpgen` generates for `mol` as a NumPy array.\"\"\"\n", | |
"    fingerprint = fpgen.GetFingerprint(mol)\n", | |
"    # Placeholder array that ConvertToNumpyArray fills in place\n", | |
"    # (presumably resized to the fingerprint length - confirm against the RDKit docs).\n", | |
"    fp_array = np.zeros((1,))\n", | |
"    DataStructs.ConvertToNumpyArray(fingerprint, fp_array)\n", | |
"    return fp_array" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"id": "hundred-smile", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Paths to the solubility train/test SD files under RDConfig.RDDocsDir.\n", | |
"train = os.path.join(RDConfig.RDDocsDir,'Book/data/solubility.train.sdf')\n", | |
"test = os.path.join(RDConfig.RDDocsDir,'Book/data/solubility.test.sdf')\n", | |
"\n", | |
"# SDMolSupplier yields None for records it cannot parse; skip those so the\n", | |
"# later m.GetProp('SOL') and fingerprint calls never receive None.\n", | |
"data_train = [m for m in Chem.SDMolSupplier(train) if m is not None]\n", | |
"data_test = [m for m in Chem.SDMolSupplier(test) if m is not None]\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"id": "rocky-bruce", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"1e-05\n", | |
"0.8599221789883269\n", | |
"0.8910505836575876\n" | |
] | |
} | |
], | |
"source": [ | |
"# Standard deviation (scale) passed to calc_delta_y.\n", | |
"scale = 0.00001\n", | |
"\n", | |
"# Featurize both splits with one Morgan fingerprint generator.\n", | |
"fpgen = rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=1024)\n", | |
"X_train = np.asarray([mol2fp(mol, fpgen) for mol in data_train])\n", | |
"X_test = np.asarray([mol2fp(mol, fpgen) for mol in data_test])\n", | |
"\n", | |
"# Training targets: shift SOL so the -1 cutoff sits at zero, turn each shifted\n", | |
"# value into a [p, 1 - p] pair, then take the argmax of the pair as the hard label.\n", | |
"sol_train = [float(mol.GetProp('SOL')) for mol in data_train]\n", | |
"delta_sol_train = [sol + 1 for sol in sol_train]\n", | |
"delta_y_train = np.asarray(\n", | |
"    [calc_delta_y(delta, scale=scale) for delta in delta_sol_train]\n", | |
")\n", | |
"y_train = [np.argmax(pair) for pair in delta_y_train]\n", | |
"\n", | |
"# Test targets are built the same way.\n", | |
"sol_test = [float(mol.GetProp('SOL')) for mol in data_test]\n", | |
"delta_sol_test = [sol + 1 for sol in sol_test]\n", | |
"delta_y_test = np.asarray(\n", | |
"    [calc_delta_y(delta, scale=scale) for delta in delta_sol_test]\n", | |
")\n", | |
"y_test = [np.argmax(pair) for pair in delta_y_test]\n", | |
"\n", | |
"# Fit PRF on the [p, 1 - p] pairs and the scikit-learn forest on the hard labels.\n", | |
"prf_cls = prf()\n", | |
"prf_cls.fit(X_train, py=delta_y_train)\n", | |
"rf_cls = RandomForestClassifier(random_state=1)\n", | |
"rf_cls.fit(X_train, y_train)\n", | |
"\n", | |
"# Report the scale, then each model's score against the hard test labels.\n", | |
"prf_score = prf_cls.score(X_test, y_test)\n", | |
"rf_score = rf_cls.score(X_test, y_test)\n", | |
"print(scale)\n", | |
"print(prf_score)\n", | |
"print(rf_score)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"id": "acquired-arena", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"0.3\n", | |
"0.8793774319066148\n", | |
"0.8910505836575876\n" | |
] | |
} | |
], | |
"source": [ | |
"# Standard deviation (scale) passed to calc_delta_y.\n", | |
"scale = 0.3\n", | |
"\n", | |
"# Featurize both splits with one Morgan fingerprint generator.\n", | |
"fpgen = rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=1024)\n", | |
"X_train = np.asarray([mol2fp(mol, fpgen) for mol in data_train])\n", | |
"X_test = np.asarray([mol2fp(mol, fpgen) for mol in data_test])\n", | |
"\n", | |
"# Training targets: shift SOL so the -1 cutoff sits at zero, turn each shifted\n", | |
"# value into a [p, 1 - p] pair, then take the argmax of the pair as the hard label.\n", | |
"sol_train = [float(mol.GetProp('SOL')) for mol in data_train]\n", | |
"delta_sol_train = [sol + 1 for sol in sol_train]\n", | |
"delta_y_train = np.asarray(\n", | |
"    [calc_delta_y(delta, scale=scale) for delta in delta_sol_train]\n", | |
")\n", | |
"y_train = [np.argmax(pair) for pair in delta_y_train]\n", | |
"\n", | |
"# Test targets are built the same way.\n", | |
"sol_test = [float(mol.GetProp('SOL')) for mol in data_test]\n", | |
"delta_sol_test = [sol + 1 for sol in sol_test]\n", | |
"delta_y_test = np.asarray(\n", | |
"    [calc_delta_y(delta, scale=scale) for delta in delta_sol_test]\n", | |
")\n", | |
"y_test = [np.argmax(pair) for pair in delta_y_test]\n", | |
"\n", | |
"# Fit PRF on the [p, 1 - p] pairs and the scikit-learn forest on the hard labels.\n", | |
"prf_cls = prf()\n", | |
"prf_cls.fit(X_train, py=delta_y_train)\n", | |
"rf_cls = RandomForestClassifier(random_state=1)\n", | |
"rf_cls.fit(X_train, y_train)\n", | |
"\n", | |
"# Report the scale, then each model's score against the hard test labels.\n", | |
"prf_score = prf_cls.score(X_test, y_test)\n", | |
"rf_score = rf_cls.score(X_test, y_test)\n", | |
"print(scale)\n", | |
"print(prf_score)\n", | |
"print(rf_score)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"id": "documented-buffalo", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"0.6\n", | |
"0.8988326848249028\n", | |
"0.8910505836575876\n" | |
] | |
} | |
], | |
"source": [ | |
"# Standard deviation (scale) passed to calc_delta_y.\n", | |
"scale = 0.6\n", | |
"\n", | |
"# Featurize both splits with one Morgan fingerprint generator.\n", | |
"fpgen = rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=1024)\n", | |
"X_train = np.asarray([mol2fp(mol, fpgen) for mol in data_train])\n", | |
"X_test = np.asarray([mol2fp(mol, fpgen) for mol in data_test])\n", | |
"\n", | |
"# Training targets: shift SOL so the -1 cutoff sits at zero, turn each shifted\n", | |
"# value into a [p, 1 - p] pair, then take the argmax of the pair as the hard label.\n", | |
"sol_train = [float(mol.GetProp('SOL')) for mol in data_train]\n", | |
"delta_sol_train = [sol + 1 for sol in sol_train]\n", | |
"delta_y_train = np.asarray(\n", | |
"    [calc_delta_y(delta, scale=scale) for delta in delta_sol_train]\n", | |
")\n", | |
"y_train = [np.argmax(pair) for pair in delta_y_train]\n", | |
"\n", | |
"# Test targets are built the same way.\n", | |
"sol_test = [float(mol.GetProp('SOL')) for mol in data_test]\n", | |
"delta_sol_test = [sol + 1 for sol in sol_test]\n", | |
"delta_y_test = np.asarray(\n", | |
"    [calc_delta_y(delta, scale=scale) for delta in delta_sol_test]\n", | |
")\n", | |
"y_test = [np.argmax(pair) for pair in delta_y_test]\n", | |
"\n", | |
"# Fit PRF on the [p, 1 - p] pairs and the scikit-learn forest on the hard labels.\n", | |
"prf_cls = prf()\n", | |
"prf_cls.fit(X_train, py=delta_y_train)\n", | |
"rf_cls = RandomForestClassifier(random_state=1)\n", | |
"rf_cls.fit(X_train, y_train)\n", | |
"\n", | |
"# Report the scale, then each model's score against the hard test labels.\n", | |
"prf_score = prf_cls.score(X_test, y_test)\n", | |
"rf_score = rf_cls.score(X_test, y_test)\n", | |
"print(scale)\n", | |
"print(prf_score)\n", | |
"print(rf_score)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"id": "static-freedom", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"0.8\n", | |
"0.8910505836575876\n", | |
"0.8910505836575876\n" | |
] | |
} | |
], | |
"source": [ | |
"# Standard deviation (scale) passed to calc_delta_y.\n", | |
"scale = 0.8\n", | |
"\n", | |
"# Featurize both splits with one Morgan fingerprint generator.\n", | |
"fpgen = rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=1024)\n", | |
"X_train = np.asarray([mol2fp(mol, fpgen) for mol in data_train])\n", | |
"X_test = np.asarray([mol2fp(mol, fpgen) for mol in data_test])\n", | |
"\n", | |
"# Training targets: shift SOL so the -1 cutoff sits at zero, turn each shifted\n", | |
"# value into a [p, 1 - p] pair, then take the argmax of the pair as the hard label.\n", | |
"sol_train = [float(mol.GetProp('SOL')) for mol in data_train]\n", | |
"delta_sol_train = [sol + 1 for sol in sol_train]\n", | |
"delta_y_train = np.asarray(\n", | |
"    [calc_delta_y(delta, scale=scale) for delta in delta_sol_train]\n", | |
")\n", | |
"y_train = [np.argmax(pair) for pair in delta_y_train]\n", | |
"\n", | |
"# Test targets are built the same way.\n", | |
"sol_test = [float(mol.GetProp('SOL')) for mol in data_test]\n", | |
"delta_sol_test = [sol + 1 for sol in sol_test]\n", | |
"delta_y_test = np.asarray(\n", | |
"    [calc_delta_y(delta, scale=scale) for delta in delta_sol_test]\n", | |
")\n", | |
"y_test = [np.argmax(pair) for pair in delta_y_test]\n", | |
"\n", | |
"# Fit PRF on the [p, 1 - p] pairs and the scikit-learn forest on the hard labels.\n", | |
"prf_cls = prf()\n", | |
"prf_cls.fit(X_train, py=delta_y_train)\n", | |
"rf_cls = RandomForestClassifier(random_state=1)\n", | |
"rf_cls.fit(X_train, y_train)\n", | |
"\n", | |
"# Report the scale, then each model's score against the hard test labels.\n", | |
"prf_score = prf_cls.score(X_test, y_test)\n", | |
"rf_score = rf_cls.score(X_test, y_test)\n", | |
"print(scale)\n", | |
"print(prf_score)\n", | |
"print(rf_score)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"id": "simple-friendship", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"1.0\n", | |
"0.9027237354085603\n", | |
"0.8910505836575876\n" | |
] | |
} | |
], | |
"source": [ | |
"# Standard deviation (scale) passed to calc_delta_y.\n", | |
"scale = 1.0\n", | |
"\n", | |
"# Featurize both splits with one Morgan fingerprint generator.\n", | |
"fpgen = rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=1024)\n", | |
"X_train = np.asarray([mol2fp(mol, fpgen) for mol in data_train])\n", | |
"X_test = np.asarray([mol2fp(mol, fpgen) for mol in data_test])\n", | |
"\n", | |
"# Training targets: shift SOL so the -1 cutoff sits at zero, turn each shifted\n", | |
"# value into a [p, 1 - p] pair, then take the argmax of the pair as the hard label.\n", | |
"sol_train = [float(mol.GetProp('SOL')) for mol in data_train]\n", | |
"delta_sol_train = [sol + 1 for sol in sol_train]\n", | |
"delta_y_train = np.asarray(\n", | |
"    [calc_delta_y(delta, scale=scale) for delta in delta_sol_train]\n", | |
")\n", | |
"y_train = [np.argmax(pair) for pair in delta_y_train]\n", | |
"\n", | |
"# Test targets are built the same way.\n", | |
"sol_test = [float(mol.GetProp('SOL')) for mol in data_test]\n", | |
"delta_sol_test = [sol + 1 for sol in sol_test]\n", | |
"delta_y_test = np.asarray(\n", | |
"    [calc_delta_y(delta, scale=scale) for delta in delta_sol_test]\n", | |
")\n", | |
"y_test = [np.argmax(pair) for pair in delta_y_test]\n", | |
"\n", | |
"# Fit PRF on the [p, 1 - p] pairs and the scikit-learn forest on the hard labels.\n", | |
"prf_cls = prf()\n", | |
"prf_cls.fit(X_train, py=delta_y_train)\n", | |
"rf_cls = RandomForestClassifier(random_state=1)\n", | |
"rf_cls.fit(X_train, y_train)\n", | |
"\n", | |
"# Report the scale, then each model's score against the hard test labels.\n", | |
"prf_score = prf_cls.score(X_test, y_test)\n", | |
"rf_score = rf_cls.score(X_test, y_test)\n", | |
"print(scale)\n", | |
"print(prf_score)\n", | |
"print(rf_score)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "ideal-algorithm", | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.7.9" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 5 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment