Skip to content

Instantly share code, notes, and snippets.

@iwatobipen
Created September 15, 2021 13:27
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save iwatobipen/4b3781417ed570036fef3a1927c2b827 to your computer and use it in GitHub Desktop.
Save iwatobipen/4b3781417ed570036fef3a1927c2b827 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "enormous-default",
"metadata": {},
"outputs": [],
"source": [
"%matplotlib inline\n",
"import matplotlib.pyplot as plt\n",
"import os\n",
"import numpy as np\n",
"from rdkit import Chem\n",
"from rdkit.Chem import DataStructs\n",
"from rdkit.Chem import rdFingerprintGenerator\n",
"from rdkit.Chem import rdBase\n",
"from rdkit import RDConfig\n",
"from PRF import prf\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn import metrics\n",
"from scipy.stats import norm\n",
"\n",
"# Solubility is classed as high when SOL > -1, so the class threshold is -1.\n",
"def calc_delta_y(value, scale=0.00001):\n",
"    \"\"\"Convert a signed distance from the class threshold into soft labels.\n",
"\n",
"    The distance is evaluated with the normal CDF,\n",
"    p = norm.cdf(value, scale=scale), and the two-element list [p, 1 - p]\n",
"    is returned. A tiny scale makes the labels effectively hard (0 or 1);\n",
"    a larger scale softens them around value == 0.\n",
"    \"\"\"\n",
"    p_above = norm.cdf(value, scale=scale)\n",
"    p_below = 1 - p_above\n",
"    return [p_above, p_below]\n",
"\n",
"def mol2fp(mol, fpgen):\n",
"    \"\"\"Compute the fingerprint of mol with fpgen and return it as a NumPy array.\n",
"\n",
"    fpgen must provide GetFingerprint(mol); the resulting fingerprint is\n",
"    copied into a float array through DataStructs.ConvertToNumpyArray.\n",
"    \"\"\"\n",
"    bit_vect = fpgen.GetFingerprint(mol)\n",
"    # Placeholder buffer; ConvertToNumpyArray presumably resizes it to the\n",
"    # fingerprint length (TODO confirm against the RDKit documentation).\n",
"    fp_array = np.zeros((1,))\n",
"    DataStructs.ConvertToNumpyArray(bit_vect, fp_array)\n",
"    return fp_array"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "hundred-smile",
"metadata": {},
"outputs": [],
"source": [
"# Solubility train/test SDF files located under RDConfig.RDDocsDir.\n",
"train = os.path.join(RDConfig.RDDocsDir,'Book/data/solubility.train.sdf')\n",
"test = os.path.join(RDConfig.RDDocsDir,'Book/data/solubility.test.sdf')\n",
"\n",
"# SDMolSupplier yields None for records it cannot parse. Drop those entries so\n",
"# later cells can call m.GetProp('SOL') on every molecule without crashing.\n",
"data_train = [m for m in Chem.SDMolSupplier(train) if m is not None]\n",
"data_test = [m for m in Chem.SDMolSupplier(test) if m is not None]\n"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "rocky-bruce",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1e-05\n",
"0.8599221789883269\n",
"0.8910505836575876\n"
]
}
],
"source": [
"# Class threshold: a molecule counts as highly soluble when SOL > SOL_THRESHOLD.\n",
"SOL_THRESHOLD = -1\n",
"\n",
"\n",
"def make_labels(delta_sol, scale):\n",
"    \"\"\"Build soft and hard class labels from distances to the class threshold.\n",
"\n",
"    Args:\n",
"        delta_sol: iterable of SOL - SOL_THRESHOLD values, one per molecule.\n",
"        scale: scale argument forwarded to calc_delta_y; larger values give\n",
"            softer class probabilities close to the threshold.\n",
"\n",
"    Returns:\n",
"        (soft, hard): soft is the array of calc_delta_y outputs (one row of\n",
"        two class probabilities per molecule); hard is the list of arg-max\n",
"        class indices of those rows.\n",
"    \"\"\"\n",
"    soft = np.asarray([calc_delta_y(v, scale=scale) for v in delta_sol])\n",
"    hard = [np.argmax(row) for row in soft]\n",
"    return soft, hard\n",
"\n",
"\n",
"def fit_and_score(X_train, py_train, y_train, X_test, y_test):\n",
"    \"\"\"Fit both classifiers, print their test scores and return the models.\n",
"\n",
"    The PRF model is trained on the class probabilities py_train and the\n",
"    scikit-learn random forest on the hard labels y_train. Each model is then\n",
"    scored on (X_test, y_test); the PRF score is printed first.\n",
"\n",
"    Returns:\n",
"        (prf_cls, rf_cls): the two fitted classifiers.\n",
"    \"\"\"\n",
"    prf_cls = prf()\n",
"    rf_cls = RandomForestClassifier(random_state=1)\n",
"    prf_cls.fit(X_train, py=py_train)\n",
"    rf_cls.fit(X_train, y_train)\n",
"    print(prf_cls.score(X_test, y_test))\n",
"    print(rf_cls.score(X_test, y_test))\n",
"    return prf_cls, rf_cls\n",
"\n",
"\n",
"# Features and raw targets do not depend on scale, so compute them only once\n",
"# here and reuse them in the cells below.\n",
"fpgen = rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=1024)\n",
"X_train = np.asarray([mol2fp(m, fpgen) for m in data_train])\n",
"X_test = np.asarray([mol2fp(m, fpgen) for m in data_test])\n",
"sol_train = [float(m.GetProp('SOL')) for m in data_train]\n",
"sol_test = [float(m.GetProp('SOL')) for m in data_test]\n",
"delta_sol_train = [s - SOL_THRESHOLD for s in sol_train]\n",
"delta_sol_test = [s - SOL_THRESHOLD for s in sol_test]\n",
"\n",
"# Near-zero scale: the soft labels are effectively hard 0/1 labels.\n",
"scale = 0.00001\n",
"delta_y_train, y_train = make_labels(delta_sol_train, scale)\n",
"delta_y_test, y_test = make_labels(delta_sol_test, scale)\n",
"print(scale)\n",
"prf_cls, rf_cls = fit_and_score(X_train, delta_y_train, y_train, X_test, y_test)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "acquired-arena",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.3\n",
"0.8793774319066148\n",
"0.8910505836575876\n"
]
}
],
"source": [
"# Same comparison with scale = 0.3 (features come from the first comparison cell).\n",
"scale = 0.3\n",
"delta_y_train, y_train = make_labels(delta_sol_train, scale)\n",
"delta_y_test, y_test = make_labels(delta_sol_test, scale)\n",
"print(scale)\n",
"prf_cls, rf_cls = fit_and_score(X_train, delta_y_train, y_train, X_test, y_test)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "documented-buffalo",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.6\n",
"0.8988326848249028\n",
"0.8910505836575876\n"
]
}
],
"source": [
"# Same comparison with scale = 0.6.\n",
"scale = 0.6\n",
"delta_y_train, y_train = make_labels(delta_sol_train, scale)\n",
"delta_y_test, y_test = make_labels(delta_sol_test, scale)\n",
"print(scale)\n",
"prf_cls, rf_cls = fit_and_score(X_train, delta_y_train, y_train, X_test, y_test)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "static-freedom",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.8\n",
"0.8910505836575876\n",
"0.8910505836575876\n"
]
}
],
"source": [
"# Same comparison with scale = 0.8.\n",
"scale = 0.8\n",
"delta_y_train, y_train = make_labels(delta_sol_train, scale)\n",
"delta_y_test, y_test = make_labels(delta_sol_test, scale)\n",
"print(scale)\n",
"prf_cls, rf_cls = fit_and_score(X_train, delta_y_train, y_train, X_test, y_test)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "simple-friendship",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1.0\n",
"0.9027237354085603\n",
"0.8910505836575876\n"
]
}
],
"source": [
"# Same comparison with scale = 1.0.\n",
"scale = 1.0\n",
"delta_y_train, y_train = make_labels(delta_sol_train, scale)\n",
"delta_y_test, y_test = make_labels(delta_sol_test, scale)\n",
"print(scale)\n",
"prf_cls, rf_cls = fit_and_score(X_train, delta_y_train, y_train, X_test, y_test)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ideal-algorithm",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.9"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment