Created
May 27, 2020 13:19
-
-
Save iwatobipen/1c8d4c2480a4e9afc42105b17ffe1495 to your computer and use it in GitHub Desktop.
aggregate conformal prediction example
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"%matplotlib inline\n", | |
"import os\n", | |
"import pandas as pd\n", | |
"import numpy as np\n", | |
"from rdkit import Chem\n", | |
"from rdkit.Chem import RDConfig\n", | |
"from rdkit.Chem import DataStructs\n", | |
"from rdkit.Chem import AllChem\n", | |
"from rdkit.Chem.Draw import IPythonConsole\n", | |
"from rdkit.Chem import Draw\n", | |
"from sklearn.ensemble import RandomForestClassifier\n", | |
"from nonconformist.nc import ClassifierNc\n", | |
"from nonconformist.nc import ClassifierAdapter\n", | |
"from nonconformist.icp import IcpClassifier\n", | |
"from nonconformist.acp import AggregatedCp\n", | |
"from nonconformist.acp import BootstrapConformalClassifier\n", | |
"from nonconformist.acp import BootstrapSampler\n", | |
"from nonconformist.evaluation import ClassIcpCvHelper\n", | |
"from nonconformist.evaluation import cross_val_score" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"train = os.path.join(RDConfig.RDDocsDir, 'Book/data/solubility.train.sdf')\n", | |
"test = os.path.join(RDConfig.RDDocsDir, 'Book/data/solubility.test.sdf')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"trainmol = [m for m in Chem.SDMolSupplier(train)]\n", | |
"testmol = [m for m in Chem.SDMolSupplier(test)]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": { | |
"scrolled": true | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"{'(B) medium', '(A) low', '(C) high'}\n" | |
] | |
} | |
], | |
"source": [ | |
"labels = set([m.GetProp('SOL_classification') for m in trainmol])\n", | |
"print(labels)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"label2cls = {'(A) low':0, '(B) medium':1, '(C) high':2}" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def fp2arr(fp):\n", | |
" arr = np.zeros((1,))\n", | |
" DataStructs.ConvertToNumpyArray(fp, arr)\n", | |
" return arr" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"trainfps = [AllChem.GetMorganFingerprintAsBitVect(m, 2, 1024) for m in trainmol]\n", | |
"trainfps = np.array([fp2arr(fp) for fp in trainfps])\n", | |
"\n", | |
"testfps = [AllChem.GetMorganFingerprintAsBitVect(m, 2, 1024) for m in testmol]\n", | |
"testfps = np.array([fp2arr(fp) for fp in testfps])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"(1025, 1024) (1025,) (257, 1024) (257,)\n" | |
] | |
} | |
], | |
"source": [ | |
"train_cls = [label2cls[m.GetProp('SOL_classification')] for m in trainmol]\n", | |
"train_cls = np.array(train_cls)\n", | |
"test_cls = [label2cls[m.GetProp('SOL_classification')] for m in testmol]\n", | |
"test_cls = np.array(test_cls)\n", | |
"print(trainfps.shape, train_cls.shape, testfps.shape, test_cls.shape)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"'''\n", | |
"# train data is divided into train and calibration data\n", | |
"ids = np.random.permutation(train_cls.size)\n", | |
"# Use the first 700 samples for training; the remaining samples are used for calibration\n", | |
"trainX, calibX = trainfps[ids[:700],:],trainfps[ids[700:],:] \n", | |
"trainY, calibY = train_cls[ids[:700]],train_cls[ids[700:]] \n", | |
"'''\n", | |
"trainX = trainfps\n", | |
"trainY = train_cls" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"testX = testfps\n", | |
"testY = test_cls" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"rf = RandomForestClassifier(n_estimators=500, random_state=794)\n", | |
"nc = ClassifierNc(ClassifierAdapter(rf))\n", | |
"icp = IcpClassifier(nc)\n", | |
"bcp = BootstrapConformalClassifier(icp)\n", | |
"\n", | |
"rf2 = RandomForestClassifier(n_estimators=500, random_state=794)\n", | |
"nc2 = ClassifierNc(ClassifierAdapter(rf2))\n", | |
"icp2 = IcpClassifier(nc2)\n", | |
"acp = AggregatedCp(icp2)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"bcp.fit(trainX, trainY)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"acp.fit(trainX, trainY)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 14, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"\n", | |
"bcp_pred95 = bcp.predict(testX, significance=0.05).astype(np.int32)\n", | |
"bcp_pred80 = bcp.predict(testX, significance=0.2).astype(np.int32)\n", | |
"\n", | |
"acp_pred95 = acp.predict(testX, significance=0.05).astype(np.int32)\n", | |
"acp_pred80 = acp.predict(testX, significance=0.2).astype(np.int32)\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 15, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"from nonconformist.evaluation import class_avg_c, class_n_correct, class_mean_errors" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"scrolled": false | |
}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 16, | |
"metadata": { | |
"scrolled": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"252\n", | |
"219\n" | |
] | |
} | |
], | |
"source": [ | |
"print(class_n_correct(acp_pred95, testY, significance=0.05))\n", | |
"print(class_n_correct(acp_pred80, testY, significance=0.2))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 17, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"252\n", | |
"220\n" | |
] | |
} | |
], | |
"source": [ | |
"print(class_n_correct(bcp_pred95, testY, significance=0.05))\n", | |
"print(class_n_correct(bcp_pred80, testY, significance=0.2))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.7.7" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 4 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment