Skip to content

Instantly share code, notes, and snippets.

@iwatobipen
Created May 27, 2020 13:19
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save iwatobipen/1c8d4c2480a4e9afc42105b17ffe1495 to your computer and use it in GitHub Desktop.
Save iwatobipen/1c8d4c2480a4e9afc42105b17ffe1495 to your computer and use it in GitHub Desktop.
aggregate conformal prediction example
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"%matplotlib inline\n",
"import os\n",
"import pandas as pd\n",
"import numpy as np\n",
"from rdkit import Chem\n",
"from rdkit.Chem import RDConfig\n",
"from rdkit.Chem import DataStructs\n",
"from rdkit.Chem import AllChem\n",
"from rdkit.Chem.Draw import IPythonConsole\n",
"from rdkit.Chem import Draw\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"from nonconformist.nc import ClassifierNc\n",
"from nonconformist.nc import ClassifierAdapter\n",
"from nonconformist.icp import IcpClassifier\n",
"from nonconformist.acp import AggregatedCp\n",
"from nonconformist.acp import BootstrapConformalClassifier\n",
"from nonconformist.acp import BootstrapSampler\n",
"from nonconformist.evaluation import ClassIcpCvHelper\n",
"from nonconformist.evaluation import cross_val_score"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"train = os.path.join(RDConfig.RDDocsDir, 'Book/data/solubility.train.sdf')\n",
"test = os.path.join(RDConfig.RDDocsDir, 'Book/data/solubility.test.sdf')"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"trainmol = [m for m in Chem.SDMolSupplier(train)]\n",
"testmol = [m for m in Chem.SDMolSupplier(test)]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'(B) medium', '(A) low', '(C) high'}\n"
]
}
],
"source": [
"labels = set([m.GetProp('SOL_classification') for m in trainmol])\n",
"print(labels)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"label2cls = {'(A) low':0, '(B) medium':1, '(C) high':2}"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"def fp2arr(fp):\n",
" arr = np.zeros((1,))\n",
" DataStructs.ConvertToNumpyArray(fp, arr)\n",
" return arr"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"trainfps = [AllChem.GetMorganFingerprintAsBitVect(m, 2, 1024) for m in trainmol]\n",
"trainfps = np.array([fp2arr(fp) for fp in trainfps])\n",
"\n",
"testfps = [AllChem.GetMorganFingerprintAsBitVect(m, 2, 1024) for m in testmol]\n",
"testfps = np.array([fp2arr(fp) for fp in testfps])"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(1025, 1024) (1025,) (257, 1024) (257,)\n"
]
}
],
"source": [
"train_cls = [label2cls[m.GetProp('SOL_classification')] for m in trainmol]\n",
"train_cls = np.array(train_cls)\n",
"test_cls = [label2cls[m.GetProp('SOL_classification')] for m in testmol]\n",
"test_cls = np.array(test_cls)\n",
"print(trainfps.shape, train_cls.shape, testfps.shape, test_cls.shape)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"'''\n",
"#train data is devided to train and calibration data\n",
"ids = np.random.permutation(train_cls.size)\n",
"# Use first 700 data for train and second set is used for calibration\n",
"trainX, calibX = trainfps[ids[:700],:],trainfps[ids[700:],:] \n",
"trainY, calibY = train_cls[ids[:700]],train_cls[ids[700:]] \n",
"'''\n",
"trainX = trainfps\n",
"trainY = train_cls"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"testX = testfps\n",
"testY = test_cls"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"rf = RandomForestClassifier(n_estimators=500, random_state=794)\n",
"nc = ClassifierNc(ClassifierAdapter(rf))\n",
"icp = IcpClassifier(nc)\n",
"bcp = BootstrapConformalClassifier(icp)\n",
"\n",
"rf2 = RandomForestClassifier(n_estimators=500, random_state=794)\n",
"nc2 = ClassifierNc(ClassifierAdapter(rf2))\n",
"icp2 = IcpClassifier(nc2)\n",
"acp = AggregatedCp(icp2)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"bcp.fit(trainX, trainY)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"acp.fit(trainX, trainY)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"\n",
"bcp_pred95 = bcp.predict(testX, significance=0.05).astype(np.int32)\n",
"bcp_pred80 = bcp.predict(testX, significance=0.2).astype(np.int32)\n",
"\n",
"acp_pred95 = acp.predict(testX, significance=0.05).astype(np.int32)\n",
"acp_pred80 = acp.predict(testX, significance=0.2).astype(np.int32)\n"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"from nonconformist.evaluation import class_avg_c, class_n_correct, class_mean_errors"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": false
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"scrolled": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"252\n",
"219\n"
]
}
],
"source": [
"print(class_n_correct(acp_pred95, testY, significance=0.05))\n",
"print(class_n_correct(acp_pred80, testY, significance=0.2))"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"252\n",
"220\n"
]
}
],
"source": [
"print(class_n_correct(bcp_pred95, testY, significance=0.05))\n",
"print(class_n_correct(bcp_pred80, testY, significance=0.2))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.7"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment