Skip to content

Instantly share code, notes, and snippets.

@iwatobipen
Created November 28, 2019 02:25
Show Gist options
  • Save iwatobipen/8d865c617b63c8775245f491af379edf to your computer and use it in GitHub Desktop.
Save iwatobipen/8d865c617b63c8775245f491af379edf to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"RDKit WARNING: [11:24:44] Enabling RDKit 2019.09.1 jupyter extensions\n"
]
}
],
"source": [
"%matplotlib inline\n",
"import matplotlib.pyplot as plt\n",
"import os\n",
"from rdkit import Chem\n",
"from rdkit.Chem import DataStructs\n",
"from rdkit.Chem import AllChem\n",
"from rdkit import RDPaths\n",
"from rdkit.Chem.Draw import IPythonConsole\n",
"from rdkit.Chem import Draw\n",
"from rdkit.Chem import PandasTools\n",
"import numpy as np\n",
"import pandas as pd\n",
"from IPython.display import HTML"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"traindf = PandasTools.LoadSDF(os.path.join(RDPaths.RDDocsDir,'Book/data/solubility.train.sdf'))\n",
"testdf = PandasTools.LoadSDF(os.path.join(RDPaths.RDDocsDir, 'Book/data/solubility.test.sdf'))"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>ID</th>\n",
" <th>NAME</th>\n",
" <th>SOL</th>\n",
" <th>SOL_classification</th>\n",
" <th>smiles</th>\n",
" <th>ROMol</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>n-pentane</td>\n",
" <td>n-pentane</td>\n",
" <td>-3.18</td>\n",
" <td>(A) low</td>\n",
" <td>CCCCC</td>\n",
" <td><img data-content=\"rdkit/molecule\" src=\"\" alt=\"Mol\"/></td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>cyclopentane</td>\n",
" <td>cyclopentane</td>\n",
" <td>-2.64</td>\n",
" <td>(B) medium</td>\n",
" <td>C1CCCC1</td>\n",
" <td><img data-content=\"rdkit/molecule\" src=\"\" alt=\"Mol\"/></td>\n",
" </tr>\n",
" </tbody>\n",
"</table>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"HTML(traindf.head(2).to_html())"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"cls2lab = {'(A) low':0, '(B) medium':1, '(C) high':2}"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"def fp2np(fp):\n",
" arr = np.zeros((0,))\n",
" DataStructs.ConvertToNumpyArray(fp, arr)\n",
" return arr"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"trainfp = [AllChem.GetMorganFingerprintAsBitVect(m, 2) for m in traindf.ROMol]\n",
"testfp = [AllChem.GetMorganFingerprintAsBitVect(m, 2) for m in testdf.ROMol]"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"trainX = np.array([fp2np(fp) for fp in trainfp])\n",
"testX = np.array([fp2np(fp) for fp in testfp])"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"trainY = np.array([cls2lab[i] for i in traindf.SOL_classification.to_list()])\n",
"testY = np.array([cls2lab[i] for i in testdf.SOL_classification.to_list()])"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[MLENS] backend: threading\n"
]
}
],
"source": [
"from mlens.ensemble import SuperLearner\n",
"from sklearn.linear_model import LogisticRegression\n",
"from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier\n",
"from sklearn.metrics import r2_score, accuracy_score\n",
"from sklearn.svm import SVR, SVC"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Base model"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.7198443579766537"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"rf = RandomForestClassifier(n_estimators=100, random_state=794)\n",
"rf.fit(trainX, trainY)\n",
"pred = rf.predict(testX)\n",
"accuracy_score(testY, pred)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## SuperLearner is stacking model"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"SuperLearner(array_check=None, backend=None, folds=2,\n",
" layers=[Layer(backend='threading', dtype=<class 'numpy.float32'>, n_jobs=-1,\n",
" name='layer-1', propagate_features=None, raise_on_exception=True,\n",
" random_state=3251, shuffle=False,\n",
" stack=[Group(backend='threading', dtype=<class 'numpy.float32'>,\n",
" indexer=FoldIndex(X=None, folds=2, raise_on_ex...81782f0>)],\n",
" n_jobs=-1, name='group-1', raise_on_exception=True, transformers=[])],\n",
" verbose=1)],\n",
" model_selection=False, n_jobs=None, raise_on_exception=True,\n",
" random_state=794, sample_size=20,\n",
" scorer=<function accuracy_score at 0x7f61481782f0>, shuffle=False,\n",
" verbose=2)"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ensemble = SuperLearner(scorer=accuracy_score, random_state=794, verbose=2)\n",
"ensemble.add([RandomForestClassifier(n_estimators=100, random_state=794), SVC(gamma='auto', C=1000)])\n",
"ensemble.add_meta(LogisticRegression(solver='lbfgs', multi_class='auto'))\n"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Fitting 2 layers\n",
"Processing layer-1 done | 00:00:01\n",
"Processing layer-2 done | 00:00:00\n",
"Fit complete | 00:00:01\n",
"\n",
"Predicting 2 layers\n",
"Processing layer-1 done | 00:00:00\n",
"Processing layer-2 done | 00:00:00\n",
"Predict complete | 00:00:00\n"
]
},
{
"data": {
"text/plain": [
"0.7159533073929961"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ensemble.fit(trainX, trainY)\n",
"pred = ensemble.predict(testX)\n",
"accuracy_score(testY, pred)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
" score-m score-s ft-m ft-s pt-m pt-s\n",
"layer-1 randomforestclassifier 0.55 0.01 0.37 0.01 0.03 0.00\n",
"layer-1 svc 0.56 0.03 0.60 0.10 0.40 0.02"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ensemble.data"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Blending approaches"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"from mlens.ensemble import BlendEnsemble"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"BlendEnsemble(array_check=None, backend=None,\n",
" layers=[Layer(backend='threading', dtype=<class 'numpy.float32'>, n_jobs=-1,\n",
" name='layer-1', propagate_features=None, raise_on_exception=True,\n",
" random_state=None, shuffle=False,\n",
" stack=[Group(backend='threading', dtype=<class 'numpy.float32'>,\n",
" indexer=BlendIndex(X=None, raise_on_exception=...81782f0>)],\n",
" n_jobs=-1, name='group-3', raise_on_exception=True, transformers=[])],\n",
" verbose=1)],\n",
" model_selection=False, n_jobs=None, raise_on_exception=True,\n",
" random_state=None, sample_size=20,\n",
" scorer=<function accuracy_score at 0x7f61481782f0>, shuffle=False,\n",
" test_size=0.2, verbose=2)"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ensemble2 = BlendEnsemble(scorer=accuracy_score, test_size=0.2, verbose=2)\n",
"ensemble2.add([RandomForestClassifier(n_estimators=794, random_state=794),\n",
" SVC(gamma='auto')])\n",
"ensemble2.add_meta(LogisticRegression(solver='lbfgs', multi_class='auto'))"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Fitting 2 layers\n",
"Processing layer-1 done | 00:00:03\n",
"Processing layer-2 done | 00:00:00\n",
"Fit complete | 00:00:03\n"
]
},
{
"data": {
"text/plain": [
"BlendEnsemble(array_check=None, backend=None,\n",
" layers=[Layer(backend='threading', dtype=<class 'numpy.float32'>, n_jobs=-1,\n",
" name='layer-1', propagate_features=None, raise_on_exception=True,\n",
" random_state=None, shuffle=False,\n",
" stack=[Group(backend='threading', dtype=<class 'numpy.float32'>,\n",
" indexer=BlendIndex(X=None, raise_on_exception=...81782f0>)],\n",
" n_jobs=-1, name='group-3', raise_on_exception=True, transformers=[])],\n",
" verbose=1)],\n",
" model_selection=False, n_jobs=None, raise_on_exception=True,\n",
" random_state=None, sample_size=20,\n",
" scorer=<function accuracy_score at 0x7f61481782f0>, shuffle=False,\n",
" test_size=0.2, verbose=2)"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ensemble2.fit(trainX, trainY)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Predicting 2 layers\n",
"Processing layer-1 done | 00:00:00\n",
"Processing layer-2 done | 00:00:00\n",
"Predict complete | 00:00:00\n"
]
}
],
"source": [
"pred_b = ensemble2.predict(testX)"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
" score-m score-s ft-m ft-s pt-m pt-s\n",
"layer-1 randomforestclassifier 0.60 0.00 3.21 0.00 0.08 0.00\n",
"layer-1 svc 0.38 0.00 1.72 0.00 0.39 0.00"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ensemble2.data"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.669260700389105"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"accuracy_score(pred_b, testY)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
@iwatobipen
Copy link
Author

Example of ensemble learning with ml-ens

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment