Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save smihael/60b101c0f04ba869da2fb345c6ae3aa3 to your computer and use it in GitHub Desktop.
Save smihael/60b101c0f04ba869da2fb345c6ae3aa3 to your computer and use it in GitHub Desktop.
Combining several imperfect classifiers to reduce overall classification error.ipynb
{
"cells": [
{
"cell_type": "markdown",
"id": "66a3c05e",
"metadata": {},
"source": [
"## Combining several imperfect classifiers to reduce overall classification error\n",
"\n",
"This notebook illustrates one of the basic concepts in ensemble learning.\n",
"\n",
"By combining classifiers that are trained on different subsets of the training data, it is possible to achieve superior classifier performance.\n",
"\n",
"Figure is adapted from an example published in:\n",
"- Robi Polikar. Ensemble based systems in decision making. Circuits and Systems Magazine, IEEE, 6(3):21–45, 2006.\n",
" Link to the original publication: https://doi.org/10.1109/MCAS.2006.1688199\n",
" \n",
"\n",
"Code author: [smihael](https://github.com/smihael)\n",
"\n",
"Licence: CC-BY-SA 4.0"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "34505114",
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"from sklearn import datasets\n",
"from sklearn.svm import SVC\n",
"from sklearn.tree import DecisionTreeClassifier\n",
"from sklearn.neighbors import KNeighborsClassifier\n",
"from sklearn.ensemble import VotingClassifier\n",
"import matplotlib.pyplot as plt\n",
"from matplotlib.colors import ListedColormap\n",
"\n",
"from matplotlib.patches import Patch\n",
"from matplotlib.lines import Line2D"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "82d86921",
"metadata": {},
"outputs": [],
"source": [
"# Create a meshgrid for plotting\n",
"def create_meshgrid(X, h=.02):\n",
" x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1\n",
" y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1\n",
" xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))\n",
" return xx, yy"
]
},
{
"cell_type": "code",
"execution_count": 70,
"id": "13ca2a30",
"metadata": {},
"outputs": [],
"source": [
"def plot_decision_boundaries(clf, X, y, xx, yy, Z, title, subplot, \n",
" X_sub=None,\n",
" color='lightgray',\n",
" marker_types=['^', 's', 'o'],\n",
" fill_areas=False,\n",
" plot_boundary=True,\n",
" show_legend=False):\n",
" \n",
" if fill_areas:\n",
" subplot.contourf(xx, yy, Z, alpha=0.1, cmap=ListedColormap(['lightgray','gray','darkgray']))\n",
" elif plot_boundary:\n",
" subplot.contour(xx, yy, Z, alpha=0.5, cmap=ListedColormap([color]))\n",
"\n",
" # Plot all points with gray color first\n",
" for idx, cl in enumerate(np.unique(y)):\n",
" subplot.scatter(x=X[y == cl, 0], y=X[y == cl, 1], c='0.5', marker=marker_types[idx],\n",
" label=f'Class {cl}', edgecolors='k', s=20, alpha=0.5)\n",
"\n",
" # Plot points from the training subset with specified color\n",
" if X_sub is not None:\n",
" for idx, cl in enumerate(np.unique(y)):\n",
" indices = np.where(np.logical_and(y == cl, np.isin(range(len(X)), X_sub)))[0]\n",
" if len(indices) > 0:\n",
" subplot.scatter(x=X[indices, 0], y=X[indices, 1], c=color, marker=marker_types[idx],\n",
" label=f'Class {cl} (subset)', edgecolors='k', s=20, alpha=1)\n",
"\n",
" subplot.set_xlim(xx.min(), xx.max())\n",
" subplot.set_ylim(yy.min(), yy.max())\n",
" subplot.set_xlabel('Feature 1')\n",
" subplot.set_ylabel('Feature 2')\n",
" subplot.set_title(title)\n",
" if show_legend:\n",
" subplot.legend()"
]
},
{
"cell_type": "code",
"execution_count": 83,
"id": "b3113ea0",
"metadata": {},
"outputs": [],
"source": [
"# Create a synthetic dataset\n",
"# X, y = datasets.make_classification(n_samples=100, n_features=2, n_redundant=0, n_clusters_per_class=1, n_classes=3, random_state=5)\n",
"\n",
"# ... or use iris dataset\n",
"iris = datasets.load_iris()\n",
"X = iris.data[:, [0, 2]]\n",
"y = iris.target\n",
"\n",
"# Split the data into three different subsets\n",
"np.random.seed(2)\n",
"subset_indices_1 = np.random.choice(range(len(X)), size=int(len(X)/3), replace=False)\n",
"subset_indices_2 = np.random.choice(list(set(range(len(X))) - set(subset_indices_1)), size=int(len(X)/3), replace=False)\n",
"subset_indices_3 = list(set(range(len(X))) - set(subset_indices_1) - set(subset_indices_2))\n",
"\n",
"X_sub1, y_sub1 = X[subset_indices_1], y[subset_indices_1]\n",
"X_sub2, y_sub2 = X[subset_indices_2], y[subset_indices_2]\n",
"X_sub3, y_sub3 = X[subset_indices_3], y[subset_indices_3]\n",
"\n",
"# Define the base classifiers, trained on different subsets\n",
"clf=[None]*3\n",
"params={'kernel': 'rbf','random_state': 0}\n",
"clf[0] = SVC(**params).fit(X_sub1, y_sub1)\n",
"clf[1] = SVC(**params).fit(X_sub2, y_sub2)\n",
"clf[2] = SVC(**params).fit(X_sub3, y_sub3)\n",
"\n",
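"# Define an ensemble of the three classifiers\n",
"# NOTE: VotingClassifier.fit() below clones each estimator and refits it on the full (X, y),\n",
"# so the ensemble does not reuse the subset-trained models stored in clf.\n",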
"ensemble = VotingClassifier(estimators=[('a', clf[0]), ('b', clf[1]), ('c', clf[2])], voting='hard')\n",
"ensemble.fit(X, y)\n",
"\n",
"# Create meshgrid\n",
"xx, yy = create_meshgrid(X)\n",
"\n",
"# Get decision boundaries \n",
"Z=[None]*3\n",
"for i in range(3):\n",
" Z[i] = clf[i].predict(np.c_[xx.ravel(), yy.ravel()])\n",
" Z[i] = Z[i].reshape(xx.shape)\n",
" \n",
" \n",
"Z_ensamble = ensemble.predict(np.c_[xx.ravel(), yy.ravel()])\n",
"Z_ensamble = Z_ensamble.reshape(xx.shape)"
]
},
{
"cell_type": "code",
"execution_count": 84,
"id": "a297c31c",
"metadata": {
"scrolled": false
},
"outputs": [
{
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 1080x720 with 6 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"# Plotting decision boundaries\n",
"fig, axs = plt.subplots(2, 3, figsize=(15, 10), gridspec_kw={'height_ratios': [1, 1], 'hspace': 0.75})\n",
"\n",
"# Define colors for different subsets\n",
"colors = ['blue', 'red', 'green']\n",
"\n",
"plot_decision_boundaries(clf[0], X, y, xx, yy, Z[0], 'Classifier 1', axs[0, 0], subset_indices_1, color=colors[0])\n",
"plot_decision_boundaries(clf[1], X, y, xx, yy, Z[1], 'Classifier 2', axs[0, 1], subset_indices_2, color=colors[1])\n",
"plot_decision_boundaries(clf[2], X, y, xx, yy, Z[2], 'Classifier 3', axs[0, 2], subset_indices_3, color=colors[2],fill_areas=True)\n",
"\n",
"# Plot all decision boundaries together\n",
"plot_decision_boundaries(ensemble, X, y, xx, yy, Z_ensamble, 'Different decision boundaries', axs[1, 0], plot_boundary=False)\n",
"d=[None]*3\n",
"for i in range(3):\n",
" d[i] = axs[1, 0].contour(xx, yy, Z[i], alpha=0.1, colors=colors[i])\n",
" #axs[1, 0].clabel(d[i], fmt=f'Classifier {i}', inline_spacing=10, fontsize=8)\n",
"\n",
"\n",
"# Plot ensemble\n",
"plot_decision_boundaries(ensemble, X, y, xx, yy, Z_ensamble, 'Ensemble and\\ncombined decision boundary', axs[1, 1])\n",
"\n",
"\n",
"# Clear the last subplot for the legend\n",
"axs[1, 2].clear()\n",
"axs[1, 2].axis('off')\n",
"\n",
"# Custom legend for the entire figure\n",
"legend_elements = [\n",
" Line2D([0], [0], color='w', markerfacecolor='lightgray', markeredgecolor='k', marker='^', markersize=10, label='Class 0'),\n",
" Line2D([0], [0], color='w', markerfacecolor='lightgray', markeredgecolor='k', marker='s', markersize=10, label='Class 1'),\n",
" Line2D([0], [0], color='w', markerfacecolor='lightgray', markeredgecolor='k', marker='o', markersize=10, label='Class 2'),\n",
" Line2D([0], [0], color='w', markerfacecolor=colors[0], marker='*', markersize=10, label='Classifier 1\\ntraining subset'), \n",
" Line2D([0], [0], color='w', markerfacecolor=colors[1], marker='*', markersize=10, label='Classifier 2\\ntraining subset'), \n",
" Line2D([0], [0], color='w', markerfacecolor=colors[2], marker='*', markersize=10, label='Classifier 3\\ntraining subset'), \n",
"\n",
" Line2D([0], [0], color=colors[0], lw=2, label='Classifier 1\\nboundary'),\n",
" Line2D([0], [0], color=colors[1], lw=2, label='Classifier 2\\nboundary'),\n",
" Line2D([0], [0], color=colors[2], lw=2, label='Classifier 3\\nboundary'),\n",
"]\n",
"axs[1, 2].legend(handles=legend_elements, loc='right')\n",
"\n",
"\n",
"# The second subplot in the second row should be positioned to make space for the horizontal arrow\n",
"axs[1, 1].set_position([0.5, axs[1,0].get_position().y0,\n",
" axs[0, 0].get_position().width,\n",
" axs[0, 0].get_position().height])\n",
"\n",
"\n",
"# Add arrows from the top row to the 4th subplot, shortening them so the heads are not overlapped (patchB)\n",
"\n",
"arrow_end_x = axs[1, 0].get_position().x0 + 0.5 * axs[1, 0].get_position().width\n",
"arrow_end_y = axs[1, 0].get_position().y0 + 0.5 * axs[1, 0].get_position().height\n",
"\n",
"for i in range(3):\n",
" axs[0, i].annotate(\"\", xy=(arrow_end_x, arrow_end_y), xycoords='figure fraction', \n",
" xytext=(0.5, -0.2), textcoords='axes fraction',\n",
" arrowprops=dict(arrowstyle=\"simple\",\n",
" color=colors[i],\n",
" patchB=axs[1,0],\n",
" shrinkB=20))\n",
"\n",
" \n",
"# Add a big horizontal arrow between the subplots in the second row\n",
"\n",
"arrow_start_x = axs[1, 0].get_position().x1 # x position where the fourth subplot ends\n",
"arrow_end_x = axs[1, 1].get_position().x0 # x position where the fifth subplot starts\n",
"\n",
"arrow_y = axs[1, 0].get_position().y0 + axs[1, 0].get_position().height/3\n",
"\n",
"ma=axs[1, 0].annotate(\"\", xy=(arrow_end_x-0.14, arrow_y), xycoords='figure fraction', \n",
" xytext=(arrow_start_x-0.07, arrow_y), textcoords='figure fraction',\n",
" arrowprops=dict(arrowstyle=\"simple,head_width=0.8,head_length=0.8\", color=\"0.5\", lw=5))\n",
"\n",
"# Let's display the plot to see the current output\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 85,
"id": "06e37e04",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Classifier 1 accuracy: 0.93\n",
"Classifier 2 accuracy: 0.93\n",
"Classifier 3 accuracy: 0.95\n",
"Ensemble classifier accuracy: 0.96\n"
]
}
],
"source": [
"from sklearn.metrics import accuracy_score\n",
"\n",
"# We won't do the classical train/test split with train_test_split(X, y, test_size=0.3), \n",
"# as this is a specific example, where we want to see graphically\n",
"# what training data went in each of the base classifiers and\n",
"# how the classifier performs on the overall dataset. \n",
"\n",
"X_test = X\n",
"y_test = y\n",
"\n",
"accuracies = []\n",
"\n",
"for i, classifier in enumerate(clf):\n",
" y_pred = classifier.predict(X_test)\n",
" accuracy = accuracy_score(y_test, y_pred)\n",
" accuracies.append(accuracy)\n",
" print(f\"Classifier {i+1} accuracy: {accuracy:.2f}\")\n",
"\n",
"# Print accuracy of the ensemble classifier\n",
"y_pred_ensemble = ensemble.predict(X_test)\n",
"accuracy_ensemble = accuracy_score(y_test, y_pred_ensemble)\n",
"print(f\"Ensemble classifier accuracy: {accuracy_ensemble:.2f}\")\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
},
"varInspector": {
"cols": {
"lenName": 16,
"lenType": 16,
"lenVar": 40
},
"kernels_config": {
"python": {
"delete_cmd_postfix": "",
"delete_cmd_prefix": "del ",
"library": "var_list.py",
"varRefreshCmd": "print(var_dic_list())"
},
"r": {
"delete_cmd_postfix": ") ",
"delete_cmd_prefix": "rm(",
"library": "var_list.r",
"varRefreshCmd": "cat(var_dic_list()) "
}
},
"types_to_exclude": [
"module",
"function",
"builtin_function_or_method",
"instance",
"_Feature"
],
"window_display": false
}
},
"nbformat": 4,
"nbformat_minor": 5
}
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import ParameterGrid
# Search over subset seeds and SVC hyper-parameters for a configuration in
# which a majority vote of three SVCs, each trained on a different third of the
# data, is more accurate on the whole dataset than every one of its members.
#
# NOTE: relies on `np`, `X` and `y` already being defined by the notebook cells
# (the numpy import, the feature matrix and its label vector).

# Define the search space
subset_seed_range = range(0, 100)  # Range of seeds to try
param_grid = {
    'C': [0.1, 1, 10],
    'gamma': [0.001, 0.01, 0.1, 1],
    'kernel': ['rbf'],
    # SVC ignores random_state unless probability=True, so sweeping several
    # values only repeats identical fits; a single value keeps the key in the
    # reported parameters without the redundant work.
    'random_state': [0],
}
all_combinations = [(seed, params) for seed in subset_seed_range for params in ParameterGrid(param_grid)]


def _three_way_split(n_samples, seed):
    """Return three disjoint index collections that together cover range(n_samples).

    The first two are random draws of int(n_samples / 3) indices each; the third
    is whatever is left. A private RandomState seeded with *seed* yields the same
    draws as np.random.seed(seed) followed by np.random.choice, but leaves the
    global random state untouched.
    """
    rng = np.random.RandomState(seed)
    size = int(n_samples / 3)
    first = rng.choice(range(n_samples), size=size, replace=False)
    # sorted() makes the candidate order explicit instead of relying on set
    # iteration order.
    remaining = sorted(set(range(n_samples)) - set(first))
    second = rng.choice(remaining, size=size, replace=False)
    third = sorted(set(remaining) - set(second))
    return first, second, third


def _majority_vote(predictions, classes):
    """Combine per-model label predictions by hard voting.

    *predictions* is a sequence of equally long label arrays (one per model) and
    *classes* the sorted array of all labels. Ties are resolved in favour of the
    class that comes first in *classes*.
    """
    encoded = np.searchsorted(classes, np.asarray(predictions))
    # tallies[s, k] = number of models that voted class k for sample s
    tallies = (encoded[:, :, None] == np.arange(len(classes))).sum(axis=0)
    return classes[tallies.argmax(axis=1)]


# Initialize variables to track the best performance
best_ensemble_accuracy = 0
best_combination = None
n_passed = 0
classes = np.unique(y)
splits = {}  # seed -> index subsets, so each split is computed only once

# Perform the search
for seed, params in all_combinations:
    if seed not in splits:
        splits[seed] = _three_way_split(len(X), seed)

    # Train one SVC per subset with the given parameters
    classifiers = [SVC(**params).fit(X[idx], y[idx]) for idx in splits[seed]]

    # Vote with the models fitted above. VotingClassifier.fit() would clone and
    # refit every estimator on the data passed to it, discarding the
    # subset-specific training this experiment is about.
    predictions = [model.predict(X) for model in classifiers]
    ensemble_accuracy = accuracy_score(y, _majority_vote(predictions, classes))
    base_accuracies = [accuracy_score(y, predicted) for predicted in predictions]

    # We don't want 'bad' examples: the ensemble must beat every base classifier
    passed = all(acc < ensemble_accuracy for acc in base_accuracies)
    if not passed:
        continue
    n_passed += 1

    # Check if this combination is the best so far
    if ensemble_accuracy > best_ensemble_accuracy:
        best_ensemble_accuracy = ensemble_accuracy
        best_combination = (seed, params)

# Output the results
if best_combination is None:
    print("No combination produced an ensemble that beats all of its base classifiers.")
else:
    print(f"Best ensemble accuracy: {best_ensemble_accuracy:.2f}")
    print(f"Best combination: Seed={best_combination[0]}, Parameters={best_combination[1]}")
Display the source blob
Display the rendered blob
Raw
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Display the source blob
Display the rendered blob
Raw
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment