@rebeccabilbro
Created June 5, 2019 13:00
Produce customizable classifier comparison plots
#!/usr/bin/python
# -*- coding: utf-8 -*-
# plot_classifier_comparison.py
"""
A comparison of a several classifiers in scikit-learn on synthetic datasets.
The point of this example is to illustrate the nature of decision boundaries
of different classifiers.
Particularly in high-dimensional spaces, data can more easily be separated
linearly and the simplicity of classifiers such as naive Bayes and linear SVMs
might lead to better generalization than is achieved by other classifiers.
The plots show training points in solid colors and testing points
semi-transparent. The lower right shows the classification accuracy on the test
set.
Original code source: Gaël Varoquaux and Andreas Müller
Modified for documentation by Jaques Grobler
License: BSD 3 clause
"""
###############################################################################
# Imports
###############################################################################
import numpy as np
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import BaggingClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split as tts
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.datasets import make_moons, make_circles, make_classification
###############################################################################
# Model Families
###############################################################################
rbfs = {
    "Gaussian Process": GaussianProcessClassifier(1.0 * RBF(1.0)),
    "RBF SVM": SVC(gamma=2, C=1)
}
parametrics = {
    "Logistic": LogisticRegression(solver='lbfgs'),
    "Ridge": RidgeClassifier(),
    "SGD": SGDClassifier(),
    "Multilayer Perceptron": MLPClassifier(alpha=1, max_iter=1000),
    "Passive Aggressive": PassiveAggressiveClassifier(),
    "Naive Bayes": GaussianNB(),
    "QDA": QuadraticDiscriminantAnalysis()
}
ensembles = {
    "Decision Tree": DecisionTreeClassifier(max_depth=5),
    "Extra Trees": ExtraTreesClassifier(max_depth=5, n_estimators=10),
    "Random Forest": RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    "AdaBoost": AdaBoostClassifier(n_estimators=10),
    "Gradient Boosting": GradientBoostingClassifier(n_estimators=30, max_depth=5),
    "Bagging": BaggingClassifier(n_estimators=10)
}
neighbors = {
    "kNN Minkowski": KNeighborsClassifier(n_neighbors=3, metric='minkowski'),
    "kNN Euclidean": KNeighborsClassifier(n_neighbors=3, metric='euclidean'),
    "kNN Manhattan": KNeighborsClassifier(n_neighbors=3, metric='manhattan'),
    "kNN Chebyshev": KNeighborsClassifier(n_neighbors=3, metric='chebyshev')
}
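# Any mapping of display names to fitted-or-unfitted scikit-learn estimators
# can be passed to the comparison plotter defined below. The following family
# is an illustrative sketch only -- its name and hyperparameters are
# assumptions, not part of the original gist:
linear_margins = {
    "Linear SVM": SVC(kernel='linear', C=0.025),
    "Hinge SGD": SGDClassifier(loss='hinge', max_iter=1000, tol=1e-3)
}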
###############################################################################
# Data Generation and Plotting
###############################################################################
def linearly_separable_data():
    """
    Produce a linearly separable dataset
    """
    X, y = make_classification(
        n_features=2, n_redundant=0, n_informative=2,
        random_state=1, n_clusters_per_class=1
    )
    rng = np.random.RandomState(2)
    X += 2 * rng.uniform(size=X.shape)
    return (X, y)
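# Other dataset factories can be swapped into the ``datasets`` list inside
# ``compare_classifiers`` below, as long as they return an (X, y) pair of 2D
# features and labels. The sketch below is an assumption for illustration
# (it is not used by the original gist):
def blob_data():
    """
    Produce two noisy, roughly spherical clusters as an (X, y) pair.
    """
    from sklearn.datasets import make_blobs
    return make_blobs(n_samples=100, centers=2, cluster_std=2.0, random_state=3)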
def compare_classifiers(classifiers, point_colors, colormap):
    """
    Given a mapping of classifier names to scikit-learn estimators, a colormap
    for the scattered train/test points, and a colormap for the decision
    surfaces, produce a classifier comparison plot across three synthetic
    datasets.
    """
    datasets = [
        linearly_separable_data(),
        make_moons(noise=0.3, random_state=0),
        make_circles(noise=0.2, factor=0.5, random_state=1),
    ]

    _ = plt.figure(figsize=(27, 9))
    plt_idx = 1

    # Iterate over datasets
    for dataset_idx, dataset in enumerate(datasets):
        # Preprocess data
        X, y = dataset
        X = StandardScaler().fit_transform(X)

        # Create train and test splits
        X_train, X_test, y_train, y_test = tts(
            X, y, test_size=.4, random_state=42
        )

        # Create the plot range
        x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
        y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
        xx, yy = np.meshgrid(
            np.arange(x_min, x_max, .02),
            np.arange(y_min, y_max, .02)
        )

        # Plot the data by itself
        ax = plt.subplot(len(datasets), len(classifiers) + 1, plt_idx)
        if dataset_idx == 0:
            ax.set_title("Input data")

        # Plot the training points
        ax.scatter(
            X_train[:, 0],
            X_train[:, 1],
            c=y_train,
            edgecolors='k',
            cmap=point_colors
        )

        # Plot the testing points
        ax.scatter(
            X_test[:, 0],
            X_test[:, 1],
            c=y_test,
            alpha=0.6,
            edgecolors='k',
            cmap=point_colors
        )

        ax.set_xlim(xx.min(), xx.max())
        ax.set_ylim(yy.min(), yy.max())
        ax.set_xticks(())
        ax.set_yticks(())
        plt_idx += 1

        # Iterate over classifiers
        for name, clf in classifiers.items():
            ax = plt.subplot(len(datasets), len(classifiers) + 1, plt_idx)
            clf.fit(X_train, y_train)
            score = clf.score(X_test, y_test)

            # Plot the decision boundary by assigning a color to each
            # point in the mesh [x_min, x_max] x [y_min, y_max].
            if hasattr(clf, "decision_function"):
                Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
            else:
                Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]

            # Put the result into a color plot
            Z = Z.reshape(xx.shape)
            ax.contourf(xx, yy, Z, cmap=colormap, alpha=.8)

            # Plot the training points
            ax.scatter(
                X_train[:, 0],
                X_train[:, 1],
                c=y_train,
                edgecolors='k',
                cmap=point_colors
            )

            # Plot the testing points
            ax.scatter(
                X_test[:, 0],
                X_test[:, 1],
                c=y_test,
                alpha=0.6,
                edgecolors='k',
                cmap=point_colors
            )

            ax.set_xlim(xx.min(), xx.max())
            ax.set_ylim(yy.min(), yy.max())
            ax.set_xticks(())
            ax.set_yticks(())
            if dataset_idx == 0:
                ax.set_title(name)

            # Annotate the lower right of the subplot with the test accuracy
            ax.text(
                xx.max() - .3,
                yy.min() + .3,
                ('%.2f' % score).lstrip('0'),
                horizontalalignment='right',
                size=15
            )
            plt_idx += 1

    plt.tight_layout()
    plt.show()
if __name__ == "__main__":
    from matplotlib.colors import ListedColormap

    # From https://matplotlib.org/2.0.2/examples/color/colormaps_reference.html
    decision_colors = plt.cm.viridis
    train_test_colors = ListedColormap(['#4d06a3', '#8ca803'])
    compare_classifiers(neighbors, train_test_colors, decision_colors)
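    # The same plot can be produced for any of the other model families
    # defined above (a usage sketch, not part of the original run;
    # uncomment a line to try it):
    # compare_classifiers(ensembles, train_test_colors, decision_colors)
    # compare_classifiers(parametrics, train_test_colors, decision_colors)
    # compare_classifiers(rbfs, train_test_colors, decision_colors)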