Produce customizable classifier comparison plots
#!/usr/bin/python
# -*- coding: utf-8 -*-
# plot_classifier_comparison.py
""" | |
A comparison of a several classifiers in scikit-learn on synthetic datasets. | |
The point of this example is to illustrate the nature of decision boundaries | |
of different classifiers. | |
Particularly in high-dimensional spaces, data can more easily be separated | |
linearly and the simplicity of classifiers such as naive Bayes and linear SVMs | |
might lead to better generalization than is achieved by other classifiers. | |
The plots show training points in solid colors and testing points | |
semi-transparent. The lower right shows the classification accuracy on the test | |
set. | |
Original code source: Gaël Varoquaux and Andreas Müller | |
Modified for documentation by Jaques Grobler | |
License: BSD 3 clause | |
""" | |

###############################################################################
# Imports
###############################################################################
import numpy as np
import matplotlib.pyplot as plt

from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import BaggingClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split as tts
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.datasets import make_moons, make_circles, make_classification


###############################################################################
# Model Families
###############################################################################
rbfs = {
    "Gaussian Process": GaussianProcessClassifier(1.0 * RBF(1.0)),
    "RBF SVM": SVC(gamma=2, C=1)
}

parametrics = {
    "Logistic": LogisticRegression(solver='lbfgs'),
    "Ridge": RidgeClassifier(),
    "SGD": SGDClassifier(),
    "Multilayer Perceptron": MLPClassifier(alpha=1, max_iter=1000),
    "Passive Aggressive": PassiveAggressiveClassifier(),
    "Naive Bayes": GaussianNB(),
    "QDA": QuadraticDiscriminantAnalysis()
}

ensembles = {
    "Decision Tree": DecisionTreeClassifier(max_depth=5),
    "Extra Trees": ExtraTreesClassifier(max_depth=5, n_estimators=10),
    "Random Forest": RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    "AdaBoost": AdaBoostClassifier(n_estimators=10),
    "Gradient Boosting": GradientBoostingClassifier(n_estimators=30, max_depth=5),
    "Bagging": BaggingClassifier(n_estimators=10)
}

neighbors = {
    "kNN Minkowski": KNeighborsClassifier(n_neighbors=3, metric='minkowski'),
    "kNN Euclidean": KNeighborsClassifier(n_neighbors=3, metric='euclidean'),
    "kNN Manhattan": KNeighborsClassifier(n_neighbors=3, metric='manhattan'),
    "kNN Chebyshev": KNeighborsClassifier(n_neighbors=3, metric='chebyshev')
}
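
# Any mapping of display names to scikit-learn estimators can be passed to
# compare_classifiers() below; each estimator gets one column of plots and
# each synthetic dataset one row.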


###############################################################################
# Data Generation and Plotting
###############################################################################
def linearly_separable_data():
    """
    Produce a linearly separable dataset.
    """
    X, y = make_classification(
        n_features=2, n_redundant=0, n_informative=2,
        random_state=1, n_clusters_per_class=1
    )
    rng = np.random.RandomState(2)
    X += 2 * rng.uniform(size=X.shape)
    return (X, y)


def compare_classifiers(classifiers, point_colors, colormap):
    """
    Given a mapping of classifier names to scikit-learn estimators, a colormap
    for the training and testing points, and a colormap for the decision
    surfaces, produce a custom classifier comparison plot.
    """
    datasets = [
        linearly_separable_data(),
        make_moons(noise=0.3, random_state=0),
        make_circles(noise=0.2, factor=0.5, random_state=1),
    ]

    _ = plt.figure(figsize=(27, 9))
    plt_idx = 1

    # Iterate over datasets
    for dataset_idx, dataset in enumerate(datasets):
        # Preprocess data
        X, y = dataset
        X = StandardScaler().fit_transform(X)

        # Create train and test splits
        X_train, X_test, y_train, y_test = tts(
            X, y, test_size=.4, random_state=42
        )

        # Create the plot range
        x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
        y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
        xx, yy = np.meshgrid(
            np.arange(x_min, x_max, .02),
            np.arange(y_min, y_max, .02)
        )

        # Plot the data by itself
        ax = plt.subplot(len(datasets), len(classifiers) + 1, plt_idx)
        if dataset_idx == 0:
            ax.set_title("Input data")

        # Plot the training points
        ax.scatter(
            X_train[:, 0],
            X_train[:, 1],
            c=y_train,
            edgecolors='k',
            cmap=point_colors
        )

        # Plot the testing points
        ax.scatter(
            X_test[:, 0],
            X_test[:, 1],
            c=y_test,
            alpha=0.6,
            edgecolors='k',
            cmap=point_colors
        )

        ax.set_xlim(xx.min(), xx.max())
        ax.set_ylim(yy.min(), yy.max())
        ax.set_xticks(())
        ax.set_yticks(())
        plt_idx += 1

        # Iterate over classifiers
        for name, clf in classifiers.items():
            ax = plt.subplot(len(datasets), len(classifiers) + 1, plt_idx)
            clf.fit(X_train, y_train)
            score = clf.score(X_test, y_test)

            # Plot the decision boundary. For that, we will assign a color to
            # each point in the mesh [x_min, x_max]x[y_min, y_max].
            if hasattr(clf, "decision_function"):
                Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
            else:
                Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]

            # Put the result into a color plot
            Z = Z.reshape(xx.shape)
            ax.contourf(xx, yy, Z, cmap=colormap, alpha=.8)

            # Plot the training points
            ax.scatter(
                X_train[:, 0],
                X_train[:, 1],
                c=y_train,
                edgecolors='k',
                cmap=point_colors
            )

            # Plot the testing points
            ax.scatter(
                X_test[:, 0],
                X_test[:, 1],
                c=y_test,
                alpha=0.6,
                edgecolors='k',
                cmap=point_colors
            )

            ax.set_xlim(xx.min(), xx.max())
            ax.set_ylim(yy.min(), yy.max())
            ax.set_xticks(())
            ax.set_yticks(())
            if dataset_idx == 0:
                ax.set_title(name)
            # Annotate the lower right corner with the test set accuracy
            ax.text(
                xx.max() - .3,
                yy.min() + .3,
                ('%.2f' % score).lstrip('0'),
                horizontalalignment='right',
                size=15
            )
            plt_idx += 1

    plt.tight_layout()
    plt.show()


if __name__ == "__main__":
    from matplotlib.colors import ListedColormap

    # From https://matplotlib.org/2.0.2/examples/color/colormaps_reference.html
    decision_colors = plt.cm.viridis
    train_test_colors = ListedColormap(['#4d06a3', '#8ca803'])
    compare_classifiers(neighbors, train_test_colors, decision_colors)
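
    # A hedged sketch, not part of the original gist: the other model family
    # dicts defined above can be compared in the same way. The colormap
    # choices here are illustrative assumptions; any matplotlib colormap
    # should work.
    #
    #   compare_classifiers(parametrics, train_test_colors, plt.cm.coolwarm)
    #   compare_classifiers(ensembles, train_test_colors, plt.cm.coolwarm)
    #   compare_classifiers(rbfs, train_test_colors, plt.cm.coolwarm)
    #
    # All families can also be compared at once by merging the dicts, though
    # the fixed figsize above may get crowded with that many columns:
    #
    #   all_models = {**rbfs, **parametrics, **ensembles, **neighbors}
    #   compare_classifiers(all_models, train_test_colors, decision_colors)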