Produce customizable classifier comparison plots
#!/usr/bin/python
# -*- coding: utf-8 -*-
# plot_classifier_comparison.py
""" | |
A comparison of a several classifiers in scikit-learn on synthetic datasets. | |
The point of this example is to illustrate the nature of decision boundaries | |
of different classifiers. | |
Particularly in high-dimensional spaces, data can more easily be separated | |
linearly and the simplicity of classifiers such as naive Bayes and linear SVMs | |
might lead to better generalization than is achieved by other classifiers. | |
The plots show training points in solid colors and testing points | |
semi-transparent. The lower right shows the classification accuracy on the test | |
set. | |
Original code source: Gaël Varoquaux and Andreas Müller | |
Modified for documentation by Jaques Grobler | |
License: BSD 3 clause | |
""" | |

###############################################################################
# Imports
###############################################################################
import numpy as np
import matplotlib.pyplot as plt

from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import BaggingClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split as tts
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.datasets import make_moons, make_circles, make_classification


###############################################################################
# Model Families
###############################################################################
rbfs = {
    "Gaussian Process": GaussianProcessClassifier(1.0 * RBF(1.0)),
    "RBF SVM": SVC(gamma=2, C=1)
}

parametrics = {
    "Logistic": LogisticRegression(solver='lbfgs'),
    "Ridge": RidgeClassifier(),
    "SGD": SGDClassifier(),
    "Multilayer Perceptron": MLPClassifier(alpha=1, max_iter=1000),
    "Passive Aggressive": PassiveAggressiveClassifier(),
    "Naive Bayes": GaussianNB(),
    "QDA": QuadraticDiscriminantAnalysis()
}

ensembles = {
    "Decision Tree": DecisionTreeClassifier(max_depth=5),
    "Extra Trees": ExtraTreesClassifier(max_depth=5, n_estimators=10),
    "Random Forest": RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    "AdaBoost": AdaBoostClassifier(n_estimators=10),
    "Gradient Boosting": GradientBoostingClassifier(n_estimators=30, max_depth=5),
    "Bagging": BaggingClassifier(n_estimators=10)
}

neighbors = {
    "kNN Minkowski": KNeighborsClassifier(n_neighbors=3, metric='minkowski'),
    "kNN Euclidean": KNeighborsClassifier(n_neighbors=3, metric='euclidean'),
    "kNN Manhattan": KNeighborsClassifier(n_neighbors=3, metric='manhattan'),
    "kNN Chebyshev": KNeighborsClassifier(n_neighbors=3, metric='chebyshev')
}
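
# Any mapping of display names to scikit-learn estimators can be passed to
# compare_classifiers() below; each estimator gets one column of plots and
# each synthetic dataset one row.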


###############################################################################
# Data Generation and Plotting
###############################################################################
def linearly_separable_data():
    """
    Produce a linearly separable dataset.
    """
    X, y = make_classification(
        n_features=2, n_redundant=0, n_informative=2,
        random_state=1, n_clusters_per_class=1
    )
    rng = np.random.RandomState(2)
    X += 2 * rng.uniform(size=X.shape)
    return (X, y)


def compare_classifiers(classifiers, point_colors, colormap):
    """
    Given a mapping of classifier names to scikit-learn estimators, a colormap
    for the training and testing points, and a colormap for the decision
    surfaces, produce a custom classifier comparison plot.
    """
    datasets = [
        linearly_separable_data(),
        make_moons(noise=0.3, random_state=0),
        make_circles(noise=0.2, factor=0.5, random_state=1),
    ]

    _ = plt.figure(figsize=(27, 9))
    plt_idx = 1

    # Iterate over datasets
    for dataset_idx, dataset in enumerate(datasets):
        # Preprocess data
        X, y = dataset
        X = StandardScaler().fit_transform(X)

        # Create train and test splits
        X_train, X_test, y_train, y_test = tts(
            X, y, test_size=.4, random_state=42
        )

        # Create the plot range
        x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
        y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
        xx, yy = np.meshgrid(
            np.arange(x_min, x_max, .02),
            np.arange(y_min, y_max, .02)
        )

        # Plot the data by itself
        ax = plt.subplot(len(datasets), len(classifiers) + 1, plt_idx)
        if dataset_idx == 0:
            ax.set_title("Input data")

        # Plot the training points
        ax.scatter(
            X_train[:, 0],
            X_train[:, 1],
            c=y_train,
            edgecolors='k',
            cmap=point_colors
        )

        # Plot the testing points
        ax.scatter(
            X_test[:, 0],
            X_test[:, 1],
            c=y_test,
            alpha=0.6,
            edgecolors='k',
            cmap=point_colors
        )

        ax.set_xlim(xx.min(), xx.max())
        ax.set_ylim(yy.min(), yy.max())
        ax.set_xticks(())
        ax.set_yticks(())
        plt_idx += 1

        # Iterate over classifiers
        for name, clf in classifiers.items():
            ax = plt.subplot(len(datasets), len(classifiers) + 1, plt_idx)
            clf.fit(X_train, y_train)
            score = clf.score(X_test, y_test)

            # Plot the decision boundary. For that, we will assign a color to
            # each point in the mesh [x_min, x_max]x[y_min, y_max].
            if hasattr(clf, "decision_function"):
                Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
            else:
                Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]

            # Put the result into a color plot
            Z = Z.reshape(xx.shape)
            ax.contourf(xx, yy, Z, cmap=colormap, alpha=.8)

            # Plot the training points
            ax.scatter(
                X_train[:, 0],
                X_train[:, 1],
                c=y_train,
                edgecolors='k',
                cmap=point_colors
            )

            # Plot the testing points
            ax.scatter(
                X_test[:, 0],
                X_test[:, 1],
                c=y_test,
                alpha=0.6,
                edgecolors='k',
                cmap=point_colors
            )

            ax.set_xlim(xx.min(), xx.max())
            ax.set_ylim(yy.min(), yy.max())
            ax.set_xticks(())
            ax.set_yticks(())
            if dataset_idx == 0:
                ax.set_title(name)
            # Annotate the lower right corner with the test set accuracy
            ax.text(
                xx.max() - .3,
                yy.min() + .3,
                ('%.2f' % score).lstrip('0'),
                horizontalalignment='right',
                size=15
            )
            plt_idx += 1

    plt.tight_layout()
    plt.show()


if __name__ == "__main__":
    from matplotlib.colors import ListedColormap

    # From https://matplotlib.org/2.0.2/examples/color/colormaps_reference.html
    decision_colors = plt.cm.viridis
    train_test_colors = ListedColormap(['#4d06a3', '#8ca803'])
    compare_classifiers(neighbors, train_test_colors, decision_colors)
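
    # A hedged sketch, not part of the original gist: the other model family
    # dicts defined above can be compared in the same way. The colormap
    # choices here are illustrative assumptions; any matplotlib colormap
    # should work.
    #
    #   compare_classifiers(parametrics, train_test_colors, plt.cm.coolwarm)
    #   compare_classifiers(ensembles, train_test_colors, plt.cm.coolwarm)
    #   compare_classifiers(rbfs, train_test_colors, plt.cm.coolwarm)
    #
    # All families can also be compared at once by merging the dicts, though
    # the fixed figsize above may get crowded with that many columns:
    #
    #   all_models = {**rbfs, **parametrics, **ensembles, **neighbors}
    #   compare_classifiers(all_models, train_test_colors, decision_colors)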