shark8me/gbrt_calibration.py

## gbrt_calibration.py
print(__doc__)

# Author: Alexandre Gramfort <alexandre.gramfort@telecom-paristech.fr>
#         Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
# License: BSD Style.

%matplotlib inline
import matplotlib.pyplot as plt

from sklearn import datasets
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (brier_score_loss, precision_score, recall_score,
                             f1_score)
from sklearn.calibration import CalibratedClassifierCV, calibration_curve
from sklearn.cross_validation import train_test_split
from sklearn import ensemble


# Create dataset of classification task with many redundant and few
# informative features
X, y = datasets.make_classification(n_samples=100000, n_features=20,
                                    n_informative=2, n_redundant=10,
                                    random_state=42)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.99,
                                                    random_state=42)


def plot_calibration_curve(est, name, fig_index):
    """Plot calibration curve for est w/o and with calibration. """
    # Calibrated with isotonic calibration
    isotonic = CalibratedClassifierCV(est, cv=2, method='isotonic')

    # Calibrated with sigmoid calibration
    sigmoid = CalibratedClassifierCV(est, cv=2, method='sigmoid')

    # Logistic regression with no calibration as baseline
    lr = LogisticRegression(C=1., solver='lbfgs')

    fig = plt.figure(fig_index, figsize=(10, 10))
    ax1 = plt.subplot2grid((3, 1), (0, 0), rowspan=2)
    ax2 = plt.subplot2grid((3, 1), (2, 0))

    ax1.plot([0, 1], [0, 1], "k:", label="Perfectly calibrated")
    for clf, name in [(lr, 'Logistic'),
                      (est, name),
                      (isotonic, name + ' + Isotonic'),
                      (sigmoid, name + ' + Sigmoid')]:
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        if hasattr(clf, "predict_proba"):
            prob_pos = clf.predict_proba(X_test)[:, 1]
        else:  # use decision function
            prob_pos = clf.decision_function(X_test)
            prob_pos = \
                (prob_pos - prob_pos.min()) / (prob_pos.max() - prob_pos.min())

        clf_score = brier_score_loss(y_test, prob_pos, pos_label=y.max())
        print("%s:" % name)
        print("\tBrier: %1.3f" % (clf_score))
        print("\tPrecision: %1.3f" % precision_score(y_test, y_pred))
        print("\tRecall: %1.3f" % recall_score(y_test, y_pred))
        print("\tF1: %1.3f\n" % f1_score(y_test, y_pred))

        fraction_of_positives, mean_predicted_value = \
            calibration_curve(y_test, prob_pos, n_bins=10)

        ax1.plot(mean_predicted_value, fraction_of_positives, "s-",
                 label="%s (%1.3f)" % (name, clf_score))

        ax2.hist(prob_pos, range=(0, 1), bins=10, label=name,
                 histtype="step", lw=2)

    ax1.set_ylabel("Fraction of positives")
    ax1.set_ylim([-0.05, 1.05])
    ax1.legend(loc="lower right")
    ax1.set_title('Calibration plots  (reliability curve)')

    ax2.set_xlabel("Mean predicted value")
    ax2.set_ylabel("Count")
    ax2.legend(loc="upper center", ncol=2)

    plt.tight_layout()

# Plot calibration cuve for Gaussian Naive Bayes
plot_calibration_curve(GaussianNB(), "Naive Bayes", 1)

# Plot calibration cuve for Linear SVC
plot_calibration_curve(LinearSVC(), "SVC", 2)

# Plot calibration cuve for GBRT
plot_calibration_curve(ensemble.GradientBoostingClassifier(), "GBRT", 3)

# Plot calibration cuve for RF
plot_calibration_curve(ensemble.RandomForestClassifier(), "RF", 4)

plt.show()
	print(__doc__)

	# Author: Alexandre Gramfort <alexandre.gramfort@telecom-paristech.fr>
	# Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
	# License: BSD Style.

	%matplotlib inline
	import matplotlib.pyplot as plt

	from sklearn import datasets
	from sklearn.naive_bayes import GaussianNB
	from sklearn.svm import LinearSVC
	from sklearn.linear_model import LogisticRegression
	from sklearn.metrics import (brier_score_loss, precision_score, recall_score,
	f1_score)
	from sklearn.calibration import CalibratedClassifierCV, calibration_curve
	from sklearn.cross_validation import train_test_split
	from sklearn import ensemble


	# Create dataset of classification task with many redundant and few
	# informative features
	X, y = datasets.make_classification(n_samples=100000, n_features=20,
	n_informative=2, n_redundant=10,
	random_state=42)

	X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.99,
	random_state=42)


	def plot_calibration_curve(est, name, fig_index):
	"""Plot calibration curve for est w/o and with calibration. """
	# Calibrated with isotonic calibration
	isotonic = CalibratedClassifierCV(est, cv=2, method='isotonic')

	# Calibrated with sigmoid calibration
	sigmoid = CalibratedClassifierCV(est, cv=2, method='sigmoid')

	# Logistic regression with no calibration as baseline
	lr = LogisticRegression(C=1., solver='lbfgs')

	fig = plt.figure(fig_index, figsize=(10, 10))
	ax1 = plt.subplot2grid((3, 1), (0, 0), rowspan=2)
	ax2 = plt.subplot2grid((3, 1), (2, 0))

	ax1.plot([0, 1], [0, 1], "k:", label="Perfectly calibrated")
	for clf, name in [(lr, 'Logistic'),
	(est, name),
	(isotonic, name + ' + Isotonic'),
	(sigmoid, name + ' + Sigmoid')]:
	clf.fit(X_train, y_train)
	y_pred = clf.predict(X_test)
	if hasattr(clf, "predict_proba"):
	prob_pos = clf.predict_proba(X_test)[:, 1]
	else: # use decision function
	prob_pos = clf.decision_function(X_test)
	prob_pos = \
	(prob_pos - prob_pos.min()) / (prob_pos.max() - prob_pos.min())

	clf_score = brier_score_loss(y_test, prob_pos, pos_label=y.max())
	print("%s:" % name)
	print("\tBrier: %1.3f" % (clf_score))
	print("\tPrecision: %1.3f" % precision_score(y_test, y_pred))
	print("\tRecall: %1.3f" % recall_score(y_test, y_pred))
	print("\tF1: %1.3f\n" % f1_score(y_test, y_pred))

	fraction_of_positives, mean_predicted_value = \
	calibration_curve(y_test, prob_pos, n_bins=10)

	ax1.plot(mean_predicted_value, fraction_of_positives, "s-",
	label="%s (%1.3f)" % (name, clf_score))

	ax2.hist(prob_pos, range=(0, 1), bins=10, label=name,
	histtype="step", lw=2)

	ax1.set_ylabel("Fraction of positives")
	ax1.set_ylim([-0.05, 1.05])
	ax1.legend(loc="lower right")
	ax1.set_title('Calibration plots (reliability curve)')

	ax2.set_xlabel("Mean predicted value")
	ax2.set_ylabel("Count")
	ax2.legend(loc="upper center", ncol=2)

	plt.tight_layout()

	# Plot calibration cuve for Gaussian Naive Bayes
	plot_calibration_curve(GaussianNB(), "Naive Bayes", 1)

	# Plot calibration cuve for Linear SVC
	plot_calibration_curve(LinearSVC(), "SVC", 2)

	# Plot calibration cuve for GBRT
	plot_calibration_curve(ensemble.GradientBoostingClassifier(), "GBRT", 3)

	# Plot calibration cuve for RF
	plot_calibration_curve(ensemble.RandomForestClassifier(), "RF", 4)

	plt.show()