# Reproduction of the classes_ bug in the SVC classifier from sklearn
import numpy as np
from sklearn.svm import SVC
# same dataset
X = np.array([[0.29166667, 0.19366939],
              [0.28611111, 0.34763765],
              [0.26111111, 0.55716146],
              [0.73333333, 0.2016059 ],
              [0.70694444, 0.3952567 ],
              [0.66666667, 0.61271701]])
# two different labelings of the same points
y0 = np.array([0, 0, 0, 1, 1, 1])
y1 = np.array([0, 0, 1, 0, 1, 1])
# use a seed for reproducibility
SEED = 0
# train a classifier for y0
np.random.seed(SEED)
clf0 = SVC(gamma='scale', kernel='rbf', probability=True)
clf0.fit(X, y0)
print('####')
print('Classifier 0 summary:')
print('True labels: {}'.format(y0))
print('Predicted labels: {}'.format(clf0.predict(X)))
print('Classification accuracy: {}'.format(clf0.score(X, y0)))
print('--')
print('Classes: {}'.format(clf0.classes_))
print('Predicted probabilistic labeling:')
print(clf0.predict_proba(X))
## here the class ordering matches the probabilistic prediction
# ####
# Classifier 0 summary:
# True labels: [0 0 0 1 1 1]
# Predicted labels: [0 0 0 1 1 1]
# Classification accuracy: 1.0
# --
# Classes: [0 1]
# Predicted probabilistic labeling:
# [[0.98456235 0.01543765]
# [0.98680646 0.01319354]
# [0.98189501 0.01810499]
# [0.0141697 0.9858303 ]
# [0.01153142 0.98846858]
# [0.02285266 0.97714734]]
# train a classifier for y1
np.random.seed(SEED)
clf1 = SVC(gamma='scale', kernel='rbf', probability=True)
clf1.fit(X, y1)
print('####')
print('Classifier 1 summary:')
print('True labels: {}'.format(y1))
print('Predicted labels: {}'.format(clf1.predict(X)))
print('Classification accuracy: {}'.format(clf1.score(X, y1)))
print('--')
print('Classes: {}'.format(clf1.classes_))
print('Predicted probabilistic labeling:')
print(clf1.predict_proba(X))
## here the class ordering does not match the probabilistic prediction
# ####
# Classifier 1 summary:
# True labels: [0 0 1 0 1 1]
# Predicted labels: [0 0 1 0 1 1]
# Classification accuracy: 1.0
# --
# Classes: [0 1]
# Predicted probabilistic labeling:
# [[0.0774986 0.9225014 ]
# [0.21423451 0.78576549]
# [0.61478601 0.38521399]
# [0.32195121 0.67804879]
# [0.69766737 0.30233263]
# [0.9182796 0.0817204 ]]
## for the same data, with different labels and the same seed, the classifier returns probabilistic predictions in a different column order, which is not reflected in clf.classes_.
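## a minimal sketch (variable names below are mine) of how to surface the
## mismatch directly: under the classes_ ordering, the argmax of predict_proba
## should agree with predict, so any row where they differ reveals the bug
proba_argmax_labels = clf1.classes_[np.argmax(clf1.predict_proba(X), axis=1)]
mismatch_rows = np.where(proba_argmax_labels != clf1.predict(X))[0]
print('Rows where predict_proba disagrees with predict: {}'.format(mismatch_rows))
## note (my understanding): with probability=True, SVC fits its probability
## model via Platt scaling using an internal cross-validation, which is why
## predict_proba can be inconsistent with predict on small datasets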
## suggested temporary workaround
def get_ordered_classes(clf, X, y):
    y_pred = clf.predict(X)
    y_pred_proba = clf.predict_proba(X)
    index_pred = np.argmax(y_pred_proba, axis=1)
    ordered_classes = []
    n_class = y_pred_proba.shape[1]
    for class_index in range(n_class):
        # check which data points have been classified as class_index by the probability predictor
        X_indexes_for_class_index = np.where(index_pred == class_index)
        # find what label it corresponds to from the simple predictor
        class_name = y_pred[X_indexes_for_class_index]
        # make sure this is consistent and a unique label is in that list
        # a rare failure could happen if probabilities are exactly equal between all classes, e.g. [0.5 0.5], but this is so unlikely that I won't cover it
        class_name = np.unique(class_name)
        assert class_name.shape == (1,), 'Something might be wrong between predict and predict_proba in SVC'
        # add class to list
        ordered_classes.append(class_name[0])
    return np.array(ordered_classes)
print('#### classes for y0')
print('classes_ from SVC: {}'.format(clf0.classes_))
print('classes as used in predict_proba: {}'.format(get_ordered_classes(clf0, X, y0)))
# #### classes for y0
# classes_ from SVC: [0 1]
# classes as used in predict_proba: [0 1]
print('#### classes for y1')
print('classes_ from SVC: {}'.format(clf1.classes_))
print('classes as used in predict_proba: {}'.format(get_ordered_classes(clf1, X, y1)))
# #### classes for y1
# classes_ from SVC: [0 1]
# classes as used in predict_proba: [1 0]
## Solution using CalibratedClassifierCV
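## note (my understanding): with cv='prefit', CalibratedClassifierCV fits its
## sigmoid on top of the already-fitted clf (via decision_function) on the
## data passed to fit, so it bypasses SVC's internal cross-validated
## probability model and its probability columns follow classes_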
from sklearn.calibration import CalibratedClassifierCV
calibrator0 = CalibratedClassifierCV(clf0, method='sigmoid', cv='prefit')
calibrator0.fit(X, y0)
print('#### classes for y0')
print('classes_ from SVC: {}'.format(clf0.classes_))
print('classes as used in predict_proba: {}'.format(get_ordered_classes(clf0, X, y0)))
print('classes as used in Calibrator: {}'.format(get_ordered_classes(calibrator0, X, y0)))
# #### classes for y0
# classes_ from SVC: [0 1]
# classes as used in predict_proba: [0 1]
# classes as used in Calibrator: [0 1]
calibrator1 = CalibratedClassifierCV(clf1, method='sigmoid', cv='prefit')
calibrator1.fit(X, y1)
print('#### classes for y1')
print('classes_ from SVC: {}'.format(clf1.classes_))
print('classes as used in predict_proba: {}'.format(get_ordered_classes(clf1, X, y1)))
print('classes as used in Calibrator: {}'.format(get_ordered_classes(calibrator1, X, y1)))
# #### classes for y1
# classes_ from SVC: [0 1]
# classes as used in predict_proba: [1 0]
# classes as used in Calibrator: [0 1]
###
# reproduction of a bug with CalibratedClassifierCV due to the cross-validation procedure, as detailed in https://github.com/scikit-learn/scikit-learn/issues/13211#issuecomment-518797874
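## note (my understanding): with a CV splitter, CalibratedClassifierCV refits
## a clone of the base estimator and fits a sigmoid on each fold; with only 6
## points the folds are tiny, which explains the poor calibrated scores below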
from sklearn.model_selection import KFold
# reproduce a bad classification due to the cv procedure
calibrator1 = CalibratedClassifierCV(clf1, method='sigmoid', cv=KFold(n_splits=5, shuffle=True, random_state=3))
calibrator1.fit(X, y1)
print('#### info for CalibratedClassifierCV with Kfold(5)')
print('True labels: {}'.format(y1))
print('Pred labels: {}'.format(calibrator1.predict(X)))
print('Score: {}'.format(calibrator1.score(X, y1)))
print('Pred probas:')
print(calibrator1.predict_proba(X))
print('classes_ from SVC: {}'.format(clf1.classes_))
print('classes_ from CalibratedClassifierCV: {}'.format(calibrator1.classes_))
# but the classes are still correctly ordered
print('Classes as used in Calibrator with Kfold(5): {}'.format(get_ordered_classes(calibrator1, X, y1)))
# #### info for CalibratedClassifierCV with Kfold(5)
# True labels: [0 0 1 0 1 1]
# Pred labels: [0 0 0 0 1 1]
# Score: 0.8333333333333334
# Pred probas:
# [[0.59647343 0.40352657]
# [0.59110486 0.40889514]
# [0.56214059 0.43785941]
# [0.53333338 0.46666662]
# [0.49341504 0.50658496]
# [0.46666686 0.53333314]]
# classes_ from SVC: [0 1]
# classes_ from CalibratedClassifierCV: [0 1]
# Classes as used in Calibrator with Kfold(5): [0 1]
# trying to generate a reverse ordering
print('#### Trying to generate a reverse ordering...')
class_orderings = []
accuracies = []
for random_state in range(10000):
    calibrator1 = CalibratedClassifierCV(clf1, method='sigmoid', cv=KFold(n_splits=5, shuffle=True, random_state=random_state))
    calibrator1.fit(X, y1)
    try:
        # the function breaks when all predictions are of the same class; those cases do not really matter for this little experiment
        # we seek to find one case where the predictions are reversed, as for clf1 with probability=True
        class_orderings.append(get_ordered_classes(calibrator1, X, y1))
        accuracies.append(calibrator1.score(X, y1))
    except AssertionError:
        pass
uniqueValues, occurCount = np.unique(class_orderings, return_counts=True, axis=0)
print('Unique class orderings:')
# iterate over the zip object
for v, o in zip(uniqueValues, occurCount):
    print('{} occurs : {}/{} times'.format(v, o, len(class_orderings)))
# #### Trying to generate a reverse ordering...
# Unique class orderings:
# [0 1] occurs : 5955/5955 times
# The output is always [0 1], never [1 0] as SVC sometimes produces
uniqueValues, occurCount = np.unique(accuracies, return_counts=True, axis=0)
print('Unique accuracies:')
# Iterate over the zip object
for v, o in zip(uniqueValues, occurCount):
    print('{} occurs : {}/{} times'.format(v, o, len(accuracies)))
# Unique accuracies:
# 0.3333333333333333 occurs : 1294/5955 times
# 0.8333333333333334 occurs : 4661/5955 times
# There is some reverse fitting (score of 0.33), but the label association is still correct
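## a final sketch building on get_ordered_classes above: permute the
## predict_proba columns so they match clf.classes_; reorder_proba is a
## helper name I introduce here, not part of sklearn
def reorder_proba(clf, X, y):
    proba = clf.predict_proba(X)
    ordered = get_ordered_classes(clf, X, y)
    # column index in proba that corresponds to each class label
    col_for_class = {c: i for i, c in enumerate(ordered)}
    # reorder the columns into the order declared by clf.classes_
    cols = [col_for_class[c] for c in clf.classes_]
    return proba[:, cols]
## e.g. reorder_proba(clf1, X, y1) returns probabilities whose columns truly
## correspond to clf1.classes_ = [0 1]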