Standard metrics for both classification and regression problems in supervised learning
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Metrics for measuring the performance of a model on a given dataset,
for both kinds of supervised problems: *classification* and *regression*.
Implementation extracted from: https://github.com/leferrad/learninspy/blob/master/learninspy/utils/evaluation.py
NOTE: Only working with Python 2.7.x
"""

__author__ = 'leferrad'

import numpy as np


class ClassificationMetrics(object):
    """
    Metrics to evaluate the performance of a model in terms of classification.
    Based on the list of measures presented in the publication of Sokolova et al. [sokolova2009systematic]_.

    :param predicted_actual: list of tuples (predicted, actual)
    :param n_classes: int, number of classes handled in the classification task

    >>> predict = [0, 1, 0, 2, 2, 1]
    >>> labels = [0, 1, 1, 2, 1, 0]
    >>> metrics = ClassificationMetrics(zip(predict, labels), 3)
    >>> metrics.measures.keys()
    ['Recall', 'F-measure', 'Precision', 'Accuracy']
    >>> metrics.accuracy()
    0.5
    >>> metrics.f_measure()
    0.5499999999999999
    >>> metrics.precision()
    0.5
    >>> metrics.evaluate('Recall')
    0.611111111111111
    >>> metrics.confusion_matrix()
    array([[1, 1, 0],
           [1, 1, 1],
           [0, 0, 1]])

    **References**:

    .. [sokolova2009systematic] Sokolova, M., & Lapalme, G. (2009).
       A systematic analysis of performance measures for classification tasks.
       Information Processing & Management, 45(4), 427-437.
    """
    # See http://machine-learning.tumblr.com/post/1209400132/mathematical-definitions-for-precisionrecall-for
    # See http://rali.iro.umontreal.ca/rali/sites/default/files/publis/SokolovaLapalme-JIPM09.pdf

    def __init__(self, predicted_actual, n_classes):
        self.predicted_actual = predicted_actual
        self.tp = []
        self.fp = []
        self.fn = []
        for c in xrange(n_classes):
            self.tp.append(sum(map(lambda (p, a): p == c and a == c, predicted_actual)))
            self.fp.append(sum(map(lambda (p, a): p == c and a != c, predicted_actual)))
            self.fn.append(sum(map(lambda (p, a): p != c and a == c, predicted_actual)))
        self.n_classes = n_classes
        self.n_elem = len(predicted_actual)
        self.measures = {'F-measure': self.f_measure, 'Accuracy': self.accuracy,
                         'Precision': self.precision, 'Recall': self.recall}

    def accuracy(self, label=None):
        """
        Calculates the accuracy of the classification, as the rate of hits over the total
        of the *N* samples:

        :math:`ACC=\\dfrac{1}{N}\displaystyle\sum\limits_{i=1}^{N} \mathbb{1}(p_i = a_i)`

        :param label: int in {0, ..., C - 1} to indicate which class to evaluate (the rate of
            hits is then computed only over the samples of that class).
            If *None* then it evaluates over all classes.
        :return: float, varying from 0 (worst) to 1 (best).
        """
        if label is None:
            acc = sum(map(lambda (pre, act): pre == act, self.predicted_actual)) / float(self.n_elem)
        else:
            acc = sum(map(lambda (pre, act): pre == act == label, self.predicted_actual)) / \
                float(sum([act == label for _, act in self.predicted_actual]))
        return acc
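
    # Worked example with the doctest data above (predict=[0, 1, 0, 2, 2, 1], labels=[0, 1, 1, 2, 1, 0]):
    # the overall accuracy is hits / total = 3 / 6 = 0.5, while accuracy(label=1) only looks at the
    # three samples whose actual class is 1, of which one is predicted correctly: 1 / 3 ~ 0.33.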

    def precision(self, label=None, macro=True):
        """
        Calculates the precision of the classification, as the number of **true positives**
        (i.e. items correctly classified as the given class) divided by the total of elements
        classified as that class (i.e. the sum of true positives and **false positives**, the
        items incorrectly assigned to the given class). This is expressed in the following equation:

        :math:`P_i=\\dfrac{TP_i}{TP_i+FP_i}`

        Given the number of classes 'C', the equations for micro- and macro-averaging are:

        :math:`P_{\\mu}=\\dfrac{\sum_{i=0}^{C-1} TP_i}{\sum_{i=0}^{C-1} (TP_i+FP_i)}, \quad
        P_{M}=\\dfrac{1}{C}\displaystyle\sum\limits_{i=0}^{C-1} \\frac{TP_i}{TP_i+FP_i}`

        :param label: int in {0, ..., C - 1} to indicate which class to evaluate.
            If *None* then it evaluates over all classes.
        :param macro: bool, indicating how to average the value over all the classes
            (True for *macro* and False for *micro*).
        :return: float, varying from 0 (worst) to 1 (best).
        """
        if label is None:
            if macro is True:
                p = sum([self.precision(c) for c in xrange(self.n_classes)])
                p /= float(self.n_classes)
            else:
                p = sum(self.tp) / float(sum(map(lambda (tp, fp): tp + fp, zip(self.tp, self.fp))))
        else:
            if self.tp[label] == 0.0 and self.fp[label] == 0.0:
                # The class was never predicted (TP = FP = 0): define its precision as 1.0 by convention
                p = 1.0
            else:
                p = self.tp[label] / float(self.tp[label] + self.fp[label])
        return p

    def recall(self, label=None, macro=True):
        """
        Calculates the recall of the classification, as the number of **true positives**
        (i.e. items correctly classified as the given class) divided by the total of elements
        that actually belong to that class (i.e. the sum of true positives and **false negatives**,
        the items of the class that were not recognized as such). This is expressed in the following equation:

        :math:`R_i=\\dfrac{TP_i}{TP_i+FN_i}`

        Given the number of classes 'C', the equations for micro- and macro-averaging are:

        :math:`R_{\\mu}=\\dfrac{\sum_{i=0}^{C-1} TP_i}{\sum_{i=0}^{C-1} (TP_i+FN_i)}, \quad
        R_{M}=\\dfrac{1}{C}\displaystyle\sum\limits_{i=0}^{C-1} \\frac{TP_i}{TP_i+FN_i}`

        :param label: int in {0, ..., C - 1} to indicate which class to evaluate.
            If *None* then it evaluates over all classes.
        :param macro: bool, indicating how to average the value over all the classes
            (True for *macro* and False for *micro*).
        :return: float, varying from 0 (worst) to 1 (best).
        """
        if label is None:
            if macro is True:
                r = sum([self.recall(c) for c in xrange(self.n_classes)])
                r /= float(self.n_classes)
            else:
                r = sum(self.tp) / float(sum(map(lambda (tp, fn): tp + fn, zip(self.tp, self.fn))))
        else:
            if self.tp[label] == 0.0 and self.fn[label] == 0.0:
                # There are no actual items of this class (TP = FN = 0): define its recall as 1.0 by convention
                r = 1.0
            else:
                r = self.tp[label] / float(self.tp[label] + self.fn[label])
        return r
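
    # Worked example with the doctest data above: the per-class recalls are R_0 = 1/2, R_1 = 1/3 and
    # R_2 = 1/1, so macro-averaging gives (1/2 + 1/3 + 1) / 3 ~ 0.611 (the doctest value), whereas
    # micro-averaging pools the counts first: sum(TP) / sum(TP + FN) = 3 / 6 = 0.5.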

    def f_measure(self, beta=1, label=None, macro=True):
        """
        Calculates the *F-measure* of the classification, which combines *precision* and *recall*
        through their harmonic mean. The balance between both is tuned by a parameter :math:`\\beta`;
        a widely used case of this measure is the *F1-score*, where both measures are weighted
        equally with :math:`\\beta = 1`.

        :math:`F(\\beta)=(1+\\beta^2)(\\dfrac{PR}{\\beta^2 P + R}), \quad F_1=\\dfrac{2PR}{P + R}`

        Given the number of classes 'C', the equations for micro- and macro-averaging are:

        :math:`F_{\\mu}(\\beta)=(1+\\beta^2)(\\dfrac{P_{\\mu}R_{\\mu}}{\\beta^2 P_{\\mu} + R_{\\mu}}), \quad
        F_{M}(\\beta)=(1+\\beta^2)(\\dfrac{P_{M}R_{M}}{\\beta^2 P_{M} + R_{M}})`

        :param beta: float, parameter :math:`\\beta` which sets the balance between *precision* and *recall*.
            If :math:`\\beta < 1` then *precision* is favored, while :math:`\\beta > 1` favors *recall*.
        :param label: int in {0, ..., C - 1} to indicate which class to evaluate.
            If *None* then it evaluates over all classes.
        :param macro: bool, indicating how to average the value over all the classes
            (True for *macro* and False for *micro*).
        :return: float, varying from 0 (worst) to 1 (best).
        """
        ppv = self.precision(label, macro)
        tpr = self.recall(label, macro)
        if ppv == 0 and tpr == 0:
            f_score = 0.0
        else:
            f_score = (1 + beta*beta)*(ppv * tpr) / (beta*beta*ppv + tpr)
        return f_score
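
    # Worked example with the doctest data above (macro P = 0.5, macro R ~ 0.611): the default
    # F1 is 2*P*R / (P + R) ~ 0.55, while f_measure(beta=2) weights recall more heavily:
    # 5*P*R / (4*P + R) ~ 0.585. These figures are approximate, for illustration only.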

    def confusion_matrix(self):
        """
        Resulting confusion matrix, where each row corresponds to an *actual* class and each
        column to a *predicted* class, both in ascending order of label.

        :return: numpy.ndarray
        """
        conf_mat = []
        for r in xrange(self.n_classes):
            # Samples whose actual class is 'r', counted below by predicted class 'c'
            pre_act = filter(lambda (p, a): a == r, self.predicted_actual)
            for c in xrange(self.n_classes):
                conf_mat.append(sum(map(lambda (p, a): p == c, pre_act)))
        return np.array(conf_mat).reshape((self.n_classes, self.n_classes))
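
    # For the doctest data above, the row for actual class 1 reads [1, 1, 1]: of the three samples
    # whose actual class is 1, one was predicted as 0, one as 1 and one as 2.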

    def evaluate(self, measure='F-measure', **kwargs):
        """
        Applies one of the implemented metrics, which are registered in the dict *self.measures*.
        This function becomes useful as a wrapper to easily test every desired metric.

        :param measure: string, key of an implemented metric.
        :param kwargs: dict, which can include other parameters for the metric to be used
            (e.g. *beta* for the *F-measure*, or *macro* for those that support it).
        :return: float
        """
        assert measure in self.measures, "Measure %s doesn't belong to the supported ones: %s" \
                                         % (measure, self.measures.keys())
        return self.measures[measure](**kwargs)


class RegressionMetrics(object):
    """
    Metrics to evaluate the performance of a model in terms of regression.

    :param predicted_actual: list of tuples (predicted, actual)

    >>> predict = [0.5, 1.1, 1.5, 2.0, 3.5, 5.2]
    >>> labels = [0.5, 1.0, 2.0, 3.0, 4.0, 5.0]
    >>> metrics = RegressionMetrics(zip(predict, labels))
    >>> metrics.measures.keys()
    ['ExplVar', 'MSE', 'MAE', 'R2', 'RMSE']
    >>> metrics.mae()
    0.38333333333333336
    >>> metrics.mse()
    0.25833333333333336
    >>> metrics.evaluate('RMSE')
    0.50826502273256358
    >>> metrics.r2()
    0.8980821917808219
    >>> metrics.explained_variance()
    0.9297534246575342
    """
    def __init__(self, predicted_actual):
        self.predicted_actual = predicted_actual
        self.n_elem = len(predicted_actual)
        # Error of each prediction, as actual - predicted
        self.error = map(lambda (p, a): a - p, self.predicted_actual)
        self.measures = {'MSE': self.mse, 'RMSE': self.rmse, 'MAE': self.mae,
                         'R2': self.r2, 'ExplVar': self.explained_variance}

    def mse(self):
        """
        Calculates the *Mean Squared Error* (MSE), defined as the mean of the squared differences
        between the actual value and the predicted one over the *N* samples:

        :math:`MSE=\\dfrac{1}{N}\displaystyle\sum\limits_{i}^N (p_i - a_i)^2`

        :return: float, varying from 0 (best) to inf (worst).
        """
        return np.mean(np.square(self.error))
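
    # Worked example with the doctest data above: the errors are [0, -0.1, 0.5, 1.0, 0.5, -0.2],
    # so MSE = (0 + 0.01 + 0.25 + 1.0 + 0.25 + 0.04) / 6 ~ 0.2583 (the doctest value).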

    def rmse(self):
        """
        Returns the square root of the MSE, which is useful since it is expressed in the same
        units as the target variable, making it easier to interpret when comparing models.

        :math:`RMSE=\\sqrt{MSE}`

        :return: float, varying from 0 (best) to inf (worst).
        """
        return np.sqrt(self.mse())

    def mae(self):
        """
        Calculates the *Mean Absolute Error* (MAE), defined as the mean of the absolute differences
        between the actual value and the predicted one over the *N* samples:

        :math:`MAE=\\dfrac{1}{N}\displaystyle\sum\limits_{i}^N |p_i - a_i|`

        :return: float, varying from 0 (best) to inf (worst).
        """
        return np.mean(np.abs(self.error))

    def rmae(self):
        """
        Returns the square root of the MAE, by analogy with the RMSE.
        Note that this measure is not registered in *self.measures*.

        :math:`RMAE=\\sqrt{MAE}`

        :return: float, varying from 0 (best) to inf (worst).
        """
        return np.sqrt(self.mae())

    def r2(self):
        """
        Calculates the coefficient of determination or R^2, which indicates the proportion of the
        variance of the *actual* values that is explained by the *predicted* values.
        See more info on Wikipedia: `Coefficient of determination
        <https://en.wikipedia.org/wiki/Coefficient_of_determination>`_.

        :return: float, varying from 0 (worst) to 1 (best). It can even be negative for a model
            that performs worse than always predicting the mean of the actual values.
        """
        mean_actual = np.mean(map(lambda (p, a): a, self.predicted_actual))
        ssres = np.sum(np.square(self.error))
        sstot = np.sum(np.square(map(lambda (p, a): a - mean_actual, self.predicted_actual)))
        return 1 - float(ssres / sstot)

    def explained_variance(self):
        """
        Calculates the proportion of the variance of the actual values that is explained
        by the prediction, such that:

        :math:`ExpVar=1-\\dfrac{Var(actual - predicted)}{Var(actual)}`

        :return: float, varying from 0 (worst) to 1 (best).
        """
        var_error = np.var(self.error)
        var_actual = np.var(map(lambda (p, a): a, self.predicted_actual))
        return 1 - float(var_error / var_actual)
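
    # For the doctest data above, np.var(error) ~ 0.178 and np.var(actual) ~ 2.535, giving
    # ExplVar ~ 0.930. It is slightly higher than R2 (~0.898) because the errors have a
    # non-zero mean (~0.283) that R2 penalizes but the variance of the residuals does not.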

    def evaluate(self, measure='R2'):
        """
        Applies one of the implemented metrics, which are registered in the dict *self.measures*.
        This function becomes useful as a wrapper to easily test every desired metric.

        :param measure: string, key of an implemented metric.
        :return: float
        """
        assert measure in self.measures, "Measure %s doesn't belong to the supported ones: %s" \
                                         % (measure, self.measures.keys())
        return self.measures[measure]()
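

# Minimal usage sketch: running this file as a script prints every registered measure for the
# toy classification and regression examples used in the doctests above.
if __name__ == '__main__':
    # Classification example
    predict = [0, 1, 0, 2, 2, 1]
    labels = [0, 1, 1, 2, 1, 0]
    clf_metrics = ClassificationMetrics(zip(predict, labels), n_classes=3)
    for name in clf_metrics.measures:
        print "%s: %s" % (name, clf_metrics.evaluate(name))
    print clf_metrics.confusion_matrix()

    # Regression example
    predict = [0.5, 1.1, 1.5, 2.0, 3.5, 5.2]
    labels = [0.5, 1.0, 2.0, 3.0, 4.0, 5.0]
    reg_metrics = RegressionMetrics(zip(predict, labels))
    for name in reg_metrics.measures:
        print "%s: %s" % (name, reg_metrics.evaluate(name))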