Standard metrics for both classification and regression problems in supervised learning
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Metrics for measuring the performance of a model in a given dataset,
in both kinds of problems: *classification* and *regression*.
Implementation extracted from: https://github.com/leferrad/learninspy/blob/master/learninspy/utils/evaluation.py
NOTE: This code only works with Python 2.7.x (it relies on tuple parameter unpacking in lambdas, which was removed in Python 3).
"""
__author__ = 'leferrad'
import numpy as np
class ClassificationMetrics(object):
"""
Metrics to evaluate the performance of a model in terms of classification.
Based on the list of metrics presented in the publication by Sokolova et al. [sokolova2009systematic]_.
:param predicted_actual: list of tuples (predicted, actual)
:param n_classes: int, number of classes handled in the classification task
>>> predict = [0, 1, 0, 2, 2, 1]
>>> labels = [0, 1, 1, 2, 1, 0]
>>> metrics = ClassificationMetrics(zip(predict, labels), 3)
>>> metrics.measures.keys()
['Recall', 'F-measure', 'Precision', 'Accuracy']
>>> metrics.accuracy()
0.5
>>> metrics.f_measure()
0.5499999999999999
>>> metrics.precision()
0.5
>>> metrics.evaluate('Recall')
0.611111111111111
>>> metrics.confusion_matrix()
array([[1, 1, 0],
       [1, 1, 1],
       [0, 0, 1]])
**References**:
.. [sokolova2009systematic] Sokolova, M., & Lapalme, G. (2009).
A systematic analysis of performance measures for classification tasks.
Information Processing & Management, 45(4), 427-437.
"""
# See http://machine-learning.tumblr.com/post/1209400132/mathematical-definitions-for-precisionrecall-for
# See http://rali.iro.umontreal.ca/rali/sites/default/files/publis/SokolovaLapalme-JIPM09.pdf
def __init__(self, predicted_actual, n_classes):
self.predicted_actual = predicted_actual
self.tp = []
self.fp = []
self.fn = []
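# Count true positives, false positives and false negatives per class (one-vs-rest counting).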
for c in xrange(n_classes):
self.tp.append(sum(map(lambda (p, a): p == c and a == c, predicted_actual)))
self.fp.append(sum(map(lambda (p, a): p == c and a != c, predicted_actual)))
self.fn.append(sum(map(lambda (p, a): p != c and a == c, predicted_actual)))
self.n_classes = n_classes
self.n_elem = len(predicted_actual)
self.measures = {'F-measure': self.f_measure, 'Accuracy': self.accuracy,
'Precision': self.precision, 'Recall': self.recall}
def accuracy(self, label=None):
"""
Calculates the accuracy of the classification, as the rate of hits over the total number of samples.
Given the *N* pairs of (predicted, actual) values, the formula for this value is:
:math:`ACC=\\dfrac{1}{N}\displaystyle\sum\limits_{i=1}^{N} [p_i = a_i]`
When a *label* is given, only the samples whose actual class is *label* are considered.
:param label: int in {0, C - 1} to indicate which class to evaluate. If *None* then it evaluates over all classes.
:return: float, varying from 0 (worst) to 1 (best).
"""
if label is None:
acc = sum(map(lambda (pre, act): pre == act, self.predicted_actual)) / float(self.n_elem)
else:
acc = sum(map(lambda (pre, act): pre == act == label, self.predicted_actual)) / \
float(sum([act == label for _, act in self.predicted_actual]))
return acc
def precision(self, label=None, macro=True):
"""
Calculates the precision of the classification, as the number of **true positives**
(i.e. items correctly assigned to the given class) divided by the total number of elements
classified as that class (i.e. the sum of true positives and **false positives**, which are the
items incorrectly classified as the given class). This is expressed in the following equation:
:math:`P_i=\\dfrac{TP_i}{TP_i+FP_i}`
Given the number of classes 'C', the equations for micro- and macro-averaging are:
:math:`P_{\\mu}=\\dfrac{\sum_{i=0}^{C-1} TP_i}{\sum_{i=0}^{C-1} (TP_i+FP_i)}, \quad
P_{M}=\\dfrac{1}{C}\displaystyle\sum\limits_{i=0}^{C-1} \\frac{TP_i}{TP_i+FP_i}`
:param label: int in {0, C - 1} to indicate which class to evaluate. If *None* then it evaluates over all classes.
:param macro: bool that indicates how to compute this value over all the classes
(True for *macro* averaging and False for *micro* averaging).
:return: float, varying from 0 (worst) to 1 (best).
"""
if label is None:
if macro is True:
p = sum([self.precision(c) for c in xrange(self.n_classes)])
p /= float(self.n_classes)
else:
p = sum(self.tp) / float(sum(map(lambda (tp, fp): tp + fp, zip(self.tp, self.fp))))
else:
if self.tp[label] == 0.0 and self.fp[label] == 0.0:
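# Convention adopted here: precision is 1.0 when the class was never predicted (no TP and no FP).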
p = 1.0
else:
p = self.tp[label] / float(self.tp[label] + self.fp[label])
return p
def recall(self, label=None, macro=True):
"""
Calculates the recall of the classification, as the number of **true positives**
(i.e. items correctly assigned to the given class) divided by the total number of elements that
actually belong to that class (i.e. the sum of true positives and **false negatives**, which are the
items of that class incorrectly assigned to other classes). This is expressed in the following equation:
:math:`R_i=\\dfrac{TP_i}{TP_i+FN_i}`
Given the number of classes 'C', the equations for micro- and macro-averaging are:
:math:`R_{\\mu}=\\dfrac{\sum_{i=0}^{C-1} TP_i}{\sum_{i=0}^{C-1} (TP_i+FN_i)}, \quad
R_{M}=\\dfrac{1}{C}\displaystyle\sum\limits_{i=0}^{C-1} \\frac{TP_i}{TP_i+FN_i}`
:param label: int in {0, C - 1} to indicate which class to evaluate. If *None* then it evaluates over all classes.
:param macro: bool that indicates how to compute this value over all the classes
(True for *macro* averaging and False for *micro* averaging).
:return: float, varying from 0 (worst) to 1 (best).
"""
if label is None:
if macro is True:
r = sum([self.recall(c) for c in xrange(self.n_classes)])
r /= float(self.n_classes)
else:
r = sum(self.tp) / float(sum(map(lambda (tp, fn): tp + fn, zip(self.tp, self.fn))))
else:
if self.tp[label] == 0.0 and self.fn[label] == 0.0:
r = 1.0
else:
r = self.tp[label] / float(self.tp[label] + self.fn[label])
return r
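# Illustration with the class doctest data: micro-averaged recall is sum(TP)/sum(TP+FN) = 3/6 = 0.5,
# while macro-averaged recall is (1/2 + 1/3 + 1)/3 ~= 0.6111, since macro gives each class equal weight.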
def f_measure(self, beta=1, label=None, macro=True):
"""
Calculates the *F-measure* of the classification, which combines the *precision* and *recall* measures
through a harmonic mean of both. The balance between them is adjusted by a parameter :math:`\\beta`, and a widely
used case of this measure is the *F1-score*, where both measures are weighted equally with :math:`\\beta = 1`.
:math:`F(\\beta)=(1+\\beta^2)(\\dfrac{PR}{\\beta^2 P + R}), \quad F_1=\\dfrac{2PR}{P + R}`
Given the number of classes 'C', the equations for micro- and macro-averaging are:
:math:`F_{\\mu}(\\beta)=(1+\\beta^2)(\\dfrac{P_{\\mu}R_{\\mu}}{\\beta^2 P_{\\mu} + R_{\\mu}}), \quad
F_{M}(\\beta)=(1+\\beta^2)(\\dfrac{P_{M}R_{M}}{\\beta^2 P_{M} + R_{M}})`
:param beta: float, parameter :math:`\\beta` which controls the balance between *precision* and *recall*.
If :math:`\\beta < 1` then *precision* is weighted more heavily, and with :math:`\\beta > 1` *recall* is favored.
:param label: int in {0, C - 1} to indicate which class to evaluate. If *None* then it evaluates over all classes.
:param macro: bool that indicates how to compute this value over all the classes
(True for *macro* averaging and False for *micro* averaging).
:return: float, varying from 0 (worst) to 1 (best).
"""
ppv = self.precision(label, macro)
tpr = self.recall(label, macro)
if ppv == 0 and tpr == 0:
f_score = 0.0
else:
f_score = (1 + beta*beta)*(ppv * tpr) / (beta*beta*ppv + tpr)
return f_score
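# Worked example with the class doctest data: macro P = 0.5 and macro R ~= 0.6111, so
# F1 = 2*P*R/(P + R) = 0.6111/1.1111 ~= 0.55, which matches the doctest value.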
def confusion_matrix(self):
"""
Resulting confusion matrix, where each row corresponds to an *actual* class and each column to a
*predicted* class, both ordered by class index in ascending order.
:return: numpy.ndarray
"""
conf_mat = []
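# Row r gathers the samples whose actual class is r; column c counts how many of them were predicted as class c.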
for r in xrange(self.n_classes):
pre_act = filter(lambda (p, a): a == r, self.predicted_actual)
for c in xrange(self.n_classes):
conf_mat.append(sum(map(lambda (p, a): p == c, pre_act)))
return np.array(conf_mat).reshape((self.n_classes, self.n_classes))
def evaluate(self, measure='F-measure', **kwargs):
"""
Applies one of the implemented metrics, which are registered in the dict *self.measures*.
This function is useful as a wrapper to easily evaluate any desired metric.
:param measure: string, key of an implemented metric.
:param kwargs: dict, it can include some other parameters for the metric to be used
(e.g. *beta* for *F-measure*, or *micro / macro* for those ones that support it).
:return: float
"""
assert measure in self.measures, ("Measure %s doesn't belong to the supported ones: %s"
                                  % (measure, self.measures.keys()))
return self.measures[measure](**kwargs)
class RegressionMetrics(object):
"""
Metrics to evaluate the performance of a model in terms of regression.
:param predicted_actual: list of tuples (predicted, actual)
>>> predict = [0.5, 1.1, 1.5, 2.0, 3.5, 5.2]
>>> labels = [0.5, 1.0, 2.0, 3.0, 4.0, 5.0]
>>> metrics = RegressionMetrics(zip(predict, labels))
>>> metrics.measures.keys()
['ExplVar', 'MSE', 'MAE', 'R2', 'RMSE']
>>> metrics.mae()  # doctest: +ELLIPSIS
0.3833333333333...
>>> metrics.mse()
0.25833333333333336
>>> metrics.evaluate('RMSE')
0.50826502273256358
>>> metrics.r2()
0.8980821917808219
>>> metrics.explained_variance()
0.9297534246575342
"""
def __init__(self, predicted_actual):
self.predicted_actual = predicted_actual
self.n_elem = len(predicted_actual)
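# Signed error per sample, defined as actual minus predicted.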
self.error = map(lambda (p, a): a - p, self.predicted_actual)
self.measures = {'MSE': self.mse, 'RMSE': self.rmse, 'MAE': self.mae,
'R2': self.r2, 'ExplVar': self.explained_variance}
def mse(self):
"""
Calculates the *Mean Squared Error* (MSE), defined as the average of the squared differences
between the actual value and the predicted one over the *N* samples:
:math:`MSE=\\dfrac{1}{N}\displaystyle\sum\limits_{i}^N (p_i - a_i)^2`
:return: float, varying from 0 (best) to inf (worst).
"""
return np.mean(np.square(self.error))
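# Illustration with the class doctest data: the squared errors are [0.0, 0.01, 0.25, 1.0, 0.25, 0.04],
# whose mean is ~0.2583, matching the doctest value.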
def rmse(self):
"""
Returns the square root of the MSE, which expresses the error in the same units as the target
variable and thus makes it easier to compare performance among several models:
:math:`RMSE=\\sqrt{MSE}`
:return: float, varying from 0 (best) to inf (worst).
"""
return np.sqrt(self.mse())
def mae(self):
"""
Calculates the *Mean Absolute Error* (MAE), defined as the average of the absolute differences between
the actual value and the predicted one over the *N* samples:
:math:`MAE=\\dfrac{1}{N}\displaystyle\sum\limits_{i}^N |p_i - a_i|`
:return: float, varying from 0 (best) to inf (worst).
"""
return np.mean(np.abs(self.error))
def rmae(self):
"""
Returns the square root of the MAE, analogous to how the RMSE relates to the MSE, which can be
useful when comparing performance among several models:
:math:`RMAE=\\sqrt{MAE}`
:return: float, varying from 0 (best) to inf (worst).
"""
return np.sqrt(self.mae())
def r2(self):
"""
Calculates the coefficient of determination or R^2, which indicates the proportion of the
variance of the *actual* values that is explained by the *predicted* values.
See more info in Wikipedia: `Coefficient of determination
<https://en.wikipedia.org/wiki/Coefficient_of_determination>`_.
:return: float, varying from 0 (worst) to 1 (best).
"""
mean_actual = np.mean(map(lambda (p, a): a, self.predicted_actual))
ssres = np.sum(np.square(self.error))
sstot = np.sum(np.square(map(lambda (p, a): a - mean_actual, self.predicted_actual)))
return 1 - float(ssres / sstot)
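# Note: r2() coincides with explained_variance() only when the errors have zero mean; with biased
# predictions the two differ, as in the class doctest (~0.898 vs ~0.930).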
def explained_variance(self):
"""
Calculates the proportion of the variance of the *actual* values that is explained by the prediction, such that:
:math:`ExpVar=1-\\dfrac{Var(actual - predicted)}{Var(actual)}`
:return: float, varying from 0 (worst) to 1 (best).
"""
var_error = np.var(self.error)
var_actual = np.var(map(lambda (p, a): a, self.predicted_actual))
return 1 - float(var_error / var_actual)
def evaluate(self, measure='R2'):
"""
Applies one of the implemented metrics, which are registered in the dict *self.measures*.
This function is useful as a wrapper to easily evaluate any desired metric.
:param measure: string, key of an implemented metric.
:return: float
"""
assert measure in self.measures, ("Measure %s doesn't belong to the supported ones: %s"
                                  % (measure, self.measures.keys()))
return self.measures[measure]()
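# --- Usage sketch ---
# A minimal, hypothetical demo (not part of the original gist) showing how both metric classes
# could be exercised together under Python 2.7, reusing the values from the doctests above.
if __name__ == '__main__':
    # Classification example
    predict_cls = [0, 1, 0, 2, 2, 1]
    labels_cls = [0, 1, 1, 2, 1, 0]
    clf_metrics = ClassificationMetrics(zip(predict_cls, labels_cls), n_classes=3)
    print("Accuracy: %.4f" % clf_metrics.accuracy())
    print("Macro F1: %.4f" % clf_metrics.f_measure(beta=1))
    print("Confusion matrix:")
    print(clf_metrics.confusion_matrix())

    # Regression example
    predict_reg = [0.5, 1.1, 1.5, 2.0, 3.5, 5.2]
    labels_reg = [0.5, 1.0, 2.0, 3.0, 4.0, 5.0]
    reg_metrics = RegressionMetrics(zip(predict_reg, labels_reg))
    for name in sorted(reg_metrics.measures):
        print("%s: %.4f" % (name, reg_metrics.evaluate(name)))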