@Kingkha
Created November 20, 2019 08:38
Calculate roc_auc_score from two NumPy ndarrays
# Source: http://ethen8181.github.io/machine-learning/model_selection/auc/auc.html#Implementation
import numpy as np


def _binary_clf_curve(y_true, y_score):
    """
    Calculate true and false positives per binary classification
    threshold (can be used for the roc curve or the precision/recall
    curve); the calculation assumes that the positive case
    will always be labeled as 1

    Parameters
    ----------
    y_true : 1d ndarray, shape = [n_samples]
        True targets/labels of binary classification

    y_score : 1d ndarray, shape = [n_samples]
        Estimated probabilities or scores

    Returns
    -------
    tps : 1d ndarray
        True positive counts; index i records the number
        of positive samples that got assigned a
        score >= thresholds[i].
        The total number of positive samples is equal to
        tps[-1] (thus false negatives are given by tps[-1] - tps)

    fps : 1d ndarray
        False positive counts; index i records the number
        of negative samples that got assigned a
        score >= thresholds[i].
        The total number of negative samples is equal to
        fps[-1] (thus true negatives are given by fps[-1] - fps)

    thresholds : 1d ndarray
        Predicted scores sorted in decreasing order

    References
    ----------
    Github: scikit-learn _binary_clf_curve
    - https://github.com/scikit-learn/scikit-learn/blob/ab93d65/sklearn/metrics/ranking.py#L263
    """
    # sort predicted scores in descending order
    # and also reorder the corresponding truth values
    desc_score_indices = np.argsort(y_score)[::-1]
    y_score = y_score[desc_score_indices]
    y_true = y_true[desc_score_indices]

    # y_score typically consists of tied values. Here we extract
    # the indices associated with the distinct values. We also
    # concatenate a value for the end of the curve
    distinct_indices = np.where(np.diff(y_score))[0]
    end = np.array([y_true.size - 1])
    threshold_indices = np.hstack((distinct_indices, end))

    thresholds = y_score[threshold_indices]
    tps = np.cumsum(y_true)[threshold_indices]

    # (1 + threshold_indices) = the total number of samples with a
    # score >= thresholds[i]; that total minus the true positives
    # gives the false positives
    fps = (1 + threshold_indices) - tps
    return tps, fps, thresholds
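
# Worked example (added for illustration; values verified by hand):
# for y_true = [0, 0, 1, 1] and y_score = [0.1, 0.4, 0.35, 0.8],
# sorting by descending score reorders the labels to [1, 0, 1, 0], every
# score is distinct, and the function returns
#     tps        = [1, 1, 2, 2]
#     fps        = [0, 1, 1, 2]
#     thresholds = [0.8, 0.4, 0.35, 0.1]
# e.g. at thresholds[2] = 0.35, two positives and one negative
# have a score >= 0.35.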
def _roc_auc_score(y_true, y_score):
    """
    Compute Area Under the Curve (AUC) from prediction scores

    Parameters
    ----------
    y_true : 1d ndarray, shape = [n_samples]
        True targets/labels of binary classification

    y_score : 1d ndarray, shape = [n_samples]
        Estimated probabilities or scores

    Returns
    -------
    auc : float
    """
    # ensure the target is binary
    if np.unique(y_true).size != 2:
        raise ValueError('Only two classes should be present in y_true. ROC AUC score '
                         'is not defined in that case.')

    tps, fps, _ = _binary_clf_curve(y_true, y_score)

    # convert counts to rates
    tpr = tps / tps[-1]
    fpr = fps / fps[-1]

    # compute AUC using the trapezoidal rule; each trapezoid's area is
    # fpr_diff[i] * tpr[i] + fpr_diff[i] * tpr_diff[i] / 2, and
    # appending an extra 0 is just to ensure the lengths match
    zero = np.array([0])
    tpr_diff = np.hstack((np.diff(tpr), zero))
    fpr_diff = np.hstack((np.diff(fpr), zero))
    auc = np.dot(tpr, fpr_diff) + np.dot(tpr_diff, fpr_diff) / 2
    return auc
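
# Optional sanity checks (a sketch added here, not part of the original
# gist; assumes numpy and scikit-learn are installed). The first check
# compares against sklearn.metrics.roc_auc_score, which should agree to
# numerical precision; the second brute-forces the probabilistic
# interpretation of AUC: the probability that a randomly drawn positive
# outranks a randomly drawn negative, counting ties as one half.
def _sanity_check(n_samples=200, seed=0):
    from sklearn.metrics import roc_auc_score

    rng = np.random.RandomState(seed)
    y_true = rng.randint(0, 2, size=n_samples)
    y_score = rng.rand(n_samples)

    # agreement with the scikit-learn reference implementation
    assert np.isclose(_roc_auc_score(y_true, y_score),
                      roc_auc_score(y_true, y_score))

    # pairwise positive-vs-negative comparison (Mann-Whitney statistic)
    pos = y_score[y_true == 1]
    neg = y_score[y_true == 0]
    wins = (pos[:, None] > neg[None, :]).mean()
    ties = (pos[:, None] == neg[None, :]).mean()
    assert np.isclose(_roc_auc_score(y_true, y_score), wins + 0.5 * ties)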
if __name__ == '__main__':
    y_true = np.array([0, 0, 1, 1])
    y_scores = np.array([0.1, 0.4, 0.35, 0.8])
    print(_roc_auc_score(y_true, y_scores))
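
# Running this file prints 0.75, matching sklearn.metrics.roc_auc_score on
# the same inputs (see the worked example above). The optional checks can
# be run with _sanity_check().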