@Kingkha
Created November 20, 2019 08:38
Calculate roc_auc_score from two NumPy ndarrays
# Source: http://ethen8181.github.io/machine-learning/model_selection/auc/auc.html#Implementation
import numpy as np


def _binary_clf_curve(y_true, y_score):
    """
    Calculate true and false positives per binary classification
    threshold (can be used for the roc curve or the precision/recall
    curve); the calculation assumes that the positive case
    will always be labeled as 1

    Parameters
    ----------
    y_true : 1d ndarray, shape = [n_samples]
        True targets/labels of binary classification

    y_score : 1d ndarray, shape = [n_samples]
        Estimated probabilities or scores

    Returns
    -------
    tps : 1d ndarray
        True positive counts; index i records the number
        of positive samples that got assigned a
        score >= thresholds[i].
        The total number of positive samples is equal to
        tps[-1] (thus false negatives are given by tps[-1] - tps)

    fps : 1d ndarray
        False positive counts; index i records the number
        of negative samples that got assigned a
        score >= thresholds[i].
        The total number of negative samples is equal to
        fps[-1] (thus true negatives are given by fps[-1] - fps)

    thresholds : 1d ndarray
        Predicted scores sorted in decreasing order

    References
    ----------
    Github: scikit-learn _binary_clf_curve
    - https://github.com/scikit-learn/scikit-learn/blob/ab93d65/sklearn/metrics/ranking.py#L263
    """
    # sort predicted scores in descending order
    # and also reorder the corresponding truth values
    desc_score_indices = np.argsort(y_score)[::-1]
    y_score = y_score[desc_score_indices]
    y_true = y_true[desc_score_indices]

    # y_score typically consists of tied values. Here we extract
    # the indices associated with the distinct values. We also
    # concatenate a value for the end of the curve
    distinct_indices = np.where(np.diff(y_score))[0]
    end = np.array([y_true.size - 1])
    threshold_indices = np.hstack((distinct_indices, end))

    thresholds = y_score[threshold_indices]
    tps = np.cumsum(y_true)[threshold_indices]

    # (1 + threshold_indices) = the total number of samples with a
    # score >= thresholds[i]; that total minus the true positives
    # gives the false positives
    fps = (1 + threshold_indices) - tps
    return tps, fps, thresholds
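
# Worked example (added for illustration; values verified by hand):
# for y_true = [0, 0, 1, 1] and y_score = [0.1, 0.4, 0.35, 0.8],
# sorting by descending score reorders the labels to [1, 0, 1, 0], every
# score is distinct, and the function returns
#     tps        = [1, 1, 2, 2]
#     fps        = [0, 1, 1, 2]
#     thresholds = [0.8, 0.4, 0.35, 0.1]
# e.g. at thresholds[2] = 0.35, two positives and one negative
# have a score >= 0.35.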
def _roc_auc_score(y_true, y_score):
    """
    Compute Area Under the Curve (AUC) from prediction scores

    Parameters
    ----------
    y_true : 1d ndarray, shape = [n_samples]
        True targets/labels of binary classification

    y_score : 1d ndarray, shape = [n_samples]
        Estimated probabilities or scores

    Returns
    -------
    auc : float
    """
    # ensure the target is binary
    if np.unique(y_true).size != 2:
        raise ValueError('Only two classes should be present in y_true. ROC AUC score '
                         'is not defined in that case.')

    tps, fps, _ = _binary_clf_curve(y_true, y_score)

    # convert counts to rates
    tpr = tps / tps[-1]
    fpr = fps / fps[-1]

    # compute AUC using the trapezoidal rule; each trapezoid's area is
    # fpr_diff[i] * tpr[i] + fpr_diff[i] * tpr_diff[i] / 2, and
    # appending an extra 0 is just to ensure the lengths match
    zero = np.array([0])
    tpr_diff = np.hstack((np.diff(tpr), zero))
    fpr_diff = np.hstack((np.diff(fpr), zero))
    auc = np.dot(tpr, fpr_diff) + np.dot(tpr_diff, fpr_diff) / 2
    return auc
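
# Optional sanity checks (a sketch added here, not part of the original
# gist; assumes numpy and scikit-learn are installed). The first check
# compares against sklearn.metrics.roc_auc_score, which should agree to
# numerical precision; the second brute-forces the probabilistic
# interpretation of AUC: the probability that a randomly drawn positive
# outranks a randomly drawn negative, counting ties as one half.
def _sanity_check(n_samples=200, seed=0):
    from sklearn.metrics import roc_auc_score

    rng = np.random.RandomState(seed)
    y_true = rng.randint(0, 2, size=n_samples)
    y_score = rng.rand(n_samples)

    # agreement with the scikit-learn reference implementation
    assert np.isclose(_roc_auc_score(y_true, y_score),
                      roc_auc_score(y_true, y_score))

    # pairwise positive-vs-negative comparison (Mann-Whitney statistic)
    pos = y_score[y_true == 1]
    neg = y_score[y_true == 0]
    wins = (pos[:, None] > neg[None, :]).mean()
    ties = (pos[:, None] == neg[None, :]).mean()
    assert np.isclose(_roc_auc_score(y_true, y_score), wins + 0.5 * ties)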
if __name__ == '__main__':
    y_true = np.array([0, 0, 1, 1])
    y_scores = np.array([0.1, 0.4, 0.35, 0.8])
    print(_roc_auc_score(y_true, y_scores))
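
# Running this file prints 0.75, matching sklearn.metrics.roc_auc_score on
# the same inputs (see the worked example above). The optional checks can
# be run with _sanity_check().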