Instantly share code, notes, and snippets.

# mblondel/letor_metrics.py

Last active Jul 30, 2020
Learning to rank metrics.
 # (C) Mathieu Blondel, November 2013 # License: BSD 3 clause import numpy as np def ranking_precision_score(y_true, y_score, k=10): """Precision at rank k Parameters ---------- y_true : array-like, shape = [n_samples] Ground truth (true relevance labels). y_score : array-like, shape = [n_samples] Predicted scores. k : int Rank. Returns ------- precision @k : float """ unique_y = np.unique(y_true) if len(unique_y) > 2: raise ValueError("Only supported for two relevance levels.") pos_label = unique_y n_pos = np.sum(y_true == pos_label) order = np.argsort(y_score)[::-1] y_true = np.take(y_true, order[:k]) n_relevant = np.sum(y_true == pos_label) # Divide by min(n_pos, k) such that the best achievable score is always 1.0. return float(n_relevant) / min(n_pos, k) def average_precision_score(y_true, y_score, k=10): """Average precision at rank k Parameters ---------- y_true : array-like, shape = [n_samples] Ground truth (true relevance labels). y_score : array-like, shape = [n_samples] Predicted scores. k : int Rank. Returns ------- average precision @k : float """ unique_y = np.unique(y_true) if len(unique_y) > 2: raise ValueError("Only supported for two relevance levels.") pos_label = unique_y n_pos = np.sum(y_true == pos_label) order = np.argsort(y_score)[::-1][:min(n_pos, k)] y_true = np.asarray(y_true)[order] score = 0 for i in xrange(len(y_true)): if y_true[i] == pos_label: # Compute precision up to document i # i.e, percentage of relevant documents up to document i. prec = 0 for j in xrange(0, i + 1): if y_true[j] == pos_label: prec += 1.0 prec /= (i + 1.0) score += prec if n_pos == 0: return 0 return score / n_pos def dcg_score(y_true, y_score, k=10, gains="exponential"): """Discounted cumulative gain (DCG) at rank k Parameters ---------- y_true : array-like, shape = [n_samples] Ground truth (true relevance labels). y_score : array-like, shape = [n_samples] Predicted scores. k : int Rank. gains : str Whether gains should be "exponential" (default) or "linear". 
Returns ------- DCG @k : float """ order = np.argsort(y_score)[::-1] y_true = np.take(y_true, order[:k]) if gains == "exponential": gains = 2 ** y_true - 1 elif gains == "linear": gains = y_true else: raise ValueError("Invalid gains option.") # highest rank is 1 so +2 instead of +1 discounts = np.log2(np.arange(len(y_true)) + 2) return np.sum(gains / discounts) def ndcg_score(y_true, y_score, k=10, gains="exponential"): """Normalized discounted cumulative gain (NDCG) at rank k Parameters ---------- y_true : array-like, shape = [n_samples] Ground truth (true relevance labels). y_score : array-like, shape = [n_samples] Predicted scores. k : int Rank. gains : str Whether gains should be "exponential" (default) or "linear". Returns ------- NDCG @k : float """ best = dcg_score(y_true, y_true, k, gains) actual = dcg_score(y_true, y_score, k, gains) return actual / best # Alternative API. def dcg_from_ranking(y_true, ranking): """Discounted cumulative gain (DCG) at rank k Parameters ---------- y_true : array-like, shape = [n_samples] Ground truth (true relevance labels). ranking : array-like, shape = [k] Document indices, i.e., ranking is the index of top-ranked document, ranking is the index of second-ranked document, ... k : int Rank. Returns ------- DCG @k : float """ y_true = np.asarray(y_true) ranking = np.asarray(ranking) rel = y_true[ranking] gains = 2 ** rel - 1 discounts = np.log2(np.arange(len(ranking)) + 2) return np.sum(gains / discounts) def ndcg_from_ranking(y_true, ranking): """Normalized discounted cumulative gain (NDCG) at rank k Parameters ---------- y_true : array-like, shape = [n_samples] Ground truth (true relevance labels). ranking : array-like, shape = [k] Document indices, i.e., ranking is the index of top-ranked document, ranking is the index of second-ranked document, ... k : int Rank. 
Returns ------- NDCG @k : float """ k = len(ranking) best_ranking = np.argsort(y_true)[::-1] best = dcg_from_ranking(y_true, best_ranking[:k]) return dcg_from_ranking(y_true, ranking) / best if __name__ == '__main__': # Check that some rankings are better than others assert dcg_score([5, 3, 2], [2, 1, 0]) > dcg_score([4, 3, 2], [2, 1, 0]) assert dcg_score([4, 3, 2], [2, 1, 0]) > dcg_score([1, 3, 2], [2, 1, 0]) assert dcg_score([5, 3, 2], [2, 1, 0], k=2) > dcg_score([4, 3, 2], [2, 1, 0], k=2) assert dcg_score([4, 3, 2], [2, 1, 0], k=2) > dcg_score([1, 3, 2], [2, 1, 0], k=2) # Perfect rankings assert ndcg_score([5, 3, 2], [2, 1, 0]) == 1.0 assert ndcg_score([2, 3, 5], [0, 1, 2]) == 1.0 assert ndcg_from_ranking([5, 3, 2], [0, 1, 2]) == 1.0 assert ndcg_score([5, 3, 2], [2, 1, 0], k=2) == 1.0 assert ndcg_score([2, 3, 5], [0, 1, 2], k=2) == 1.0 assert ndcg_from_ranking([5, 3, 2], [0, 1]) == 1.0 # Check that sample order is irrelevant assert dcg_score([5, 3, 2], [2, 1, 0]) == dcg_score([2, 3, 5], [0, 1, 2]) assert dcg_score([5, 3, 2], [2, 1, 0], k=2) == dcg_score([2, 3, 5], [0, 1, 2], k=2) # Check equivalence between two interfaces. 
assert dcg_score([5, 3, 2], [2, 1, 0]) == dcg_from_ranking([5, 3, 2], [0, 1, 2]) assert dcg_score([1, 3, 2], [2, 1, 0]) == dcg_from_ranking([1, 3, 2], [0, 1, 2]) assert dcg_score([1, 3, 2], [0, 2, 1]) == dcg_from_ranking([1, 3, 2], [1, 2, 0]) assert ndcg_score([1, 3, 2], [2, 1, 0]) == ndcg_from_ranking([1, 3, 2], [0, 1, 2]) assert dcg_score([5, 3, 2], [2, 1, 0], k=2) == dcg_from_ranking([5, 3, 2], [0, 1]) assert dcg_score([1, 3, 2], [2, 1, 0], k=2) == dcg_from_ranking([1, 3, 2], [0, 1]) assert dcg_score([1, 3, 2], [0, 2, 1], k=2) == dcg_from_ranking([1, 3, 2], [1, 2]) assert ndcg_score([1, 3, 2], [2, 1, 0], k=2) == \ ndcg_from_ranking([1, 3, 2], [0, 1]) # Precision assert ranking_precision_score([1, 1, 0], [3, 2, 1], k=2) == 1.0 assert ranking_precision_score([1, 1, 0], [1, 0, 0.5], k=2) == 0.5 assert ranking_precision_score([1, 1, 0], [3, 2, 1], k=3) == \ ranking_precision_score([1, 1, 0], [1, 0, 0.5], k=3) # Average precision from sklearn.metrics import average_precision_score as ap assert average_precision_score([1, 1, 0], [3, 2, 1]) == ap([1, 1, 0], [3, 2, 1]) assert average_precision_score([1, 1, 0], [3, 1, 0]) == ap([1, 1, 0], [3, 1, 0])

### kmike commented Jun 24, 2016

 👍 Nice implementation with a nice API! Are there plans to publish these snippets as a Python module?

### kamalbanga commented Jan 25, 2017

 The step `gains = 2 ** rel - 1` in the method `dcg_from_ranking` is an issue even for values as small as 70, since it overflows. For example, `ndcg_score([10, 1, 70], [2, 1, 0])` gives 1.59.

### Jomonsugi commented Aug 21, 2017

 I am assuming the ndcg_score y_true should be binary (ground truth), but am unsure what format y_score should be in. When calculating auc for example the y_true would be binary and the y_score does not have to be any given range. I could feed a vector of any range. The y_score I have is the output from an ALS collaborative model. For example: [ 1.09253478e-01 1.97033856e-01 5.51080274e-01 ..., 1.77992064e-03 1.90066773e-12 1.74711004e-04] Should I scale this vector in some way before using it as an argument to the ndcg_score function?

### rola93 commented May 19, 2018

 Nice implementation! I'd love it if they were part of scikit-learn! Perhaps I'd use `pos_label` as a parameter with default value 1, as in other metrics from scikit-learn.

### eggie5 commented Jun 23, 2018

 I like your adaptive denominator in the precision@k code! It takes care of one of the precision@k weaknesses when the number of relevant documents is less than `k`.

### senjed commented Oct 5, 2018

 Hey! What do you mean by "true relevance labels"? Is it the relevance score (an unnormalized score showing the relevance of an item to a query, the higher the more relevant), or the position of the item in the result (zero being the highest shown)? If an item is the most relevant, does it have a high value or a low value?

### tobiasraabe commented Jan 24, 2019

 Hi, thanks for the implementation, but I might caught an error in `precision_at_k`. Precision at k is calculated as the ratio between the number of correct classified samples divided by k or the total number of samples - whatever is smaller. Your implementation divides by `n_pos = np.sum(y_true == pos_label)` the total number of positive samples. This leads to wrong precision values if `k > n_pos`. It is easy to verify if you consider the case where all samples are classified as positive samples. Then, `float(n_relevant) / min(n_pos, k)` is 1 and not the ratio of positive samples to all samples. Please correct me if I am wrong.

### AsiehH commented May 13, 2019 • edited

 Hi, The sklearn metric `sklearn.metrics.average_precision_score` is different from what you defined above. It does not depend on k since it is average precision not average precision at k. Here are a few counter examples. `print(average_precision_score([1, 0, 1], [3, 2, 1]) == ap([1, 0, 1], [3, 2, 1]))` `False` `print(average_precision_score([1, 1, 1, 0], [3, 2, 1,4]) == ap([1, 1, 1, 0], [3, 2, 1,4]))` `False` `print(average_precision_score([1, 1, 1, 0], [3, 2, 4,1],k=2) == ap([1, 1, 1, 0], [3, 2, 4,1]))` `False` `print(average_precision_score([1, 1, 1, 0], [3, 2, 4,1],k=3) == ap([1, 1, 1, 0], [3, 2, 4,1]))` `True` I found several codes online for average precision, average precision at k and mean average precision at k and almost all of them give different values. Is there any text reference that defines how these are calculated with an example that you can reference?

### kamalbanga commented Jan 31, 2020 • edited

 What should the value of `ndcg_score` be when all relevance labels are zero (e.g. `ndcg_score([0, 0, 0], [2, 1, 0])`)? It gives the following warning, since the `best` and `actual` dcg_scores are both `0`: RuntimeWarning: invalid value encountered in double_scalars return actual / best But shouldn't it be just `1.0`? I believe that whenever `best_dcg_score` is `0`, `ndcg_score` should be `1.0`.