Instantly share code, notes, and snippets.

# mblondel/letor_metrics.py

Last active Jul 30, 2020
Learning to rank metrics.
 # (C) Mathieu Blondel, November 2013 # License: BSD 3 clause import numpy as np def ranking_precision_score(y_true, y_score, k=10): """Precision at rank k Parameters ---------- y_true : array-like, shape = [n_samples] Ground truth (true relevance labels). y_score : array-like, shape = [n_samples] Predicted scores. k : int Rank. Returns ------- precision @k : float """ unique_y = np.unique(y_true) if len(unique_y) > 2: raise ValueError("Only supported for two relevance levels.") pos_label = unique_y n_pos = np.sum(y_true == pos_label) order = np.argsort(y_score)[::-1] y_true = np.take(y_true, order[:k]) n_relevant = np.sum(y_true == pos_label) # Divide by min(n_pos, k) such that the best achievable score is always 1.0. return float(n_relevant) / min(n_pos, k) def average_precision_score(y_true, y_score, k=10): """Average precision at rank k Parameters ---------- y_true : array-like, shape = [n_samples] Ground truth (true relevance labels). y_score : array-like, shape = [n_samples] Predicted scores. k : int Rank. Returns ------- average precision @k : float """ unique_y = np.unique(y_true) if len(unique_y) > 2: raise ValueError("Only supported for two relevance levels.") pos_label = unique_y n_pos = np.sum(y_true == pos_label) order = np.argsort(y_score)[::-1][:min(n_pos, k)] y_true = np.asarray(y_true)[order] score = 0 for i in xrange(len(y_true)): if y_true[i] == pos_label: # Compute precision up to document i # i.e, percentage of relevant documents up to document i. prec = 0 for j in xrange(0, i + 1): if y_true[j] == pos_label: prec += 1.0 prec /= (i + 1.0) score += prec if n_pos == 0: return 0 return score / n_pos def dcg_score(y_true, y_score, k=10, gains="exponential"): """Discounted cumulative gain (DCG) at rank k Parameters ---------- y_true : array-like, shape = [n_samples] Ground truth (true relevance labels). y_score : array-like, shape = [n_samples] Predicted scores. k : int Rank. gains : str Whether gains should be "exponential" (default) or "linear". 
Returns ------- DCG @k : float """ order = np.argsort(y_score)[::-1] y_true = np.take(y_true, order[:k]) if gains == "exponential": gains = 2 ** y_true - 1 elif gains == "linear": gains = y_true else: raise ValueError("Invalid gains option.") # highest rank is 1 so +2 instead of +1 discounts = np.log2(np.arange(len(y_true)) + 2) return np.sum(gains / discounts) def ndcg_score(y_true, y_score, k=10, gains="exponential"): """Normalized discounted cumulative gain (NDCG) at rank k Parameters ---------- y_true : array-like, shape = [n_samples] Ground truth (true relevance labels). y_score : array-like, shape = [n_samples] Predicted scores. k : int Rank. gains : str Whether gains should be "exponential" (default) or "linear". Returns ------- NDCG @k : float """ best = dcg_score(y_true, y_true, k, gains) actual = dcg_score(y_true, y_score, k, gains) return actual / best # Alternative API. def dcg_from_ranking(y_true, ranking): """Discounted cumulative gain (DCG) at rank k Parameters ---------- y_true : array-like, shape = [n_samples] Ground truth (true relevance labels). ranking : array-like, shape = [k] Document indices, i.e., ranking is the index of top-ranked document, ranking is the index of second-ranked document, ... k : int Rank. Returns ------- DCG @k : float """ y_true = np.asarray(y_true) ranking = np.asarray(ranking) rel = y_true[ranking] gains = 2 ** rel - 1 discounts = np.log2(np.arange(len(ranking)) + 2) return np.sum(gains / discounts) def ndcg_from_ranking(y_true, ranking): """Normalized discounted cumulative gain (NDCG) at rank k Parameters ---------- y_true : array-like, shape = [n_samples] Ground truth (true relevance labels). ranking : array-like, shape = [k] Document indices, i.e., ranking is the index of top-ranked document, ranking is the index of second-ranked document, ... k : int Rank. 
Returns ------- NDCG @k : float """ k = len(ranking) best_ranking = np.argsort(y_true)[::-1] best = dcg_from_ranking(y_true, best_ranking[:k]) return dcg_from_ranking(y_true, ranking) / best if __name__ == '__main__': # Check that some rankings are better than others assert dcg_score([5, 3, 2], [2, 1, 0]) > dcg_score([4, 3, 2], [2, 1, 0]) assert dcg_score([4, 3, 2], [2, 1, 0]) > dcg_score([1, 3, 2], [2, 1, 0]) assert dcg_score([5, 3, 2], [2, 1, 0], k=2) > dcg_score([4, 3, 2], [2, 1, 0], k=2) assert dcg_score([4, 3, 2], [2, 1, 0], k=2) > dcg_score([1, 3, 2], [2, 1, 0], k=2) # Perfect rankings assert ndcg_score([5, 3, 2], [2, 1, 0]) == 1.0 assert ndcg_score([2, 3, 5], [0, 1, 2]) == 1.0 assert ndcg_from_ranking([5, 3, 2], [0, 1, 2]) == 1.0 assert ndcg_score([5, 3, 2], [2, 1, 0], k=2) == 1.0 assert ndcg_score([2, 3, 5], [0, 1, 2], k=2) == 1.0 assert ndcg_from_ranking([5, 3, 2], [0, 1]) == 1.0 # Check that sample order is irrelevant assert dcg_score([5, 3, 2], [2, 1, 0]) == dcg_score([2, 3, 5], [0, 1, 2]) assert dcg_score([5, 3, 2], [2, 1, 0], k=2) == dcg_score([2, 3, 5], [0, 1, 2], k=2) # Check equivalence between two interfaces. 
assert dcg_score([5, 3, 2], [2, 1, 0]) == dcg_from_ranking([5, 3, 2], [0, 1, 2]) assert dcg_score([1, 3, 2], [2, 1, 0]) == dcg_from_ranking([1, 3, 2], [0, 1, 2]) assert dcg_score([1, 3, 2], [0, 2, 1]) == dcg_from_ranking([1, 3, 2], [1, 2, 0]) assert ndcg_score([1, 3, 2], [2, 1, 0]) == ndcg_from_ranking([1, 3, 2], [0, 1, 2]) assert dcg_score([5, 3, 2], [2, 1, 0], k=2) == dcg_from_ranking([5, 3, 2], [0, 1]) assert dcg_score([1, 3, 2], [2, 1, 0], k=2) == dcg_from_ranking([1, 3, 2], [0, 1]) assert dcg_score([1, 3, 2], [0, 2, 1], k=2) == dcg_from_ranking([1, 3, 2], [1, 2]) assert ndcg_score([1, 3, 2], [2, 1, 0], k=2) == \ ndcg_from_ranking([1, 3, 2], [0, 1]) # Precision assert ranking_precision_score([1, 1, 0], [3, 2, 1], k=2) == 1.0 assert ranking_precision_score([1, 1, 0], [1, 0, 0.5], k=2) == 0.5 assert ranking_precision_score([1, 1, 0], [3, 2, 1], k=3) == \ ranking_precision_score([1, 1, 0], [1, 0, 0.5], k=3) # Average precision from sklearn.metrics import average_precision_score as ap assert average_precision_score([1, 1, 0], [3, 2, 1]) == ap([1, 1, 0], [3, 2, 1]) assert average_precision_score([1, 1, 0], [3, 1, 0]) == ap([1, 1, 0], [3, 1, 0])

### kmike commented Jun 24, 2016

 👍 Nice implementation with a nice API! Are there plans to publish these snippets as a Python module?

### kamalbanga commented Jan 25, 2017

 The step `gains = 2 ** rel - 1` in the method `dcg_from_ranking` is an issue even for values as small as 70, since it overflows. For example, `ndcg_score([10, 1, 70], [2, 1, 0])` gives 1.59.

### Jomonsugi commented Aug 21, 2017

 I am assuming the ndcg_score y_true should be binary (ground truth), but am unsure what format y_score should be in. When calculating auc for example the y_true would be binary and the y_score does not have to be any given range. I could feed a vector of any range. The y_score I have is the output from an ALS collaborative model. For example: [ 1.09253478e-01 1.97033856e-01 5.51080274e-01 ..., 1.77992064e-03 1.90066773e-12 1.74711004e-04] Should I scale this vector in some way before using it as an argument to the ndcg_score function?

### rola93 commented May 19, 2018

 Nice implementation! I'd love it if they were part of scikit-learn! Perhaps I'd use `pos_label` as a parameter with default value 1, as in other metrics from scikit-learn.

### eggie5 commented Jun 23, 2018

 I like your adaptive denominator in the precision@k code! It takes care of one of the precision@k weaknesses when the number of relevant documents is less than `k`.

### senjed commented Oct 5, 2018

 Hey! What do you mean by "true relevance labels"? Is it the relevance score (an unnormalized score showing the relevance of an item to a query, the higher the more relevant), or the position of the item in the result (zero being the highest shown)? If an item is the most relevant, does it have a high value or a low value?

### tobiasraabe commented Jan 24, 2019

 Hi, thanks for the implementation, but I might caught an error in `precision_at_k`. Precision at k is calculated as the ratio between the number of correct classified samples divided by k or the total number of samples - whatever is smaller. Your implementation divides by `n_pos = np.sum(y_true == pos_label)` the total number of positive samples. This leads to wrong precision values if `k > n_pos`. It is easy to verify if you consider the case where all samples are classified as positive samples. Then, `float(n_relevant) / min(n_pos, k)` is 1 and not the ratio of positive samples to all samples. Please correct me if I am wrong.

### AsiehH commented May 13, 2019 • edited

 Hi, The sklearn metric `sklearn.metrics.average_precision_score` is different from what you defined above. It does not depend on k since it is average precision not average precision at k. Here are a few counter examples. `print(average_precision_score([1, 0, 1], [3, 2, 1]) == ap([1, 0, 1], [3, 2, 1]))` `False` `print(average_precision_score([1, 1, 1, 0], [3, 2, 1,4]) == ap([1, 1, 1, 0], [3, 2, 1,4]))` `False` `print(average_precision_score([1, 1, 1, 0], [3, 2, 4,1],k=2) == ap([1, 1, 1, 0], [3, 2, 4,1]))` `False` `print(average_precision_score([1, 1, 1, 0], [3, 2, 4,1],k=3) == ap([1, 1, 1, 0], [3, 2, 4,1]))` `True` I found several codes online for average precision, average precision at k and mean average precision at k and almost all of them give different values. Is there any text reference that defines how these are calculated with an example that you can reference?

### kamalbanga commented Jan 31, 2020 • edited

 What should the value of `ndcg_score` be when all relevance labels are zero (e.g. `ndcg_score([0, 0, 0], [2, 1, 0])`)? It gives the following warning, since the `best` and `actual` dcg_scores are both `0`: RuntimeWarning: invalid value encountered in double_scalars return actual / best But shouldn't it be just `1.0`? I believe that whenever `best_dcg_score` is `0`, `ndcg_score` should be `1.0`.