Ranking Metrics
"""Information Retrieval metrics
Useful Resources:
http://www.cs.utexas.edu/~mooney/ir-course/slides/Evaluation.ppt
http://www.nii.ac.jp/TechReports/05-014E.pdf
http://www.stanford.edu/class/cs276/handouts/EvaluationNew-handout-6-per.pdf
http://hal.archives-ouvertes.fr/docs/00/72/67/60/PDF/07-busa-fekete.pdf
Learning to Rank for Information Retrieval (Tie-Yan Liu)
"""
import numpy as np


def mean_reciprocal_rank(rs):
    """Score is reciprocal of the rank of the first relevant item

    First element is 'rank 1'. Relevance is binary (nonzero is relevant).
    Example from http://en.wikipedia.org/wiki/Mean_reciprocal_rank
    >>> rs = [[0, 0, 1], [0, 1, 0], [1, 0, 0]]
    >>> mean_reciprocal_rank(rs)
    0.61111111111111105
    >>> rs = np.array([[0, 0, 0], [0, 1, 0], [1, 0, 0]])
    >>> mean_reciprocal_rank(rs)
    0.5
    >>> rs = [[0, 0, 0, 1], [1, 0, 0], [1, 0, 0]]
    >>> mean_reciprocal_rank(rs)
    0.75

    Args:
        rs: Iterator of relevance scores (list or numpy) in rank order
            (first element is the first item)

    Returns:
        Mean reciprocal rank
    """
    rs = (np.asarray(r).nonzero()[0] for r in rs)
    return np.mean([1. / (r[0] + 1) if r.size else 0. for r in rs])


def r_precision(r):
    """Score is precision after all relevant documents have been retrieved

    Relevance is binary (nonzero is relevant).
    >>> r = [0, 0, 1]
    >>> r_precision(r)
    0.33333333333333331
    >>> r = [0, 1, 0]
    >>> r_precision(r)
    0.5
    >>> r = [1, 0, 0]
    >>> r_precision(r)
    1.0

    Args:
        r: Relevance scores (list or numpy) in rank order
            (first element is the first item)

    Returns:
        R Precision
    """
    r = np.asarray(r) != 0
    z = r.nonzero()[0]
    if not z.size:
        return 0.
    return np.mean(r[:z[-1] + 1])


def precision_at_k(r, k):
    """Score is precision @ k

    Relevance is binary (nonzero is relevant).
    >>> r = [0, 0, 1]
    >>> precision_at_k(r, 1)
    0.0
    >>> precision_at_k(r, 2)
    0.0
    >>> precision_at_k(r, 3)
    0.33333333333333331
    >>> precision_at_k(r, 4)
    Traceback (most recent call last):
        File "<stdin>", line 1, in ?
    ValueError: Relevance score length < k

    Args:
        r: Relevance scores (list or numpy) in rank order
            (first element is the first item)
        k: Number of results to consider

    Returns:
        Precision @ k

    Raises:
        ValueError: len(r) must be >= k
    """
    assert k >= 1
    r = np.asarray(r)[:k] != 0
    if r.size != k:
        raise ValueError('Relevance score length < k')
    return np.mean(r)


def average_precision(r):
    """Score is average precision (area under PR curve)

    Relevance is binary (nonzero is relevant).
    >>> r = [1, 1, 0, 1, 0, 1, 0, 0, 0, 1]
    >>> delta_r = 1. / sum(r)
    >>> sum([sum(r[:x + 1]) / (x + 1.) * delta_r for x, y in enumerate(r) if y])
    0.7833333333333333
    >>> average_precision(r)
    0.78333333333333333

    Args:
        r: Relevance scores (list or numpy) in rank order
            (first element is the first item)

    Returns:
        Average precision
    """
    r = np.asarray(r) != 0
    out = [precision_at_k(r, k + 1) for k in range(r.size) if r[k]]
    if not out:
        return 0.
    return np.mean(out)


def mean_average_precision(rs):
    """Score is mean average precision

    Relevance is binary (nonzero is relevant).
    >>> rs = [[1, 1, 0, 1, 0, 1, 0, 0, 0, 1]]
    >>> mean_average_precision(rs)
    0.78333333333333333
    >>> rs = [[1, 1, 0, 1, 0, 1, 0, 0, 0, 1], [0]]
    >>> mean_average_precision(rs)
    0.39166666666666666

    Args:
        rs: Iterator of relevance scores (list or numpy) in rank order
            (first element is the first item)

    Returns:
        Mean average precision
    """
    return np.mean([average_precision(r) for r in rs])


def dcg_at_k(r, k, method=0):
    """Score is discounted cumulative gain (dcg)

    Relevance is positive real values. Can use binary
    as the previous methods.
    Example from
    http://www.stanford.edu/class/cs276/handouts/EvaluationNew-handout-6-per.pdf
    >>> r = [3, 2, 3, 0, 0, 1, 2, 2, 3, 0]
    >>> dcg_at_k(r, 1)
    3.0
    >>> dcg_at_k(r, 1, method=1)
    3.0
    >>> dcg_at_k(r, 2)
    5.0
    >>> dcg_at_k(r, 2, method=1)
    4.2618595071429155
    >>> dcg_at_k(r, 10)
    9.6051177391888114
    >>> dcg_at_k(r, 11)
    9.6051177391888114

    Args:
        r: Relevance scores (list or numpy) in rank order
            (first element is the first item)
        k: Number of results to consider
        method: If 0 then weights are [1.0, 1.0, 0.6309, 0.5, 0.4307, ...]
                If 1 then weights are [1.0, 0.6309, 0.5, 0.4307, ...]

    Returns:
        Discounted cumulative gain
    """
    r = np.asfarray(r)[:k]
    if r.size:
        if method == 0:
            return r[0] + np.sum(r[1:] / np.log2(np.arange(2, r.size + 1)))
        elif method == 1:
            return np.sum(r / np.log2(np.arange(2, r.size + 2)))
        else:
            raise ValueError('method must be 0 or 1.')
    return 0.


def ndcg_at_k(r, k, method=0):
    """Score is normalized discounted cumulative gain (ndcg)

    Relevance is positive real values. Can use binary
    as the previous methods.
    Example from
    http://www.stanford.edu/class/cs276/handouts/EvaluationNew-handout-6-per.pdf
    >>> r = [3, 2, 3, 0, 0, 1, 2, 2, 3, 0]
    >>> ndcg_at_k(r, 1)
    1.0
    >>> r = [2, 1, 2, 0]
    >>> ndcg_at_k(r, 4)
    0.9203032077642922
    >>> ndcg_at_k(r, 4, method=1)
    0.96519546960144276
    >>> ndcg_at_k([0], 1)
    0.0
    >>> ndcg_at_k([1], 2)
    1.0

    Args:
        r: Relevance scores (list or numpy) in rank order
            (first element is the first item)
        k: Number of results to consider
        method: If 0 then weights are [1.0, 1.0, 0.6309, 0.5, 0.4307, ...]
                If 1 then weights are [1.0, 0.6309, 0.5, 0.4307, ...]

    Returns:
        Normalized discounted cumulative gain
    """
    dcg_max = dcg_at_k(sorted(r, reverse=True), k, method)
    if not dcg_max:
        return 0.
    return dcg_at_k(r, k, method) / dcg_max


if __name__ == "__main__":
    import doctest
    doctest.testmod()
@teh

commented Jul 27, 2015

Hey, this is a nice implementation for those metrics! Would you mind clarifying the license by sticking a license header on top? Thanks in any case :)

@jstypka

commented Feb 4, 2016

great work!

@kdqzzxxcc

commented Mar 2, 2016

nicely done!

@duongtrung

commented Jun 24, 2016

nice work

@kn45

commented Oct 31, 2016

great!! And I'm looking for pairwise AUC

@kell18

commented Feb 5, 2017

Thanks, nice work!

But NDCG for recommender systems evaluation needs to account for the whole list of user ratings, i.e. ratings that were not included in the recommendation delivery. My implementation of recsys NDCG.
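
A minimal sketch of that idea, reusing dcg_at_k from the gist (this assumes the gist is saved as rank_metrics.py, as later comments do); delivered and all_judged are hypothetical names for the relevances of the recommended items and of every judged item for the user:

from rank_metrics import dcg_at_k  # assumes the gist is saved as rank_metrics.py


def ndcg_at_k_full_ideal(delivered, all_judged, k, method=0):
    """NDCG@k where the ideal DCG is computed over all judged items,
    not only the items that were actually delivered/retrieved."""
    dcg_max = dcg_at_k(sorted(all_judged, reverse=True), k, method)
    if not dcg_max:
        return 0.
    return dcg_at_k(delivered, k, method) / dcg_max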

@peustr

commented May 24, 2017

Here's my small contribution; nDCG as defined by Kaggle: https://www.kaggle.com/wiki/NormalizedDiscountedCumulativeGain
The only difference is: r → 2^r - 1

import numpy as np


def dcg_at_k(r, k):
    r = np.asfarray(r)[:k]
    if r.size:
        return np.sum(np.subtract(np.power(2, r), 1) / np.log2(np.arange(2, r.size + 2)))
    return 0.


def ndcg_at_k(r, k):
    idcg = dcg_at_k(sorted(r, reverse=True), k)
    if not idcg:
        return 0.
    return dcg_at_k(r, k) / idcg
@debmalya97

commented Jun 26, 2017

Hey, I am a beginner and I was trying to find the NDCG score for my similarity matrix for 5 iterations. But the error coming up is "The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()". Could somebody help me out with what this means? Thanks a ton
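
That error usually means a whole 2-D array (e.g. the similarity matrix itself) was passed where these functions expect a single 1-D relevance list per query. A minimal sketch of scoring row by row, with sim_relevance as a hypothetical 2-D array of per-query relevance scores:

import numpy as np
from rank_metrics import ndcg_at_k  # assumes the gist is saved as rank_metrics.py

# Hypothetical input: one row of graded relevance scores per query/iteration.
sim_relevance = np.array([[3, 2, 0, 1],
                          [0, 1, 2, 0],
                          [1, 0, 0, 3]])

# Score each row separately and average; the gist's functions expect 1-D input.
scores = [ndcg_at_k(row, 4) for row in sim_relevance]
print(np.mean(scores))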

@suzanv

commented Jul 19, 2017

I agree with kell18. That holds not only for recommender systems but also for IR: the ideal ranking should be the ranking of all judged items in the collection for the query, not only the retrieved ones.

@Jomonsugi

commented Aug 22, 2017

Can you give me an idea of how to use your function if I have a vector of binary (ground truth) labels and then an output from an ALS model, for example: [ 1.09253478e-01 1.97033856e-01 5.51080274e-01 ..., 1.77992064e-03 1.90066773e-12 1.74711004e-04]

When evaluating my model using AUC, I can just feed in the binary ground truth vector and the output from my ALS model as the predicted scores as is, but I am wondering how this would work with your functions if I am considering, for example, k=10 recommendations and would like to use NDCG to evaluate the output.
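
One common way to bridge this (a sketch, not part of the gist): rank the items by descending model score, reorder the binary ground-truth labels accordingly, and feed that reordered list to ndcg_at_k with k=10. The arrays below are hypothetical placeholders for the ALS output:

import numpy as np
from rank_metrics import ndcg_at_k  # assumes the gist is saved as rank_metrics.py

# Hypothetical data: binary ground truth and ALS scores for the same items.
y_true = np.array([1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0])
y_score = np.array([0.9, 0.1, 0.3, 0.8, 0.05, 0.7, 0.2, 0.01, 0.15, 0.02, 0.6, 0.4])

# Relevance labels in the order the model would recommend the items.
ranked_labels = y_true[np.argsort(-y_score)]

print(ndcg_at_k(ranked_labels, 10))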

@gumption

commented Nov 17, 2017

The reference URL in the comments for ndcg_at_k does not work. I believe this is the current URL for the referenced document: http://web.stanford.edu/class/cs276/handouts/EvaluationNew-handout-6-per.pdf

@dondon2475848

commented Feb 1, 2018

There is a typographical error in the formula referenced in the original definition of this function:
http://www.stanford.edu/class/cs276/handouts/EvaluationNew-handout-6-per.pdf
log2(i) should be log2(i+1)

The formulas here are derived from
https://en.wikipedia.org/wiki/Discounted_cumulative_gain#Discounted_Cumulative_Gain

@gumption found this issue; see more:
https://gist.github.com/gumption/b54278ec9bab2c0e0472816d1d7663be
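
For reference, the two discount conventions implemented in the gist as method=0 and method=1 work out to:

$$\mathrm{DCG}^{(\text{method}=0)}_k = rel_1 + \sum_{i=2}^{k} \frac{rel_i}{\log_2 i}, \qquad \mathrm{DCG}^{(\text{method}=1)}_k = \sum_{i=1}^{k} \frac{rel_i}{\log_2(i+1)}$$

so method=1 matches the Wikipedia formula linked above.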

@romanegloo

commented May 23, 2018

+1 to kell18 and suzanv. Quoting the Wikipedia article on ideal DCG:

sorting all relevant documents in the corpus by their relative relevance, producing the maximum possible DCG through position p, also called Ideal DCG (IDCG) through that position.

@estathop

commented Sep 27, 2018

how about recall@K?
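
A minimal recall@k sketch in the same style as the gist; note that the total number of relevant items for the query has to be supplied separately, since the ranked list alone does not contain it (n_relevant is a hypothetical parameter name):

import numpy as np


def recall_at_k(r, k, n_relevant):
    """Fraction of all relevant items that appear in the top k.
    Relevance is binary (nonzero is relevant)."""
    assert k >= 1 and n_relevant > 0
    r = np.asarray(r)[:k] != 0
    return np.sum(r) / float(n_relevant)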

@lucidyan

commented Mar 3, 2019

@bwhite
I think you have an error in the average_precision metric.

So for example:

from rank_metrics import average_precision

relevance_list = [[1, 1, 1], [1, 1, 0], [1, 0, 0]]

for r in relevance_list:
    print(average_precision(r))

Will print:

1.0
1.0
1.0

Instead of:

1.
0.6666666666666666
0.3333333333333333

So in the metric's return you should replace np.mean(out) with np.sum(out) / len(r)
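
A sketch of that variant as a standalone function, for comparison (whether to normalize by len(r) or by the number of relevant items found depends on which definition of average precision is wanted):

import numpy as np
from rank_metrics import precision_at_k  # assumes the gist is saved as rank_metrics.py


def average_precision_over_list(r):
    # Same as average_precision above, but normalized by the list length
    # instead of by the number of relevant items found.
    r = np.asarray(r) != 0
    out = [precision_at_k(r, k + 1) for k in range(r.size) if r[k]]
    return np.sum(out) / float(len(r)) if out else 0.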
