Ranking Metrics
"""
程式參考自:
https://gist.github.com/bwhite/3726239
https://gist.github.com/gumption/b54278ec9bab2c0e0472816d1d7663be
差異:新增「 sum (2^rel_i - 1) / log2(i + 1) 」的版本
作者:Jie Dondon
版本:ndcg_dondon_20180201_v2
"""
import numpy as np


def dcg_at_k(r, k, method=0):
    """Score is discounted cumulative gain (DCG).

    Relevance scores are positive real values; binary relevance values also work.
    The formula in the handout referenced by the original version of this function
    contains a typographical error:
    http://www.stanford.edu/class/cs276/handouts/EvaluationNew-handout-6-per.pdf
    log2(i) should be log2(i + 1).
    The formulas here follow
    https://en.wikipedia.org/wiki/Discounted_cumulative_gain#Discounted_Cumulative_Gain
    Both formulas return the same result when r contains only binary values.

    >>> r = [2, 3, 2, 3, 1, 1]
    >>> dcg_at_k(r, 6, 1)
    12.674304175856518
    >>> r = [3, 3, 2, 2, 1, 1]
    >>> dcg_at_k(r, 6, 1)
    14.951597943562946

    Args:
        r: Relevance scores (list or numpy array) in rank order
            (first element is the most relevant item)
        k: Number of results to consider
        method: If 0 then sum rel_i / log2(i + 1) [not log2(i)]
                If 1 then sum (2^rel_i - 1) / log2(i + 1)

    Returns:
        Discounted cumulative gain
    """
    r = np.asfarray(r)[:k]
    if r.size:
        if method == 0:
            return np.sum(r / np.log2(np.arange(2, r.size + 2)))
        elif method == 1:
            return np.sum((np.power(2, r) - 1) / np.log2(np.arange(2, r.size + 2)))
        else:
            raise ValueError('method must be 0 or 1')
    return 0.
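

# Illustrative sketch: with graded relevance the two gain variants diverge,
# while for binary relevance 2^rel - 1 == rel, so both methods agree.
# The helper name and sample ranking below are arbitrary examples for
# comparison, not part of the metric definitions above.
def _compare_dcg_methods(r=(3, 2, 3, 0, 1, 2), k=6):
    """Return (method-0 DCG, method-1 DCG) for the same ranking."""
    return dcg_at_k(list(r), k, method=0), dcg_at_k(list(r), k, method=1)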


def ndcg_at_k(r, k, method=0):
    """Score is normalized discounted cumulative gain (NDCG).

    Relevance scores are positive real values; binary relevance values also work.
    Example from
    2013 - Introduction to Information Retrieval Evaluation, p. 3
    (http://www.stanford.edu/class/cs276/handouts/EvaluationNew-handout-6-per.pdf)
    ^ The formula in that handout is wrong, so the result it gives is also wrong.
    2012 - Microsoft Research Asia - 武威 - Fundamentals of Machine Learning and Learning to Rank, p. 36

    >>> r = [2, 1, 2, 0]
    >>> ndcg_at_k(r, 4, 0)
    0.96519546960144276
    >>> r = [2, 3, 2, 3, 1, 1]
    >>> ndcg_at_k(r, 6, 1)
    0.84768893757694552

    Args:
        r: Relevance scores (list or numpy array) in rank order
            (first element is the most relevant item)
        k: Number of results to consider
        method: If 0 then sum rel_i / log2(i + 1) [not log2(i)]
                If 1 then sum (2^rel_i - 1) / log2(i + 1)

    Returns:
        Normalized discounted cumulative gain
    """
    dcg_max = dcg_at_k(sorted(r, reverse=True), k, method)
    if not dcg_max:
        return 0.
    return dcg_at_k(r, k, method) / dcg_max
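

# Minimal usage sketch: running this file directly reproduces the values
# quoted in the docstrings above. The sample rankings are taken from those
# docstring examples, not additional test cases.
if __name__ == '__main__':
    r = [2, 1, 2, 0]
    print(ndcg_at_k(r, 4, 0))   # ~0.9652, matches the ndcg_at_k docstring
    r = [2, 3, 2, 3, 1, 1]
    print(dcg_at_k(r, 6, 1))    # ~12.6743, matches the dcg_at_k docstring
    print(ndcg_at_k(r, 6, 1))   # ~0.8477, matches the ndcg_at_k docstring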