Skip to content

Instantly share code, notes, and snippets.

@pamelafox
Last active May 6, 2018 05:15
Show Gist options
  • Save pamelafox/417bc6265e411a24f4e3 to your computer and use it in GitHub Desktop.
Save pamelafox/417bc6265e411a24f4e3 to your computer and use it in GitHub Desktop.
Hotlist algorithm
"""Scores for any type of entity.
This is used to rank entities, for example the list of top forks of a
scratchpad and the current "hottest" scratchpads.
"""
import math
import datetime
import functools
from google.appengine.ext import ndb
__all__ = ["TimeIndependentScoreProperty", "TimeDependentScoreProperty"]
def epoch_seconds(date):
    """Return the number of seconds from the Unix epoch to *date*.

    date: a naive datetime assumed to be in UTC (ndb stores UTC).
    Returns a float (may be negative for pre-1970 dates).
    """
    unix_epoch = datetime.datetime(1970, 1, 1)
    elapsed = date - unix_epoch
    return elapsed.total_seconds()
def wilson_confidence(upvotes_name, downvotes_name, score):
    """Lower bound of the Wilson score 90% confidence interval.

    This is the algorithm Reddit uses to sort comments. It estimates the
    "true" upvote ratio of an entity from the votes observed so far, so it
    is only meaningful when both upvotes and downvotes are possible.

    upvotes_name: attribute name of the upvote count on *score*.
    downvotes_name: attribute name of the downvote count on *score*.
    score: the entity carrying the vote-count attributes.

    See http://www.evanmiller.org/how-not-to-sort-by-average-rating.html
    """
    ups = getattr(score, upvotes_name)
    downs = getattr(score, downvotes_name)
    if ups == 0:
        # Nothing but downvotes: rank strictly by how many there are.
        return -downs
    if ups == downs:
        # Perfectly split vote: treat as neutral.
        return 0
    n = ups + downs
    z = 1.64485  # z-score for a 90% confidence level
    phat = float(ups) / n  # observed upvote fraction (p-hat)
    # Same expression as the textbook formula, evaluated in the same order.
    numerator = (phat + z * z / (2 * n) - z *
                 math.sqrt((phat * (1 - phat) + z * z / (4 * n)) / n))
    return numerator / (1 + z * z / n)
def time_dependent(decay_seconds, upvotes_name, downvotes_name, created_name,
                   score):
    """Ranking based on both age and quality.

    This is the algorithm Reddit uses to sort stories. We want there to be
    churn, a constant stream of new stories hitting the top scratchpads page,
    so this algorithm takes into account both the score of the scratchpad as
    well as the age.

    decay_seconds: how many seconds of age offset one order of magnitude
        (factor of 10) in net votes.
    upvotes_name / downvotes_name: attribute names of the vote counts.
    created_name: attribute name of the creation datetime (naive UTC).
    score: the entity carrying those attributes.

    See http://amix.dk/blog/post/19588
    """
    s = getattr(score, upvotes_name) - getattr(score, downvotes_name)
    # Weight votes logarithmically - initial votes are worth a ton
    order = math.log(max(abs(s), 1), 10)
    # signum of the net vote count
    sign = 1 if s > 0 else -1 if s < 0 else 0
    # BUG FIX: the epoch delta below assumes a naive UTC datetime (ndb
    # stores created timestamps in UTC), so the fallback for a missing
    # creation time must be utcnow(), not the local-time now() - otherwise
    # scores are skewed by the server's UTC offset.
    date = getattr(score, created_name) or datetime.datetime.utcnow()
    # Seconds since this algorithm's start date, 2012-10-01 (when this
    # code was written) - newer entities get a larger time bonus.
    seconds = ((date - datetime.datetime(1970, 1, 1)).total_seconds()
               - 1349108492)
    return round(order + sign * seconds / decay_seconds, 7)
class TimeIndependentScoreProperty(ndb.ComputedProperty):
    """A quality score for an entity based purely on votes, ignoring age.

    Mirrors the algorithm Reddit uses for comments: it estimates, with 90%
    confidence, the "real" worth of an entity from its current votes, so
    the highest-quality content rises to the top regardless of when it was
    created.

    upvotes_name: The name of the count of upvotes on your model.
    downvotes_name: The name of the count of downvotes on your model.
    """

    def __init__(self, upvotes_name="upvotes", downvotes_name="downvotes",
                 **kwargs):
        # Bind the attribute names up front; ndb invokes the resulting
        # one-argument callable with the entity to compute the value.
        compute = functools.partial(
            wilson_confidence, upvotes_name, downvotes_name)
        super(TimeIndependentScoreProperty, self).__init__(compute, **kwargs)
class TimeDependentScoreProperty(ndb.ComputedProperty):
    """A "hotness" score for an entity, balancing quality against age.

    Think of the frontpage of HN or Reddit - in fact this uses the same
    algorithm as Reddit uses for submissions. Use it when you want churn:
    a constant stream of content where newness and vote quality both count.

    decay_seconds: seconds of age that offset one order of magnitude of votes.
    upvotes_name: The name of the count of upvotes on your model.
    downvotes_name: The name of the count of downvotes on your model.
    created_name: The name of the creation datetime on your model.
    """

    def __init__(self, decay_seconds, upvotes_name="upvotes",
                 downvotes_name="downvotes", created_name="created", **kwargs):
        # Freeze the configuration into a one-argument callable; ndb calls
        # it with the entity whenever the computed value is needed.
        compute = functools.partial(
            time_dependent, decay_seconds, upvotes_name, downvotes_name,
            created_name)
        super(TimeDependentScoreProperty, self).__init__(compute, **kwargs)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment