Skip to content

Instantly share code, notes, and snippets.

@pamelafox
Last active May 6, 2018 05:15
Show Gist options
  • Save pamelafox/417bc6265e411a24f4e3 to your computer and use it in GitHub Desktop.
Save pamelafox/417bc6265e411a24f4e3 to your computer and use it in GitHub Desktop.
Hotlist algorithm
"""Scores for any type of entity.
This is used to rank entities, for example the list of top forks of a
scratchpad and the current "hottest" scratchpads.
"""
import math
import datetime
import functools
from google.appengine.ext import ndb
__all__ = ["TimeIndependentScoreProperty", "TimeDependentScoreProperty"]
def epoch_seconds(date):
    """Return the number of seconds from the Unix epoch to *date*.

    date: a naive datetime assumed to be in UTC (ndb stores UTC).
    Returns a float (may be negative for pre-1970 dates).
    """
    unix_epoch = datetime.datetime(1970, 1, 1)
    elapsed = date - unix_epoch
    return elapsed.total_seconds()
def wilson_confidence(upvotes_name, downvotes_name, score):
    """Lower bound of the Wilson score 90% confidence interval.

    This is the algorithm Reddit uses to sort comments. It estimates the
    "true" upvote ratio of an entity from the votes observed so far, so it
    is only meaningful when both upvotes and downvotes are possible.

    upvotes_name: attribute name of the upvote count on *score*.
    downvotes_name: attribute name of the downvote count on *score*.
    score: the entity carrying the vote-count attributes.

    See http://www.evanmiller.org/how-not-to-sort-by-average-rating.html
    """
    ups = getattr(score, upvotes_name)
    downs = getattr(score, downvotes_name)
    if ups == 0:
        # Nothing but downvotes: rank strictly by how many there are.
        return -downs
    if ups == downs:
        # Perfectly split vote: treat as neutral.
        return 0
    n = ups + downs
    z = 1.64485  # z-score for a 90% confidence level
    phat = float(ups) / n  # observed upvote fraction (p-hat)
    # Same expression as the textbook formula, evaluated in the same order.
    numerator = (phat + z * z / (2 * n) - z *
                 math.sqrt((phat * (1 - phat) + z * z / (4 * n)) / n))
    return numerator / (1 + z * z / n)
def time_dependent(decay_seconds, upvotes_name, downvotes_name, created_name,
                   score):
    """Ranking based on both age and quality.

    This is the algorithm Reddit uses to sort stories. We want there to be
    churn, a constant stream of new stories hitting the top scratchpads page,
    so this algorithm takes into account both the score of the scratchpad as
    well as the age.

    decay_seconds: how many seconds of age offset one order of magnitude
        (factor of 10) in net votes.
    upvotes_name / downvotes_name: attribute names of the vote counts.
    created_name: attribute name of the creation datetime (naive UTC).
    score: the entity carrying those attributes.

    See http://amix.dk/blog/post/19588
    """
    s = getattr(score, upvotes_name) - getattr(score, downvotes_name)
    # Weight votes logarithmically - initial votes are worth a ton
    order = math.log(max(abs(s), 1), 10)
    # signum of the net vote count
    sign = 1 if s > 0 else -1 if s < 0 else 0
    # BUG FIX: the epoch delta below assumes a naive UTC datetime (ndb
    # stores created timestamps in UTC), so the fallback for a missing
    # creation time must be utcnow(), not the local-time now() - otherwise
    # scores are skewed by the server's UTC offset.
    date = getattr(score, created_name) or datetime.datetime.utcnow()
    # Seconds since this algorithm's start date, 2012-10-01 (when this
    # code was written) - newer entities get a larger time bonus.
    seconds = ((date - datetime.datetime(1970, 1, 1)).total_seconds()
               - 1349108492)
    return round(order + sign * seconds / decay_seconds, 7)
class TimeIndependentScoreProperty(ndb.ComputedProperty):
    """A quality score for an entity based purely on votes, ignoring age.

    Mirrors the algorithm Reddit uses for comments: it estimates, with 90%
    confidence, the "real" worth of an entity from its current votes, so
    the highest-quality content rises to the top regardless of when it was
    created.

    upvotes_name: The name of the count of upvotes on your model.
    downvotes_name: The name of the count of downvotes on your model.
    """

    def __init__(self, upvotes_name="upvotes", downvotes_name="downvotes",
                 **kwargs):
        # Bind the attribute names up front; ndb invokes the resulting
        # one-argument callable with the entity to compute the value.
        compute = functools.partial(
            wilson_confidence, upvotes_name, downvotes_name)
        super(TimeIndependentScoreProperty, self).__init__(compute, **kwargs)
class TimeDependentScoreProperty(ndb.ComputedProperty):
    """A "hotness" score for an entity, balancing quality against age.

    Think of the frontpage of HN or Reddit - in fact this uses the same
    algorithm as Reddit uses for submissions. Use it when you want churn:
    a constant stream of content where newness and vote quality both count.

    decay_seconds: seconds of age that offset one order of magnitude of votes.
    upvotes_name: The name of the count of upvotes on your model.
    downvotes_name: The name of the count of downvotes on your model.
    created_name: The name of the creation datetime on your model.
    """

    def __init__(self, decay_seconds, upvotes_name="upvotes",
                 downvotes_name="downvotes", created_name="created", **kwargs):
        # Freeze the configuration into a one-argument callable; ndb calls
        # it with the entity whenever the computed value is needed.
        compute = functools.partial(
            time_dependent, decay_seconds, upvotes_name, downvotes_name,
            created_name)
        super(TimeDependentScoreProperty, self).__init__(compute, **kwargs)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment