Last active
May 6, 2018 05:15
-
-
Save pamelafox/417bc6265e411a24f4e3 to your computer and use it in GitHub Desktop.
Hotlist algorithm
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Scores for any type of entity. | |
This is used to rank entities, for example the list of top forks of a | |
scratchpad and the current "hottest" scratchpads. | |
""" | |
import math | |
import datetime | |
import functools | |
from google.appengine.ext import ndb | |
__all__ = ["TimeIndependentScoreProperty", "TimeDependentScoreProperty"] | |
def epoch_seconds(date): | |
"""Returns the number of seconds from the epoch to date.""" | |
epoch = datetime.datetime(1970, 1, 1) | |
return (date - epoch).total_seconds() | |
def wilson_confidence(upvotes_name, downvotes_name, score): | |
"""Lower bound of Wilson score 90% confidence interval. | |
This is the algorithm Reddit uses to sort comments. | |
You should not use this if downvotes are disallowed - it is only useful in | |
the presence of both upvotes and downvotes because its ranking is based on | |
an estimate of the ratio of upvotes to downvotes. | |
See http://www.evanmiller.org/how-not-to-sort-by-average-rating.html | |
""" | |
upvotes = getattr(score, upvotes_name) | |
downvotes = getattr(score, downvotes_name) | |
if upvotes == 0: | |
return -downvotes | |
elif upvotes == downvotes: | |
return 0 | |
n = upvotes + downvotes | |
z = 1.64485 # 90% confidence z-score | |
phat = float(upvotes) / n # p-hat | |
return ((phat + z * z / (2 * n) - z * | |
math.sqrt((phat * (1 - phat) + z * z / (4 * n)) / n)) | |
/ (1 + z * z / n)) | |
def time_dependent(decay_seconds, upvotes_name, downvotes_name, created_name, | |
score): | |
"""Ranking based on both age and quality. | |
This is the algorithm Reddit uses to sort stories. We want there to be | |
churn, a constant stream of new stories hitting the top scratchpads page, | |
so this algorithm takes into account both the score of the scratchpad as | |
well as the age. | |
See http://amix.dk/blog/post/19588 | |
""" | |
s = getattr(score, upvotes_name) - getattr(score, downvotes_name) | |
# Weight votes logarithmically - initial votes are worth a ton | |
order = math.log(max(abs(s), 1), 10) | |
# signum | |
sign = 1 if s > 0 else -1 if s < 0 else 0 | |
# Seconds since this algorithm's start date (when I was writing this | |
# code) | |
date = getattr(score, created_name) or datetime.datetime.now() | |
seconds = epoch_seconds(date) - 1349108492 | |
return round(order + sign * seconds / decay_seconds, 7) | |
class TimeIndependentScoreProperty(ndb.ComputedProperty): | |
"""A score of the value of an entity, based solely on votes, not on age. | |
This uses the same algorithm as Reddit uses for comments - designed to | |
promote the highest quality to the top independent of age. It estimates the | |
"real" value of an entity, with 90% confidence, based on the current votes. | |
upvotes_name: The name of the count of upvotes on your model. | |
downvotes_name: The name of the count of downvotes on your model. | |
""" | |
def __init__(self, upvotes_name="upvotes", downvotes_name="downvotes", | |
**kwargs): | |
super(TimeIndependentScoreProperty, self).__init__( | |
functools.partial(wilson_confidence, upvotes_name, downvotes_name), | |
**kwargs) | |
class TimeDependentScoreProperty(ndb.ComputedProperty): | |
"""A score of the "hotness" of an entity. | |
Think of the frontpage of HN or Reddit - in fact it uses the same algorithm | |
as Reddit uses for submissions. Use this when you are looking for churn - a | |
constant stream of content balancing newness and quality. | |
upvotes_name: The name of the count of upvotes on your model. | |
downvotes_name: The name of the count of downvotes on your model. | |
created_name: The name of the creation datetime on your model. | |
""" | |
def __init__(self, decay_seconds, upvotes_name="upvotes", | |
downvotes_name="downvotes", created_name="created", **kwargs): | |
super(TimeDependentScoreProperty, self).__init__(functools.partial( | |
time_dependent, decay_seconds, upvotes_name, downvotes_name, | |
created_name), **kwargs) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment