Created
June 10, 2022 19:38
-
-
Save quiiver/4699f09e2661d40685e5452e22aee81c to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import time | |
from math import log2, log | |
from typing import Any, Dict | |
# Scoring logic ported from CirrusSearch's QualityScore.php (pinned revision):
# https://github.com/wikimedia/mediawiki-extensions-CirrusSearch/blob/85572945a8f97829ac9c9a2c95bb97a40c24ac73/includes/BuildDocument/Completion/QualityScore.php
class Scorer():
    """Compute an integer suggestion weight for a page document.

    The weight combines a "quality" score (incoming links, external links,
    page size, headings, redirects) with a popularity score. Every
    component is normalized to the range 0..1, combined with the weights
    below, and finally scaled to an integer in 0..SCORE_RANGE.
    """

    # Incoming links are normalized against this fraction of the index size.
    INCOMING_LINKS_MAX_DOCS_FACTOR = 0.1

    # Values at or above these norms receive the maximum component score.
    EXTERNAL_LINKS_NORM = 20
    PAGE_SIZE_NORM = 50000
    HEADING_NORM = 20
    REDIRECT_NORM = 30

    # Relative weights of the quality components.
    INCOMING_LINKS_WEIGHT = 0.6
    EXTERNAL_LINKS_WEIGHT = 0.1
    PAGE_SIZE_WEIGHT = 0.1
    HEADING_WEIGHT = 0.2
    REDIRECT_WEIGHT = 0.1

    # Relative weights of the quality score and the popularity score.
    QSCORE_WEIGHT = 1
    POPULARITY_WEIGHT = 0.4
    # 0.04% of the total page views is the max we accept
    POPULARITY_MAX = 0.0004

    # The final 0..1 score is multiplied by this and truncated to an int.
    SCORE_RANGE = 10_000_000

    max_docs: int
    incoming_links_norm: int

    def __init__(self, max_docs: int) -> None:
        """Create a scorer for an index holding ``max_docs`` documents."""
        self.max_docs = max_docs
        # Very small indexes would otherwise produce a norm of 0 and a
        # division by zero when normalizing; force the norm to at least 1.
        self.incoming_links_norm = max(
            1, int(self.max_docs * self.INCOMING_LINKS_MAX_DOCS_FACTOR))

    def score(self, doc: Dict) -> int:
        """Return the weight of ``doc`` as an int in 0..SCORE_RANGE.

        Reads the keys ``incoming_links`` and ``text_bytes`` (numbers),
        ``external_links``, ``heading`` and ``redirect`` (sized
        collections) and ``popularity_score`` (number). Missing keys and
        keys holding ``None`` are treated as 0 / empty.
        """
        incoming_links = self._score_norm_log2(
            doc.get('incoming_links') or 0,
            self.incoming_links_norm)
        page_size = self._score_norm_log2(
            doc.get('text_bytes') or 0,
            self.PAGE_SIZE_NORM)
        external_links = self._score_norm(
            len(doc.get('external_links') or []),
            self.EXTERNAL_LINKS_NORM)
        heading = self._score_norm(
            len(doc.get('heading') or []),
            self.HEADING_NORM)
        redirect = self._score_norm(
            len(doc.get('redirect') or []),
            self.REDIRECT_NORM)

        score = incoming_links * self.INCOMING_LINKS_WEIGHT
        score += external_links * self.EXTERNAL_LINKS_WEIGHT
        score += page_size * self.PAGE_SIZE_WEIGHT
        score += heading * self.HEADING_WEIGHT
        score += redirect * self.REDIRECT_WEIGHT

        # We have a standardized composite score between 0 and 1
        score = score / (
            self.INCOMING_LINKS_WEIGHT + self.EXTERNAL_LINKS_WEIGHT
            + self.PAGE_SIZE_WEIGHT + self.HEADING_WEIGHT
            + self.REDIRECT_WEIGHT) * self.QSCORE_WEIGHT

        popularity = doc.get('popularity_score') or 0
        if popularity > self.POPULARITY_MAX:
            popularity = 1
        else:
            log_base = 1 + self.POPULARITY_MAX * self.max_docs
            # A logarithm with base 1 is undefined, so only rescale when
            # the base is strictly greater than 1.
            if log_base > 1:
                popularity = log(1 + (popularity * self.max_docs), log_base)
            else:
                popularity = 0

        score += popularity * self.POPULARITY_WEIGHT
        score /= self.QSCORE_WEIGHT + self.POPULARITY_WEIGHT
        return int(score * self.SCORE_RANGE)

    def _score_norm_log2(self, value: int, norm: int) -> float:
        """Return ``value`` normalized to 0..1 on a log2 scale.

        Values above ``norm`` are clamped to the maximum score of 1.
        """
        return log2(2 if value > norm else (value / norm) + 1)

    def _score_norm(self, value: int, norm: int) -> float:
        """Return ``value`` normalized linearly to 0..1, clamped at ``norm``."""
        return min(value, norm) / norm
class Builder():
    """Build suggestion documents from page documents.

    Every document produced by one ``Builder`` instance carries the same
    ``batch_id``, which is fixed when the builder is created.
    """

    scorer: Scorer
    batch_id: int

    def __init__(self, max_docs: int = 6_500_000) -> None:
        """Create a builder whose scorer is sized for ``max_docs`` documents."""
        # Microseconds since the epoch. Integer floor division keeps the
        # value exact; true division would round through a float first.
        self.batch_id = time.time_ns() // 1000
        self.scorer = Scorer(max_docs)

    def build(self, id: str, doc: Dict[str, Any]) -> Dict[str, Any]:
        """Return the suggestion document for ``doc``.

        Args:
            id: Identifier stored as ``doc_id`` in the result.
            doc: Source page document; ``title`` is read here and the
                whole mapping is passed to the scorer.

        Returns:
            A dict with ``batch_id``, ``doc_id``, ``title`` and two
            suggestion entries (``suggest`` and ``suggest-stop``), each
            holding the input strings and the computed weight.
        """
        title = doc.get("title", "")
        similars = []  # TODO
        inputs = [title]
        inputs.extend(similars)
        score = self.scorer.score(doc)
        return {
            "batch_id": self.batch_id,
            "doc_id": id,
            "title": title,
            "suggest": {
                # Each entry gets its own list so that mutating one entry
                # cannot silently change the other.
                "input": list(inputs),
                "weight": score,
            },
            "suggest-stop": {
                "input": list(inputs),
                "weight": score,
            },
        }
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment