A Lunr.py Builder that uses jieba to tokenize Chinese text.
"""Jieba Builder

A Lunr.py Builder that uses jieba to tokenize Chinese text.

LemonPrefect <me@lemonprefect.cn>
"""
from collections import defaultdict

import jieba

from lunr.builder import Builder
from lunr.field_ref import FieldRef
from lunr.token import Token


class JiebaBuilder(Builder):
    """Builder that indexes Chinese text by tokenizing with jieba.

    Only ``add`` is overridden: it mirrors ``lunr.builder.Builder.add``
    but runs field values through ``JiebaTokenizer`` instead of lunr's
    default whitespace-based tokenizer.
    """

    def add(self, doc, attributes=None):
        doc_ref = str(doc[self._ref])
        self._documents[doc_ref] = attributes or {}
        self.document_count += 1

        for field_name, field in self._fields.items():
            extractor = field.extractor
            field_value = doc[field_name] if extractor is None else extractor(doc)
            # The one change from the stock Builder: tokenize with jieba.
            tokens = JiebaTokenizer(field_value)
            terms = self.pipeline.run(tokens, field_name)
            field_ref = FieldRef(doc_ref, field_name)
            field_terms = defaultdict(int)

            # TODO: field_refs are casted to strings in JS, should we allow
            # FieldRef as keys?
            self.field_term_frequencies[str(field_ref)] = field_terms
            self.field_lengths[str(field_ref)] = len(terms)

            for term in terms:
                # TODO: term is a Token, should we allow Tokens as keys?
                term_key = str(term)

                field_terms[term_key] += 1
                if term_key not in self.inverted_index:
                    posting = {_field_name: {} for _field_name in self._fields}
                    posting["_index"] = self.term_index
                    self.term_index += 1
                    self.inverted_index[term_key] = posting

                if doc_ref not in self.inverted_index[term_key][field_name]:
                    self.inverted_index[term_key][field_name][doc_ref] = defaultdict(list)

                for metadata_key in self.metadata_whitelist:
                    metadata = term.metadata[metadata_key]
                    self.inverted_index[term_key][field_name][doc_ref][metadata_key].append(metadata)
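
# For reference, jieba.tokenize in search mode yields (word, start, end)
# tuples; JiebaTokenizer below turns each into a lunr Token whose metadata
# records the start offset and word length. Rough illustration, following
# the example from jieba's README (output indicative, not reproduced here):
#
#   >>> list(jieba.tokenize("永和服装饰品有限公司", mode="search"))
#   [('永和', 0, 2), ('服装', 2, 4), ('饰品', 4, 6),
#    ('有限', 6, 8), ('公司', 8, 10), ('有限公司', 6, 10)]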

def JiebaTokenizer(obj):
    # Accept a single string, or a list/tuple of strings joined with spaces.
    if isinstance(obj, (list, tuple)):
        obj = " ".join(obj)
    tokens = [
        Token(word, {"position": [start, len(word)], "index": index})
        for index, (word, start, _end) in enumerate(jieba.tokenize(obj, mode="search"))
    ]
    # Drop whitespace-only tokens (e.g. the separators introduced above).
    return [token for token in tokens if str(token).strip()]
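
A minimal usage sketch, assuming lunr.py's standard Builder API (ref(), field(), add(), build()); the field names and documents below are made up for illustration and the snippet is untested:

builder = JiebaBuilder()
builder.ref("id")
builder.field("title")
builder.field("body")
# builder.metadata_whitelist = ["position"]  # optional: keep offsets for highlighting

for doc in [
    {"id": "1", "title": "全文搜索", "body": "使用 jieba 对中文文本进行分词。"},
    {"id": "2", "title": "倒排索引", "body": "Lunr.py 在构建阶段生成倒排索引。"},
]:
    builder.add(doc)

index = builder.build()
print(index.search("分词"))

Note that a bare Builder starts with an empty pipeline, so no English stemming or stop-word filtering is applied, which is usually what you want for Chinese fields.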