Moses tokenizer with spans, built upon the Python sacremoses port.
"""
Class that inherits MosesTokenizer and adds a method which returns the spans. Kinda flaky with the escape, unescape,
detokenize situation, so watch out!
"""
from sacremoses import MosesTokenizer, MosesDetokenizer
class MosesTokenizerSpans(MosesTokenizer):
def __init__(self, lang="en", custom_nonbreaking_prefixes_file=None):
MosesTokenizer.__init__(self, lang=lang,
custom_nonbreaking_prefixes_file=custom_nonbreaking_prefixes_file)
self.lang = lang
    def span_tokenize(
        self,
        text,
        aggressive_dash_splits=False,
        escape=True,
        protected_patterns=None,
    ):
        # Span recovery strategy: https://stackoverflow.com/a/35634472
        detokenizer = MosesDetokenizer(lang=self.lang)
        tokens = self.tokenize(
            text=text,
            aggressive_dash_splits=aggressive_dash_splits,
            return_str=False,
            escape=escape,
            protected_patterns=protected_patterns,
        )
        tail = text
        accum = 0
        tokens_spans = []
        for token in tokens:
            # Detokenize (and, if we escaped, unescape) each token so it can be
            # located verbatim in the original text.
            detokenized_token = detokenizer.detokenize(
                tokens=[token], return_str=True, unescape=escape
            )
            escaped_token = re.escape(detokenized_token)
            m = re.search(escaped_token, tail)
            tok_start_pos, tok_end_pos = m.span()
            # Offsets are relative to `tail`, so shift them by what was consumed so far.
            sent_start_pos = accum + tok_start_pos
            sent_end_pos = accum + tok_end_pos
            accum += tok_end_pos
            tail = tail[tok_end_pos:]
            tokens_spans.append((detokenized_token, (sent_start_pos, sent_end_pos)))
        return tokens_spans
# moses_tokenizer = MosesTokenizerSpans(lang="fr")
# print(moses_tokenizer.span_tokenize("Le chat noir."))
# [('Le', (0, 2)), ('chat', (3, 7)), ('noir', (8, 12)), ('.', (12, 13))]
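
# A hedged sketch of the escape caveat flagged in the module docstring: with the
# default escape=True, sacremoses rewrites characters such as "&" to "&amp;"
# during tokenization, so the spans below are only recoverable because each token
# is detokenized (and unescaped) before being searched for in the original text.
# moses_tokenizer = MosesTokenizerSpans(lang="en")
# print(moses_tokenizer.span_tokenize("A & B"))
# Expected, assuming sacremoses' default escaping behavior:
# [('A', (0, 1)), ('&', (2, 3)), ('B', (4, 5))]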
def build_moses_tokenizer(
    tokenizer: MosesTokenizerSpans,
    normalizer: MosesPunctNormalizer = None,
) -> Callable[[str], List[Token]]:
    """
    Wrap a sacremoses tokenizer (with spans) to build a tokenizer for the flair Sentence class.
    :param tokenizer: a MosesTokenizerSpans tokenizer
    :param normalizer: an optional MosesPunctNormalizer applied before tokenization
    :return: a tokenizer function to pass to the Sentence class constructor
    """
    try:
        from sacremoses import MosesTokenizer
        from sacremoses import MosesPunctNormalizer
    except ImportError:
        raise ImportError(
            "Please install sacremoses before using the Moses tokenizer; "
            "otherwise, use segtok_tokenizer as an advanced tokenizer."
        )
    moses_tokenizer: MosesTokenizerSpans = tokenizer

    def tokenizer(text: str) -> List[Token]:
        if normalizer:
            text = normalizer.normalize(text=text)
        doc = moses_tokenizer.span_tokenize(text=text, escape=False)
        previous_token = None
        tokens: List[Token] = []
        for word, (start_pos, end_pos) in doc:
            token = Token(
                text=word, start_position=start_pos, whitespace_after=True
            )
            tokens.append(token)
            # If the current token starts exactly where the previous one ends,
            # there is no whitespace between them.
            if (previous_token is not None) and (
                token.start_pos
                == previous_token.start_pos + len(previous_token.text)
            ):
                previous_token.whitespace_after = False
            previous_token = token
        return tokens

    return tokenizer
def flair_moses_tokenizer():
    moses_tokenizer = MosesTokenizerSpans(lang="fr")
    moses_tokenizer = build_moses_tokenizer(tokenizer=moses_tokenizer)
    return moses_tokenizer
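
# A minimal usage sketch, assuming a flair version from around this gist's era
# (circa 0.4/0.5) whose Sentence constructor accepts a tokenizer callable via
# the use_tokenizer argument:
# from flair.data import Sentence
# sentence = Sentence("Le chat noir.", use_tokenizer=flair_moses_tokenizer())
# print([(token.text, token.start_pos) for token in sentence])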