Skip to content

Instantly share code, notes, and snippets.

@wfng92
Last active March 19, 2020 07:48
Show Gist options
  • Save wfng92/831b47df29de687c8ea3264ffb9134ee to your computer and use it in GitHub Desktop.
import re
from typing import Any, Dict, List, Text
from rasa.nlu.components import Component
from rasa.nlu.config import RasaNLUModelConfig
from rasa.nlu.tokenizers import Token, Tokenizer
from rasa.nlu.training_data import Message, TrainingData
class JapaneseTokenizer(Tokenizer, Component):
    """Rasa NLU tokenizer component for Japanese text, backed by SudachiPy.

    Splits message text into ``Token`` objects (surface form plus character
    offset) and attaches them to training examples and incoming messages
    under the ``tokens`` attribute.
    """

    # Message attribute this component provides to the NLU pipeline.
    provides = ["tokens"]

    def __init__(self, component_config: Dict[Text, Any] = None) -> None:
        """Initialize the Sudachi tokenizer.

        Args:
            component_config: Optional component configuration supplied by
                the Rasa pipeline definition.
        """
        super(JapaneseTokenizer, self).__init__(component_config)

        # Imported lazily so this module can be imported even when the
        # optional sudachipy dependency is not installed.
        from sudachipy import dictionary
        from sudachipy import tokenizer

        self.tokenizer_obj = dictionary.Dictionary().create()
        # SplitMode.A selects the shortest (finest-grained) split units.
        self.mode = tokenizer.Tokenizer.SplitMode.A

    @classmethod
    def required_packages(cls) -> List[Text]:
        """Return the extra packages required to run this component."""
        return ["sudachipy"]

    def train(
        self, training_data: TrainingData, config: RasaNLUModelConfig, **kwargs: Any
    ) -> None:
        """Tokenize every training example and store the tokens on it."""
        for example in training_data.training_examples:
            # Fixed: tokenize is now a normal instance method, so it is
            # called as self.tokenize(text) rather than self.tokenize(self, text).
            example.set("tokens", self.tokenize(example.text))

    def process(self, message: Message, **kwargs: Any) -> None:
        """Tokenize an incoming message at inference time."""
        message.set("tokens", self.tokenize(message.text))

    def tokenize(self, text: Text) -> List[Token]:
        """Split ``text`` into ``Token`` objects with character offsets.

        Note: this was previously declared ``@staticmethod`` while taking an
        explicit ``self`` parameter — an idiom error. It reads instance state
        (``self.tokenizer_obj``, ``self.mode``), so it is an instance method.

        Args:
            text: The raw message text to tokenize.

        Returns:
            A list of ``Token`` objects, one per Sudachi morpheme.
        """
        words = [m.surface() for m in self.tokenizer_obj.tokenize(text, self.mode)]

        running_offset = 0
        tokens = []
        for word in words:
            # Search from the running offset so repeated surface forms map
            # to successive occurrences instead of always the first one.
            word_offset = text.index(word, running_offset)
            running_offset = word_offset + len(word)
            tokens.append(Token(word, word_offset))
        return tokens
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment