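"""Custom Rasa NLU tokenizer that segments text with SudachiPy (Japanese).

A sketch of wiring Sudachi into a Rasa 1.x NLU pipeline. Assuming this module
is importable as ``sudachi_tokenizer`` (the file name is illustrative, not
part of the original gist), it can be referenced in ``config.yml`` like:

    language: ja
    pipeline:
      - name: "sudachi_tokenizer.SudachiTokenizer"
      - name: "CountVectorsFeaturizer"
      - name: "EmbeddingIntentClassifier"

The downstream components above are placeholders; any featurizer/classifier
combination that consumes tokens should work.
"""
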
from typing import Any, Dict, List, Text

from rasa.nlu.config import RasaNLUModelConfig
from rasa.nlu.tokenizers.tokenizer import Token, Tokenizer
from rasa.nlu.training_data import Message, TrainingData
from rasa.nlu.constants import (
    TEXT_ATTRIBUTE,
    TOKENS_NAMES,
    MESSAGE_ATTRIBUTES,
)


class SudachiTokenizer(Tokenizer):

    provides = [TOKENS_NAMES[attribute] for attribute in MESSAGE_ATTRIBUTES]

    defaults = {
        # Flag to check whether to split intents
        "intent_tokenization_flag": False,
        # Symbol on which intent should be split
        "intent_split_symbol": "_",
        # Add a __CLS__ token to the end of the list of tokens
        "use_cls_token": False,
    }

    def __init__(self, component_config: Dict[Text, Any] = None) -> None:
        super().__init__(component_config)

        # Imported lazily so the class can be declared without sudachipy
        # installed; required_packages() reports the dependency to Rasa.
        from sudachipy import dictionary
        from sudachipy import tokenizer

        # No Sudachi settings are passed, so the default (no custom)
        # dictionary is loaded.
        self.tokenizer_obj = dictionary.Dictionary().create()
        # SplitMode.A produces the shortest units Sudachi supports.
        self.mode = tokenizer.Tokenizer.SplitMode.A

    @classmethod
    def required_packages(cls) -> List[Text]:
        return ["sudachipy"]

    def train(
        self, training_data: TrainingData, config: RasaNLUModelConfig, **kwargs: Any
    ) -> None:
        # Tokenize every message attribute that is present on each
        # training example.
        for example in training_data.training_examples:
            for attribute in MESSAGE_ATTRIBUTES:
                if example.get(attribute) is not None:
                    example.set(
                        TOKENS_NAMES[attribute],
                        self.tokenize(example.get(attribute), attribute),
                    )

    def process(self, message: Message, **kwargs: Any) -> None:
        message.set(
            TOKENS_NAMES[TEXT_ATTRIBUTE], self.tokenize(message.text, TEXT_ATTRIBUTE)
        )

    def tokenize(self, text: Text, attribute: Text = TEXT_ATTRIBUTE) -> List[Token]:
        # Sudachi returns morphemes whose surface forms are exact substrings
        # of `text`, so character offsets can be recovered with a running
        # index search.
        words = [m.surface() for m in self.tokenizer_obj.tokenize(text, self.mode)]

        running_offset = 0
        tokens = []
        for word in words:
            word_offset = text.index(word, running_offset)
            word_len = len(word)
            running_offset = word_offset + word_len
            tokens.append(Token(word, word_offset))
        self.add_cls_token(tokens, attribute)
        return tokens
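

if __name__ == "__main__":
    # Minimal smoke test: a sketch assuming `rasa` 1.x, `sudachipy`, and a
    # Sudachi dictionary (e.g. `sudachidict_core`) are installed. The sample
    # sentence is illustrative.
    sudachi_tokenizer = SudachiTokenizer()
    for token in sudachi_tokenizer.tokenize("私は昨日東京へ行きました"):
        print(token.text, token.offset)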