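"""Custom Rasa NLU tokenizer that segments text with SudachiPy (Japanese).

A sketch of wiring Sudachi into a Rasa 1.x NLU pipeline. Assuming this module
is importable as ``sudachi_tokenizer`` (the file name is illustrative, not
part of the original gist), it can be referenced in ``config.yml`` like:

    language: ja
    pipeline:
      - name: "sudachi_tokenizer.SudachiTokenizer"
      - name: "CountVectorsFeaturizer"
      - name: "EmbeddingIntentClassifier"

The downstream components above are placeholders; any featurizer/classifier
combination that consumes tokens should work.
"""
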
from typing import Any, Dict, List, Text

from rasa.nlu.config import RasaNLUModelConfig
from rasa.nlu.tokenizers.tokenizer import Token, Tokenizer
from rasa.nlu.training_data import Message, TrainingData
from rasa.nlu.constants import (
    TEXT_ATTRIBUTE,
    TOKENS_NAMES,
    MESSAGE_ATTRIBUTES,
)


class SudachiTokenizer(Tokenizer):

    provides = [TOKENS_NAMES[attribute] for attribute in MESSAGE_ATTRIBUTES]

    defaults = {
        # Flag to check whether to split intents
        "intent_tokenization_flag": False,
        # Symbol on which intent should be split
        "intent_split_symbol": "_",
        # Add a __CLS__ token to the end of the list of tokens
        "use_cls_token": False,
    }

    def __init__(self, component_config: Dict[Text, Any] = None) -> None:
        super().__init__(component_config)

        # Imported lazily so the class can be declared without sudachipy
        # installed; required_packages() reports the dependency to Rasa.
        from sudachipy import dictionary
        from sudachipy import tokenizer

        # No Sudachi settings are passed, so the default (no custom)
        # dictionary is loaded.
        self.tokenizer_obj = dictionary.Dictionary().create()
        # SplitMode.A produces the shortest units Sudachi supports.
        self.mode = tokenizer.Tokenizer.SplitMode.A

    @classmethod
    def required_packages(cls) -> List[Text]:
        return ["sudachipy"]

    def train(
        self, training_data: TrainingData, config: RasaNLUModelConfig, **kwargs: Any
    ) -> None:
        # Tokenize every message attribute that is present on each
        # training example.
        for example in training_data.training_examples:
            for attribute in MESSAGE_ATTRIBUTES:
                if example.get(attribute) is not None:
                    example.set(
                        TOKENS_NAMES[attribute],
                        self.tokenize(example.get(attribute), attribute),
                    )

    def process(self, message: Message, **kwargs: Any) -> None:
        message.set(
            TOKENS_NAMES[TEXT_ATTRIBUTE], self.tokenize(message.text, TEXT_ATTRIBUTE)
        )

    def tokenize(self, text: Text, attribute: Text = TEXT_ATTRIBUTE) -> List[Token]:
        # Sudachi returns morphemes whose surface forms are exact substrings
        # of `text`, so character offsets can be recovered with a running
        # index search.
        words = [m.surface() for m in self.tokenizer_obj.tokenize(text, self.mode)]

        running_offset = 0
        tokens = []
        for word in words:
            word_offset = text.index(word, running_offset)
            word_len = len(word)
            running_offset = word_offset + word_len
            tokens.append(Token(word, word_offset))
        self.add_cls_token(tokens, attribute)
        return tokens
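

if __name__ == "__main__":
    # Minimal smoke test: a sketch assuming `rasa` 1.x, `sudachipy`, and a
    # Sudachi dictionary (e.g. `sudachidict_core`) are installed. The sample
    # sentence is illustrative.
    sudachi_tokenizer = SudachiTokenizer()
    for token in sudachi_tokenizer.tokenize("私は昨日東京へ行きました"):
        print(token.text, token.offset)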