Skip to content

Instantly share code, notes, and snippets.

@ikuyamada
Last active June 28, 2021 07:53
Show Gist options
  • Save ikuyamada/1bf17a00a9afadb72327907f93c4c469 to your computer and use it in GitHub Desktop.
Save ikuyamada/1bf17a00a9afadb72327907f93c4c469 to your computer and use it in GitHub Desktop.
AllenNLPのMeCabトークナイザ
from allennlp.data.tokenizers.token_class import Token
from allennlp.data.tokenizers.tokenizer import Tokenizer
from fugashi import Tagger
@Tokenizer.register("mecab")
class MecabTokenizer(Tokenizer):
    """AllenNLP tokenizer that segments text with MeCab through fugashi.

    Registered with ``Tokenizer`` under the name ``"mecab"``.
    """

    def __init__(self) -> None:
        # Build the fugashi Tagger once; every tokenize() call reuses it.
        self._tagger = Tagger()

    def tokenize(self, text: str):
        """Analyze ``text`` with MeCab and return the resulting tokens.

        Returns a list with one ``Token`` per word yielded by the tagger, in
        the order the tagger produced them. Each token carries the word's
        surface form as ``text`` and its part of speech
        (``word.feature.pos1``) as ``pos_``.
        """
        return [
            Token(text=morpheme.surface, pos_=morpheme.feature.pos1)
            for morpheme in self._tagger(text)
        ]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment