Created
May 1, 2019 00:45
-
-
Save nogawanogawa/6e4fedb6a83652f63606475109ba1aa1 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json | |
import sudachipy | |
from sudachipy import tokenizer, dictionary | |
class SudachiAnalizer():
    """Extract nouns, verbs and adjectives from text using SudachiPy.

    The tokenizer is built lazily on the first non-empty call to
    ``get_token`` and then reused by later calls on the same instance, so
    the settings file is read and the dictionary constructed only once per
    instance.

    NOTE(review): the cached tokenizer is shared by every call made through
    one instance; whether a SudachiPy tokenizer may be used from several
    threads at once is not visible here -- confirm before sharing an
    instance across threads.
    """

    # Leading part-of-speech tags (noun, verb, adjective) that are kept.
    _TARGET_POS = frozenset(["名詞", "動詞", "形容詞"])

    def _get_tokenizer(self):
        """Return this instance's tokenizer, creating it on first use."""
        tokenizer_obj = getattr(self, "_tokenizer_obj", None)
        if tokenizer_obj is None:
            with open(sudachipy.config.SETTINGFILE, "r", encoding="utf-8") as f:
                settings = json.load(f)
            tokenizer_obj = dictionary.Dictionary(settings).create()
            self._tokenizer_obj = tokenizer_obj
        return tokenizer_obj

    def get_token(self, source):
        """Return the dictionary forms of the content words in *source*.

        *source* is split with ``SplitMode.C``. Each non-empty surface is
        tokenized again on its own, the normalized form of its first token
        is tokenized once more, and the dictionary form of that first token
        is kept when its leading part-of-speech tag is a noun, verb or
        adjective.

        Args:
            source: Text to analyse. An empty (or ``None``) value yields an
                empty list without loading the dictionary.

        Returns:
            A list of dictionary-form strings, in order of appearance.

        Raises:
            OSError: If the SudachiPy settings file cannot be opened.
            json.JSONDecodeError: If the settings file is not valid JSON.
        """
        if not source:
            # Nothing to tokenize; avoid building the dictionary for nothing.
            return []

        tokenizer_obj = self._get_tokenizer()
        mode = tokenizer.Tokenizer.SplitMode.C

        # Collect every surface string before calling the tokenizer again,
        # so the first pass is fully consumed before later passes start.
        surfaces = [m.surface() for m in tokenizer_obj.tokenize(mode, source)]

        word_list = []
        for surface in surfaces:
            if not surface:
                continue

            surface_tokens = tokenizer_obj.tokenize(mode, surface)
            if not surface_tokens:
                # Nothing came back for this surface; skip it rather than
                # fail on the [0] lookup below.
                continue
            norm_word = surface_tokens[0].normalized_form()

            # Tokenize the normalized form once and reuse the first token
            # for both the part-of-speech check and the dictionary form.
            # NOTE(review): re-tokenizing in isolation drops the sentence
            # context of the first pass; reading these values directly from
            # the first-pass morphemes may be preferable -- confirm intent.
            norm_tokens = tokenizer_obj.tokenize(mode, norm_word)
            if not norm_tokens:
                continue
            head = norm_tokens[0]

            # Keep the word only when its normalized form has one of the
            # target parts of speech.
            if head.part_of_speech()[0] in self._TARGET_POS:
                word_list.append(head.dictionary_form())
        return word_list
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment