Created
December 9, 2021 04:21
-
-
Save spider-man-tm/96125f23b301fb53ceeebd8c7f399556 to your computer and use it in GitHub Desktop.
日本語テキストデータの鉄板前処理と形態素解析
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
import MeCab | |
import neologdn | |
def normalize_text(text: str) -> str:
    """Normalize text with neologdn, then remove http/https URLs.

    Example:
        1. Input:
           スイーツを食べに行ったよん♪ ちょーーーーーーーおいしかった💌 https://npb-visualization.com/
        2. After normalization:
           スイーツを食べに行ったよん♪ちょーおいしかった💌https://npb-visualization.com/
        3. After URL removal:
           スイーツを食べに行ったよん♪ちょーおいしかった💌

    Args:
        text: Raw input text.

    Returns:
        The normalized text with every matched URL replaced by an empty
        string.
    """
    text = neologdn.normalize(text)
    # re.ASCII restricts \w to [A-Za-z0-9_]. Without it, \w also matches
    # kana/kanji, and because normalization can leave a URL directly adjacent
    # to Japanese text (see step 2 above), the words following a URL would be
    # swallowed by the match and deleted together with it.
    # Trade-off: host names written in non-ASCII characters are not matched.
    return re.sub(
        r'(http|https)://([-\w]+\.)+[-\w]+(/[-\w./?%&=]*)?',
        '',
        text,
        flags=re.ASCII,
    )
def tokenize(text: str) -> list[str]:
    """Tokenize text with MeCab and keep only tokens of selected categories.

    The text is first passed through ``normalize_text``. Each line of the
    MeCab output is expected to look like ``surface<TAB>f0,f1,...``; lines
    that do not split into exactly two tab-separated parts (such as the
    trailing ``EOS`` marker and the empty last line) are ignored.

    A token is kept when its second feature field (``f1``) is one of the
    values in ``pick_up`` and its seventh feature field (``f6``) is not one
    of the placeholder/symbol values in ``ignored``. The value collected is
    ``f6``.
    NOTE(review): under an IPAdic-style dictionary ``f1`` is presumably the
    part-of-speech subcategory and ``f6`` the base form -- confirm against
    the dictionary actually installed.

    Args:
        text: Raw input text.

    Returns:
        The collected ``f6`` values, in order of appearance.
    """
    pick_up = {'一般', '自立', '固有名詞', '形容動詞語幹', 'サ変接続', '助詞類接続'}
    ignored = {'*', '・', '…', '()'}

    # NOTE(review): a new Tagger is constructed on every call; consider
    # reusing a single instance if this function is called in a loop.
    mecab = MeCab.Tagger()
    parsed = mecab.parse(normalize_text(text))

    words = []
    for line in parsed.split('\n'):
        features = line.split('\t')
        if len(features) != 2:
            continue
        info = features[1].split(',')
        # Some tokens/dictionaries may yield fewer than seven feature fields;
        # skip them instead of raising IndexError on info[6].
        if len(info) < 7:
            continue
        if info[1] in pick_up and info[6] not in ignored:
            words.append(info[6])
    return words
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment