Skip to content

Instantly share code, notes, and snippets.

@spider-man-tm
Created December 9, 2021 04:21
Show Gist options
  • Save spider-man-tm/96125f23b301fb53ceeebd8c7f399556 to your computer and use it in GitHub Desktop.
Save spider-man-tm/96125f23b301fb53ceeebd8c7f399556 to your computer and use it in GitHub Desktop.
日本語テキストデータの鉄板前処理と形態素解析
import re
import MeCab
import neologdn
def normalize_text(text: str) -> str:
    """Normalize text with neologdn, then strip URLs with a regular expression.

    Example:
        1. Input:
           スイーツを食べに行ったよん♪        ちょーーーーーーーおいしかった💌 https://npb-visualization.com/
        2. After normalization:
           スイーツを食べに行ったよん♪ちょーおいしかった💌https://npb-visualization.com/
        3. After URL removal:
           スイーツを食べに行ったよん♪ちょーおいしかった💌

    Args:
        text: Raw input text.

    Returns:
        The normalized text with http/https URLs removed.
    """
    text = neologdn.normalize(text)
    # re.ASCII restricts \w to [A-Za-z0-9_]. Without it, \w also matches
    # Japanese characters, so any Japanese text directly following a URL
    # (normalization can drop the separating whitespace, as the example above
    # shows) would be deleted together with the URL.
    return re.sub(
        r'(http|https)://([-\w]+\.)+[-\w]+(/[-\w./?%&=]*)?',
        '',
        text,
        flags=re.ASCII,
    )
def tokenize(text: str) -> list[str]:
    """Run MeCab morphological analysis and keep tokens of selected categories.

    The text is first passed through ``normalize_text`` and then parsed with a
    fresh ``MeCab.Tagger``. Every output line of the form
    ``surface<TAB>features`` is inspected: a token is kept when the second
    comma-separated feature is in the pick-up set, and the seventh feature is
    what gets collected, unless it is one of a few placeholder/symbol values.

    NOTE(review): feature positions 1 and 6 look like the POS subcategory and
    base form of IPAdic-style output -- confirm against the dictionary in use.

    Args:
        text: Raw input text.

    Returns:
        The seventh feature field of each selected token, in parse order.
    """
    pick_up = {'一般', '自立', '固有名詞', '形容動詞語幹', 'サ変接続', '助詞類接続'}
    ignored = {'*', '・', '…', '()'}

    mecab = MeCab.Tagger()
    parsed = mecab.parse(normalize_text(text))

    words = []
    for line in parsed.split('\n'):
        fields = line.split('\t')
        if len(fields) != 2:
            # Lines without exactly one tab (such as a trailing end-of-sentence
            # marker or an empty line) carry no feature string.
            continue
        info = fields[1].split(',')
        if len(info) < 7:
            # Some dictionaries emit fewer feature fields for certain tokens;
            # skip them instead of raising IndexError on info[6].
            continue
        if info[1] in pick_up and info[6] not in ignored:
            words.append(info[6])
    return words
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment