Last active
July 16, 2017 01:24
-
-
Save shigeomi-takada/2da0bf2f517e735b5eeb57d16a050927 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
import re | |
import os | |
import MeCab | |
import unicodedata | |
class Wakati():
    '''
    Japanese word segmentation (wakati-gaki) backed by MeCab.

    The mecab-ipadic-neologd dictionary is used for proper nouns and other
    named entities.  Before tokenizing, text is normalized following the
    rules published on the neologd wiki:
    https://github.com/neologd/mecab-ipadic-neologd/wiki/Regexp.ja
    '''

    # NOTE: every character whose identity depends on its width (fullwidth /
    # halfwidth forms) is spelled as a \uXXXX escape on purpose.  Written
    # literally, such characters are easily folded to their ASCII / standard
    # form by copy-paste or text-normalizing tools, which silently turns the
    # rules into no-ops or into invalid ranges that make `re` raise.

    # Characters that are NFKC-folded first: fullwidth alphanumerics become
    # ASCII, halfwidth kana/punctuation become their standard-width form.
    _WIDE_ALNUM_NARROW_KANA = (
        '\uFF10-\uFF19'  # FULLWIDTH DIGIT ZERO..NINE
        '\uFF21-\uFF3A'  # FULLWIDTH LATIN CAPITAL LETTER A..Z
        '\uFF41-\uFF5A'  # FULLWIDTH LATIN SMALL LETTER A..Z
        '\uFF61-\uFF9F'  # HALFWIDTH CJK punctuation and katakana
    )

    # Hyphen look-alikes, all rewritten to ASCII '-'.
    _HYPHENS = re.compile(
        '[\u02D7\u058A\u2010\u2011\u2012\u2013\u2043\u207B\u208B\u2212]+')

    # Long-vowel-mark look-alikes, each run rewritten to one U+30FC.
    _CHOONPUS = re.compile(
        '[\uFE63\uFF0D\uFF70\u2014\u2015\u2500\u2501\u30FC]+')
    _CHOONPU = '\u30FC'

    # Tilde / wave-dash variants, which are dropped entirely.
    _TILDES = re.compile('[~\u223C\u223E\u301C\u3030\uFF5E]')

    # Runs of ASCII and ideographic (U+3000) spaces.
    _SPACE_RUN = re.compile('[ \u3000]+')

    # ASCII symbols and halfwidth CJK punctuation -> fullwidth counterparts.
    # str.maketrans raises ValueError if the two strings ever get out of step.
    _TO_FULLWIDTH = str.maketrans(
        '!"#$%&\'()*+,-./'
        ':;<=>?@'
        '[\u00A5]^_`'
        '{|}~'
        '\uFF61\uFF64\uFF65\uFF62\uFF63',
        '\uFF01\u201D\uFF03\uFF04\uFF05\uFF06\u2019\uFF08\uFF09\uFF0A'
        '\uFF0B\uFF0C\uFF0D\uFF0E\uFF0F'
        '\uFF1A\uFF1B\uFF1C\uFF1D\uFF1E\uFF1F\uFF20'
        '\uFF3B\uFFE5\uFF3D\uFF3E\uFF3F\uFF40'
        '\uFF5B\uFF5C\uFF5D\u301C'
        '\u3002\u3001\u30FB\u300C\u300D',
    )

    # Fullwidth symbols folded back to ASCII at the end.  U+FF1D (fullwidth
    # equals sign) and the CJK punctuation U+3002 U+3001 U+30FB U+300C U+300D
    # are deliberately absent so that they stay as they are.
    _FULLWIDTH_SYMBOLS = (
        '\uFF01\u201D\uFF03\uFF04\uFF05\uFF06\u2019\uFF08\uFF09\uFF0A'
        '\uFF0B\uFF0C\uFF0D\uFF0E\uFF0F'
        '\uFF1A\uFF1B\uFF1C\uFF1E\uFF1F\uFF20'
        '\uFF3B\uFFE5\uFF3D\uFF3E\uFF3F\uFF40'
        '\uFF5B\uFF5C\uFF5D\u301C'
    )

    # Unicode blocks treated as "Japanese text" when removing spaces.
    _CJK_BLOCKS = ''.join((
        '\u4E00-\u9FFF',  # CJK UNIFIED IDEOGRAPHS
        '\u3040-\u309F',  # HIRAGANA
        '\u30A0-\u30FF',  # KATAKANA
        '\u3000-\u303F',  # CJK SYMBOLS AND PUNCTUATION
        '\uFF00-\uFFEF',  # HALFWIDTH AND FULLWIDTH FORMS
    ))
    _BASIC_LATIN = '\u0000-\u007F'

    def _unicode_normalize(self, cls, s):
        '''
        Apply NFKC normalization only to runs of characters in ``cls``.

        @param string cls body of a regex character class (no brackets)
        @param string s text to normalize
        @return string ``s`` with every run of ``cls`` characters NFKC-folded
                and every remaining FULLWIDTH HYPHEN-MINUS (U+FF0D) replaced
                by ASCII '-'
        '''
        run = re.compile('([{}]+)'.format(cls))
        s = run.sub(lambda m: unicodedata.normalize('NFKC', m.group(0)), s)
        return s.replace('\uFF0D', '-')

    def _remove_extra_spaces(self, s):
        '''
        Collapse space runs and drop spaces that touch Japanese text.

        Runs of ASCII / ideographic spaces become one ASCII space.  A single
        space is then removed when it sits between two CJK characters, or
        between a CJK character and a Basic Latin character (either order).
        Spaces between two Basic Latin characters are kept.

        @param string s text to clean up
        @return string text without the redundant spaces
        '''
        s = self._SPACE_RUN.sub(' ', s)
        neighbours = (
            (self._CJK_BLOCKS, self._CJK_BLOCKS),
            (self._CJK_BLOCKS, self._BASIC_LATIN),
            (self._BASIC_LATIN, self._CJK_BLOCKS),
        )
        for left, right in neighbours:
            gap = re.compile('([{}]) ([{}])'.format(left, right))
            # One pass consumes the right-hand character of each match, so
            # "A B C" needs another pass to close the second gap.
            while gap.search(s):
                s = gap.sub(r'\1\2', s)
        return s

    def _normalize_neologd(self, s):
        '''
        Normalize text the way mecab-ipadic-neologd expects its input.

        @param string s raw text
        @return string normalized text
        '''
        s = s.strip()
        s = self._unicode_normalize(self._WIDE_ALNUM_NARROW_KANA, s)
        s = self._HYPHENS.sub('-', s)  # normalize hyphens
        s = self._CHOONPUS.sub(self._CHOONPU, s)  # normalize choonpus
        s = self._TILDES.sub('', s)  # remove tildes
        # Widen symbols first so the space removal below sees them as part of
        # the fullwidth block; the ones that should be ASCII are narrowed
        # again right after.
        s = s.translate(self._TO_FULLWIDTH)
        s = self._remove_extra_spaces(s)
        s = self._unicode_normalize(self._FULLWIDTH_SYMBOLS, s)
        # The curly quotes have no NFKC mapping, so map them by hand.
        s = s.replace('\u2019', '\'')
        s = s.replace('\u201D', '"')
        return s

    def _base_forms(self, chasen_output):
        '''
        Pick the base forms of verbs and nouns out of ChaSen-format output.

        Each line is expected to be tab separated, for example
        'C言語\\tシーゲンゴ\\tC言語\\t名詞-固有名詞-一般\\t\\t': the third field
        is the base form and the fourth the part of speech.

        @param string chasen_output text returned by the tagger
        @return string selected base forms joined by single spaces
        '''
        kept = []
        for line in chasen_output.split('\n'):
            # Skip the end-of-sentence marker and blank lines.
            if line == 'EOS' or line == '':
                continue
            fields = line.split('\t')
            # A line without a part-of-speech column cannot be classified.
            if len(fields) < 4:
                continue
            if fields[3].startswith(('動詞', '名詞')):
                kept.append(fields[2])
        return ' '.join(kept)

    def parse(self, doc):
        '''
        Segment Japanese text into space-separated words.

        Only the base forms of verbs and nouns are kept.

        @param string doc
        @return string the segmented document; '' when ``doc`` is empty
        '''
        if not doc:
            return ''
        # The neologd dictionary is looked up under the user's home directory.
        neologd_path = os.path.expanduser('~') + '/mecab-ipadic-neologd/mecab-ipadic-neologd'
        # -Ochasen makes MeCab emit one tab-separated line per morpheme.
        tagger = MeCab.Tagger('-Ochasen -d %s' % neologd_path)
        # Normalize, run the morphological analysis, then filter the lines.
        return self._base_forms(tagger.parse(self._normalize_neologd(doc)))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment