@shigeomi-takada
Last active July 16, 2017 01:24
# -*- coding: utf-8 -*-
import re
import os
import unicodedata

import MeCab


class Wakati():
    '''
    Word segmentation (wakati-gaki) using MeCab.
    mecab-ipadic-neologd is used as the dictionary for proper nouns and the like.
    String normalization follows the page below:
    https://github.com/neologd/mecab-ipadic-neologd/wiki/Regexp.ja
    '''

    def _unicode_normalize(self, cls, s):
        pt = re.compile('([{}]+)'.format(cls))

        def norm(c):
            return unicodedata.normalize('NFKC', c) if pt.match(c) else c

        s = ''.join(norm(x) for x in re.split(pt, s))
        # Fold the full-width hyphen-minus to the ASCII hyphen
        s = re.sub('－', '-', s)
        return s
    def _remove_extra_spaces(self, s):
        # Collapse runs of half-width and full-width spaces into a single space
        s = re.sub('[ 　]+', ' ', s)
        blocks = ''.join(('\u4E00-\u9FFF',  # CJK UNIFIED IDEOGRAPHS
                          '\u3040-\u309F',  # HIRAGANA
                          '\u30A0-\u30FF',  # KATAKANA
                          '\u3000-\u303F',  # CJK SYMBOLS AND PUNCTUATION
                          '\uFF00-\uFFEF'   # HALFWIDTH AND FULLWIDTH FORMS
                          ))
        basic_latin = '\u0000-\u007F'

        def remove_space_between(cls1, cls2, s):
            p = re.compile('([{}]) ([{}])'.format(cls1, cls2))
            while p.search(s):
                s = p.sub(r'\1\2', s)
            return s

        s = remove_space_between(blocks, blocks, s)
        s = remove_space_between(blocks, basic_latin, s)
        s = remove_space_between(basic_latin, blocks, s)
        return s
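
    # Illustrative example (not in the original gist), assuming the character
    # classes above:
    #   _remove_extra_spaces('日本語  の 文 と English words')
    # would be expected to return '日本語の文とEnglish words' -- spaces between
    # Japanese characters, and between Japanese and Latin characters, are
    # removed, while the space inside the Latin-only run is kept.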
    def _normalize_neologd(self, s):
        s = s.strip()
        # NFKC-fold full-width alphanumerics and half-width katakana
        s = self._unicode_normalize('０-９Ａ-Ｚａ-ｚ｡-ﾟ', s)

        def maketrans(f, t):
            return {ord(x): ord(y) for x, y in zip(f, t)}

        s = re.sub('[˗֊‐‑‒–⁃⁻₋−]+', '-', s)  # normalize hyphens
        s = re.sub('[﹣－ｰ—―─━ー]+', 'ー', s)  # normalize choonpus
        s = re.sub('[~∼∾〜〰～]', '', s)  # remove tildes
        s = s.translate(
            maketrans(
                '!"#$%&\'()*+,-./:;<=>?@[¥]^_`{|}~｡､･｢｣',
                '！”＃＄％＆’（）＊＋，－．／：；＜＝＞？＠［￥］＾＿｀｛｜｝〜。、・「」'
            )
        )
        s = self._remove_extra_spaces(s)
        s = self._unicode_normalize('！”＃＄％＆’（）＊＋，－．／：；＜＞？＠［￥］＾＿｀｛｜｝〜', s)  # keep ＝,・,「,」
        s = re.sub('[’]', '\'', s)
        s = re.sub('[”]', '"', s)
        return s
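
    # Illustrative example (taken from the neologd wiki linked above):
    #   _normalize_neologd('検索 エンジン　自作　入門　を　買い　まし　た！！！')
    # is expected to return '検索エンジン自作入門を買いました!!!' -- full-width
    # symbols are folded to half width and stray spaces between Japanese
    # characters are removed.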
    def parse(self, doc):
        '''
        Split Japanese text into space-separated tokens (wakati-gaki).
        Only the base forms of verbs and nouns are extracted.
        @param string doc
        @return string the document converted to space-separated tokens
        '''
        if not doc:
            return ''

        # With -Ochasen the output fields are tab-separated, like this:
        # ['C言語\tシーゲンゴ\tC言語\t名詞-固有名詞-一般\t\t']
        neologd_path = os.path.expanduser('~') + '/mecab-ipadic-neologd/mecab-ipadic-neologd'
        tagger = MeCab.Tagger('-Ochasen -d %s' % neologd_path)

        # Normalize the text, run morphological analysis, and split the
        # result into a list with one morpheme per line
        words = tagger.parse(self._normalize_neologd(doc)).split('\n')

        val = []
        for word in words:
            # Skip the EOS marker and empty lines
            if word == 'EOS' or word == '':
                continue
            # Split each line on tabs into its fields
            word_info = word.split('\t')
            # The 4th field holds the part of speech; keep only verbs and nouns
            if word_info[3][0:2] in ['動詞', '名詞']:
                # The 3rd field holds the base form
                val.append(word_info[2])

        # Join the tokens with single spaces into one string
        return ' '.join(val)
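

# A minimal usage sketch (not part of the original gist). It assumes MeCab and
# the mecab-ipadic-neologd dictionary are installed, with the dictionary built
# under ~/mecab-ipadic-neologd/mecab-ipadic-neologd as parse() expects.
if __name__ == '__main__':
    wakati = Wakati()
    # Feed a raw Japanese sentence; the result is a single space-separated
    # string of the base forms of the verbs and nouns MeCab found.
    tokens = wakati.parse('C言語でプログラムを書きました。')
    print(tokens)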