shigeomi-takada/wakati.py

## wakati.py
# -*- coding: utf-8 -*-

import re
import os

import MeCab
import unicodedata


class Wakati:
    '''
    Word segmentation (wakati-gaki) of Japanese text using MeCab.

    The mecab-ipadic-neologd dictionary is used for proper nouns and
    similar entries. Input text is normalized before analysis, following
    https://github.com/neologd/mecab-ipadic-neologd/wiki/Regexp.ja

    The MeCab tagger is created on the first non-empty parse() call and
    then reused by this instance, so the dictionary is loaded only once.
    NOTE(review): callers that share one instance across threads should
    confirm MeCab.Tagger is safe for that use.
    '''

    # Character ranges treated as Japanese text when deciding which
    # spaces can be dropped.
    _CJK_BLOCKS = ''.join((
        r'\u4E00-\u9FFF',  # CJK UNIFIED IDEOGRAPHS
        r'\u3040-\u309F',  # HIRAGANA
        r'\u30A0-\u30FF',  # KATAKANA
        r'\u3000-\u303F',  # CJK SYMBOLS AND PUNCTUATION
        r'\uFF00-\uFFEF',  # HALFWIDTH AND FULLWIDTH FORMS
    ))
    _BASIC_LATIN = r'\u0000-\u007F'

    # A single space is removed when it sits between (Japanese, Japanese),
    # (Japanese, Latin) or (Latin, Japanese); applied in this order.
    # Look-arounds let one pass remove every qualifying space.
    _SPACE_BETWEEN = (
        re.compile('(?<=[{0}]) (?=[{0}])'.format(_CJK_BLOCKS)),
        re.compile('(?<=[{0}]) (?=[{1}])'.format(_CJK_BLOCKS, _BASIC_LATIN)),
        re.compile('(?<=[{1}]) (?=[{0}])'.format(_CJK_BLOCKS, _BASIC_LATIN)),
    )

    _SPACES = re.compile('[ 　]+')
    _HYPHENS = re.compile('[˗֊‐‑‒–⁃⁻₋−]+')
    _CHOONPUS = re.compile('[﹣－ｰ—―─━ー]+')
    _TILDES = re.compile('[~∼∾〜〰～]')

    # ASCII / half-width punctuation -> full-width counterpart.
    _TO_FULLWIDTH = dict(zip(
        map(ord, '!"#$%&\'()*+,-./:;<=>?@[¥]^_`{|}~｡､･｢｣'),
        map(ord, '！”＃＄％＆’（）＊＋，－．／：；＜＝＞？＠［￥］＾＿｀｛｜｝〜。、・「」'),
    ))

    # Lazily created MeCab tagger (see _get_tagger).
    _tagger = None

    def _unicode_normalize(self, cls, s):
        '''
        NFKC-normalize only the runs of characters belonging to ``cls``.

        @param string cls body of a regex character class (e.g. '０-９')
        @param string s   text to normalize
        @return string s with matching runs normalized and every
                full-width hyphen-minus replaced by ASCII '-'
        '''
        pt = re.compile('[{}]+'.format(cls))
        s = pt.sub(lambda m: unicodedata.normalize('NFKC', m.group()), s)
        return s.replace('－', '-')

    def _remove_extra_spaces(self, s):
        '''
        Collapse runs of ASCII / ideographic spaces and drop the spaces
        that touch Japanese text.

        Spaces between two Basic Latin characters are kept (as a single
        space); spaces with a Japanese character on at least one side
        are removed.

        @param string s
        @return string
        '''
        s = self._SPACES.sub(' ', s)
        for pattern in self._SPACE_BETWEEN:
            s = pattern.sub('', s)
        return s

    def _normalize_neologd(self, s):
        '''
        Normalize text the way the mecab-ipadic-neologd wiki recommends.

        @param string s
        @return string normalized text
        '''
        s = s.strip()
        # Full-width alphanumerics -> ASCII, half-width kana -> full-width.
        s = self._unicode_normalize('０-９Ａ-Ｚａ-ｚ｡-ﾟ', s)

        s = self._HYPHENS.sub('-', s)    # normalize hyphens
        s = self._CHOONPUS.sub('ー', s)  # normalize choonpus
        s = self._TILDES.sub('', s)      # remove tildes

        # Temporarily widen punctuation so that it counts as Japanese
        # text while spaces are being removed ...
        s = s.translate(self._TO_FULLWIDTH)
        s = self._remove_extra_spaces(s)
        # ... then narrow it again; keep ＝,・,「,」
        s = self._unicode_normalize(
            '！”＃＄％＆’（）＊＋，－．／：；＜＞？＠［￥］＾＿｀｛｜｝〜', s)
        return s.replace('’', '\'').replace('”', '"')

    def _get_tagger(self):
        '''
        Return this instance's MeCab tagger, creating it on first use.

        The dictionary is read from
        ~/mecab-ipadic-neologd/mecab-ipadic-neologd.
        '''
        if self._tagger is None:
            neologd_path = (os.path.expanduser('~')
                            + '/mecab-ipadic-neologd/mecab-ipadic-neologd')
            self._tagger = MeCab.Tagger('-Ochasen -d %s' % neologd_path)
        return self._tagger

    def parse(self, doc):
        '''
        Split Japanese text into space-separated words.

        Only the base forms of verbs and nouns are kept.

        @param string doc
        @return string the segmented document; '' for empty input
        '''
        if not doc:
            return ''

        # With -Ochasen every token line is tab-separated, e.g.
        # 'C言語\tシーゲンゴ\tC言語\t名詞-固有名詞-一般\t\t'
        parsed = self._get_tagger().parse(self._normalize_neologd(doc))
        if not parsed:
            # Nothing came back from the tagger; treat as no tokens.
            return ''

        val = []
        for line in parsed.split('\n'):
            # Skip the EOS marker and empty lines.
            if line == 'EOS' or line == '':
                continue

            fields = line.split('\t')
            # A token line needs at least surface, reading, base form
            # and part of speech; ignore anything shorter.
            if len(fields) < 4:
                continue

            # Field 4 holds the part of speech; keep verbs and nouns only.
            if fields[3].startswith(('動詞', '名詞')):
                # Field 3 holds the base form.
                val.append(fields[2])

        # Join with single spaces into one string.
        return ' '.join(val)
	# -*- coding: utf-8 -*-

	import re
	import os

	import MeCab
	import unicodedata


	class Wakati:
	    '''
	    Word segmentation (wakati-gaki) of Japanese text using MeCab.

	    The mecab-ipadic-neologd dictionary is used for proper nouns and
	    similar entries. Input text is normalized before analysis, following
	    https://github.com/neologd/mecab-ipadic-neologd/wiki/Regexp.ja

	    The MeCab tagger is created on the first non-empty parse() call and
	    then reused by this instance, so the dictionary is loaded only once.
	    NOTE(review): callers that share one instance across threads should
	    confirm MeCab.Tagger is safe for that use.
	    '''

	    # Character ranges treated as Japanese text when deciding which
	    # spaces can be dropped.
	    _CJK_BLOCKS = ''.join((
	        r'\u4E00-\u9FFF',  # CJK UNIFIED IDEOGRAPHS
	        r'\u3040-\u309F',  # HIRAGANA
	        r'\u30A0-\u30FF',  # KATAKANA
	        r'\u3000-\u303F',  # CJK SYMBOLS AND PUNCTUATION
	        r'\uFF00-\uFFEF',  # HALFWIDTH AND FULLWIDTH FORMS
	    ))
	    _BASIC_LATIN = r'\u0000-\u007F'

	    # A single space is removed when it sits between (Japanese, Japanese),
	    # (Japanese, Latin) or (Latin, Japanese); applied in this order.
	    # Look-arounds let one pass remove every qualifying space.
	    _SPACE_BETWEEN = (
	        re.compile('(?<=[{0}]) (?=[{0}])'.format(_CJK_BLOCKS)),
	        re.compile('(?<=[{0}]) (?=[{1}])'.format(_CJK_BLOCKS, _BASIC_LATIN)),
	        re.compile('(?<=[{1}]) (?=[{0}])'.format(_CJK_BLOCKS, _BASIC_LATIN)),
	    )

	    _SPACES = re.compile('[ 　]+')
	    _HYPHENS = re.compile('[˗֊‐‑‒–⁃⁻₋−]+')
	    _CHOONPUS = re.compile('[﹣－ｰ—―─━ー]+')
	    _TILDES = re.compile('[~∼∾〜〰～]')

	    # ASCII / half-width punctuation -> full-width counterpart.
	    # The source string must contain a bare '|' (no backslash before it),
	    # otherwise every pair after '{' is shifted by one position.
	    _TO_FULLWIDTH = dict(zip(
	        map(ord, '!"#$%&\'()*+,-./:;<=>?@[¥]^_`{|}~｡､･｢｣'),
	        map(ord, '！”＃＄％＆’（）＊＋，－．／：；＜＝＞？＠［￥］＾＿｀｛｜｝〜。、・「」'),
	    ))

	    # Lazily created MeCab tagger (see _get_tagger).
	    _tagger = None

	    def _unicode_normalize(self, cls, s):
	        '''
	        NFKC-normalize only the runs of characters belonging to ``cls``.

	        @param string cls body of a regex character class (e.g. '０-９')
	        @param string s   text to normalize
	        @return string s with matching runs normalized and every
	                full-width hyphen-minus replaced by ASCII '-'
	        '''
	        pt = re.compile('[{}]+'.format(cls))
	        s = pt.sub(lambda m: unicodedata.normalize('NFKC', m.group()), s)
	        return s.replace('－', '-')

	    def _remove_extra_spaces(self, s):
	        '''
	        Collapse runs of ASCII / ideographic spaces and drop the spaces
	        that touch Japanese text.

	        Spaces between two Basic Latin characters are kept (as a single
	        space); spaces with a Japanese character on at least one side
	        are removed.

	        @param string s
	        @return string
	        '''
	        s = self._SPACES.sub(' ', s)
	        for pattern in self._SPACE_BETWEEN:
	            s = pattern.sub('', s)
	        return s

	    def _normalize_neologd(self, s):
	        '''
	        Normalize text the way the mecab-ipadic-neologd wiki recommends.

	        @param string s
	        @return string normalized text
	        '''
	        s = s.strip()
	        # Full-width alphanumerics -> ASCII, half-width kana -> full-width.
	        s = self._unicode_normalize('０-９Ａ-Ｚａ-ｚ｡-ﾟ', s)

	        s = self._HYPHENS.sub('-', s)    # normalize hyphens
	        s = self._CHOONPUS.sub('ー', s)  # normalize choonpus
	        s = self._TILDES.sub('', s)      # remove tildes

	        # Temporarily widen punctuation so that it counts as Japanese
	        # text while spaces are being removed ...
	        s = s.translate(self._TO_FULLWIDTH)
	        s = self._remove_extra_spaces(s)
	        # ... then narrow it again; keep ＝,・,「,」
	        s = self._unicode_normalize(
	            '！”＃＄％＆’（）＊＋，－．／：；＜＞？＠［￥］＾＿｀｛｜｝〜', s)
	        return s.replace('’', '\'').replace('”', '"')

	    def _get_tagger(self):
	        '''
	        Return this instance's MeCab tagger, creating it on first use.

	        The dictionary is read from
	        ~/mecab-ipadic-neologd/mecab-ipadic-neologd.
	        '''
	        if self._tagger is None:
	            neologd_path = (os.path.expanduser('~')
	                            + '/mecab-ipadic-neologd/mecab-ipadic-neologd')
	            self._tagger = MeCab.Tagger('-Ochasen -d %s' % neologd_path)
	        return self._tagger

	    def parse(self, doc):
	        '''
	        Split Japanese text into space-separated words.

	        Only the base forms of verbs and nouns are kept.

	        @param string doc
	        @return string the segmented document; '' for empty input
	        '''
	        if not doc:
	            return ''

	        # With -Ochasen every token line is tab-separated, e.g.
	        # 'C言語\tシーゲンゴ\tC言語\t名詞-固有名詞-一般\t\t'
	        parsed = self._get_tagger().parse(self._normalize_neologd(doc))
	        if not parsed:
	            # Nothing came back from the tagger; treat as no tokens.
	            return ''

	        val = []
	        for line in parsed.split('\n'):
	            # Skip the EOS marker and empty lines.
	            if line == 'EOS' or line == '':
	                continue

	            fields = line.split('\t')
	            # A token line needs at least surface, reading, base form
	            # and part of speech; ignore anything shorter.
	            if len(fields) < 4:
	                continue

	            # Field 4 holds the part of speech; keep verbs and nouns only.
	            if fields[3].startswith(('動詞', '名詞')):
	                # Field 3 holds the base form.
	                val.append(fields[2])

	        # Join with single spaces into one string.
	        return ' '.join(val)