Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
ごく適当なMeCabのPythonラッパー
import re
from collections import namedtuple
import MeCab
# One morpheme, as built by Parser._node from a tab-separated output line.
#   surface / reading / base   -> columns 0 / 1 / 2 of the line
#   pos, pos_detail_1..3       -> hyphen-separated parts of column 3
#                                 (missing detail levels are None)
#   conjugated_type / _form    -> columns 4 / 5 (None when empty)
Node = namedtuple(
    'Node',
    'surface base reading '
    'pos pos_detail_1 pos_detail_2 pos_detail_3 '
    'conjugated_type conjugated_form',
)
class Parser:
    """Thin wrapper around ``MeCab.Tagger`` that yields ``Node`` tuples.

    The tagger is always created with ``-O chasen``; ``_node`` depends on
    that tab-separated line layout when it splits the output.
    """

    # Any run of whitespace; collapsed to a single space before tagging.
    _WHITESPACE = re.compile(r'\s+')

    # Short dictionary name accepted by ``__init__`` -> dictionary directory.
    _DICDIR = {
        'neologd': '/usr/local/lib/mecab/dic/mecab-ipadic-neologd'
    }

    def __init__(self, mecab_dic=None):
        """Create the underlying tagger.

        Args:
            mecab_dic: A key of ``_DICDIR`` selecting the dictionary, or
                ``None`` (the default) to pass no ``-d`` option at all.

        Raises:
            KeyError: If ``mecab_dic`` is neither ``None`` nor a known name.
        """
        option = self._mecab_option(mecab_dic)
        self._tagger = MeCab.Tagger(option)

    def _mecab_option(self, mecab_dic):
        """Build the option string handed to ``MeCab.Tagger``.

        Returns:
            Space-separated ``flag value`` pairs, e.g. ``'-O chasen'``.

        Raises:
            KeyError: If ``mecab_dic`` is not ``None`` and not in ``_DICDIR``.
        """
        options = {'-O': 'chasen'}
        if mecab_dic is not None:
            try:
                options['-d'] = self._DICDIR[mecab_dic]
            except KeyError:
                # Same exception type as a plain dict lookup, but the message
                # now tells the caller which names are actually available.
                known = ', '.join(sorted(self._DICDIR))
                raise KeyError(
                    f'unknown MeCab dictionary {mecab_dic!r} '
                    f'(known: {known})'
                ) from None
        return ' '.join(f'{flag} {value}' for flag, value in options.items())

    def parse(self, text):
        """Tokenize ``text`` and return a list of ``Node`` tuples.

        Args:
            text: The string to analyse.

        Returns:
            One ``Node`` per output line of the tagger, excluding the
            terminating ``EOS`` line.

        Raises:
            RuntimeError: If the tagger returns no result for the text.
        """
        # Replace newlines, tabs and other whitespace runs with one space;
        # the output is split on newlines and tabs below.
        normalized = self._WHITESPACE.sub(' ', text)
        result = self._tagger.parse(normalized)
        if result is None:
            # Fail with a clear error instead of an AttributeError on
            # ``None.splitlines()``.
            raise RuntimeError('MeCab failed to parse the given text')
        return [
            self._node(line)
            for line in result.splitlines()
            if line != 'EOS'
        ]

    def _node(self, line):
        """Convert one tab-separated output line into a ``Node``.

        Columns are read as: surface, reading, base, hyphen-joined part of
        speech, conjugation type, conjugation form.
        """
        fields = line.split('\t')
        pos_fields = fields[3].split('-')
        return Node(
            surface=fields[0],
            base=fields[2],
            reading=fields[1],
            pos=pos_fields[0],
            pos_detail_1=self._get(pos_fields, 1),
            pos_detail_2=self._get(pos_fields, 2),
            pos_detail_3=self._get(pos_fields, 3),
            # Empty conjugation columns are normalised to None.
            conjugated_type=fields[4] or None,
            conjugated_form=fields[5] or None,
        )

    @staticmethod
    def _get(items, index):
        """Return ``items[index]``, or ``None`` when ``index`` is past the end."""
        return items[index] if len(items) > index else None
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment