Created
March 16, 2017 07:51
-
-
Save yubessy/c07e0169bf4afa81c6786d8452626a4c to your computer and use it in GitHub Desktop.
ごく適当なMeCabのPythonラッパー
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
from collections import namedtuple | |
import MeCab | |
# Immutable record for one parsed token. The field order here is the
# positional order of the tuple; instances are normally built by keyword.
Node = namedtuple('Node', [
    'surface',
    'base',
    'reading',
    'pos',
    'pos_detail_1',
    'pos_detail_2',
    'pos_detail_3',
    'conjugated_type',
    'conjugated_form',
])
class Parser:
    """Small convenience wrapper around ``MeCab.Tagger``.

    The tagger is created with ``-O chasen`` and each output line is
    converted into a ``Node`` named tuple.
    """

    # Matches any run of whitespace; used to flatten the input to one line.
    _WHITESPACE = re.compile(r'\s+')
    # Short dictionary name -> directory handed to MeCab through ``-d``.
    _DICDIR = {
        'neologd': '/usr/local/lib/mecab/dic/mecab-ipadic-neologd'
    }

    def __init__(self, mecab_dic=None):
        """Create the underlying tagger.

        Args:
            mecab_dic: Key of ``_DICDIR`` selecting a dictionary directory,
                or ``None`` (the default) to pass no ``-d`` option.

        Raises:
            KeyError: If ``mecab_dic`` is not ``None`` and not a known key.
        """
        option = self._mecab_option(mecab_dic)
        self._tagger = MeCab.Tagger(option)

    def _mecab_option(self, mecab_dic):
        """Build the option string given to ``MeCab.Tagger``.

        Returns a space-separated ``flag value`` string; ``-O chasen`` is
        always present and ``-d <dir>`` is appended when a dictionary name
        is given.
        """
        options = {'-O': 'chasen'}
        if mecab_dic is not None:
            try:
                options['-d'] = self._DICDIR[mecab_dic]
            except KeyError:
                # Same exception type as a plain lookup, but say what the
                # valid names are instead of echoing only the bad key.
                known = ', '.join(sorted(self._DICDIR))
                raise KeyError(
                    f'unknown MeCab dictionary {mecab_dic!r} '
                    f'(known: {known})'
                ) from None
        return ' '.join(f'{k} {v}' for k, v in options.items())

    def parse(self, text):
        """Analyse ``text`` and return a list of ``Node`` tuples.

        Every whitespace run (newlines, tabs, ...) is replaced by a single
        space before the text is handed to the tagger. The ``EOS`` marker
        line in the tagger output is dropped.
        """
        normalized = self._WHITESPACE.sub(' ', text)
        result = self._tagger.parse(normalized)
        return [
            self._node(line)
            for line in result.splitlines()
            if line != 'EOS'
        ]

    def _node(self, line):
        """Convert one tab-separated output line into a ``Node``.

        Column 3 holds the part of speech and its sub-categories joined by
        ``-``; missing sub-categories become ``None``. Empty conjugation
        columns (4 and 5) are also mapped to ``None``.
        """
        fields = line.split('\t')
        pos_fields = fields[3].split('-')
        return Node(
            surface=fields[0],
            base=fields[2],
            reading=fields[1],
            pos=pos_fields[0],
            pos_detail_1=self._get(pos_fields, 1),
            pos_detail_2=self._get(pos_fields, 2),
            pos_detail_3=self._get(pos_fields, 3),
            conjugated_type=fields[4] or None,
            conjugated_form=fields[5] or None,
        )

    @staticmethod
    def _get(seq, i):
        """Return ``seq[i]``, or ``None`` when ``seq`` has no index ``i``."""
        return seq[i] if len(seq) > i else None
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment