Skip to content

Instantly share code, notes, and snippets.

@takegue
Last active August 29, 2015 14:14
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save takegue/2ee794dad639cd89e8ef to your computer and use it in GitHub Desktop.
Save takegue/2ee794dad639cd89e8ef to your computer and use it in GitHub Desktop.
MeCab wrapper for Python that drives the mecab command-line tool via subprocess
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import subprocess
import itertools as itt
class MeCab():
    """Line-oriented wrapper around a long-running ``mecab`` child process.

    One ``mecab`` process is started per instance and kept alive; each input
    line is written to its stdin and the corresponding result is read back
    from its stdout.  ``self.parse`` is bound in ``__init__`` to either
    ``wakati_parse`` or ``default_parse`` depending on ``opts``.

    Input items must be single lines: an embedded newline would make mecab
    emit more than one result and desynchronise the request/response pairing.

    The instance can be used as a context manager; ``close()`` shuts the
    child process down.
    """

    def __init__(self, opts=None):
        """Start ``mecab`` with the given extra command-line options.

        Args:
            opts: Sequence of option strings appended after ``mecab``
                (e.g. ``['-Owakati']``).  ``None`` means no extra options.

        Raises:
            OSError: If the ``mecab`` executable cannot be started.
        """
        # A literal [] default would be shared between every instance.
        if opts is None:
            opts = []
        self.opts = opts
        self._process = subprocess.Popen(
            ['mecab'] + list(opts),
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            universal_newlines=True,
        )
        # Wakati output is one space-separated line per input line; every
        # other format is read as morpheme lines terminated by "EOS".
        if '-Owakati' in opts or 'wakati' in opts:
            self.parse = self.wakati_parse
        else:
            self.parse = self.default_parse

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def close(self):
        """Close the pipes and wait for the child process to exit.

        Safe to call more than once.
        """
        process = getattr(self, '_process', None)
        if process is None:
            return
        for pipe in (process.stdin, process.stdout):
            if pipe is None or pipe.closed:
                continue
            try:
                pipe.close()
            except (IOError, OSError):
                # The child may already be gone (broken pipe); nothing to do.
                pass
        process.wait()

    def _send(self, line):
        """Write one input line to the child and push it through the pipe."""
        stdin = self._process.stdin
        stdin.write(line + '\n')
        # Without an explicit flush the text stays in the local buffer and
        # the following readline() blocks forever.
        stdin.flush()

    def _receive(self):
        """Read one raw output line from the child.

        Raises:
            EOFError: If the child closed its stdout (e.g. it has exited).
        """
        output = self._process.stdout.readline()
        if not output:
            raise EOFError('mecab process closed its output unexpectedly')
        return output

    def wakati_parse(self, iterable):
        """Yield, for each input line, the list of whitespace-separated tokens.

        Args:
            iterable: Iterable of single-line strings to segment.

        Yields:
            list of str: The tokens of the output line for that input.

        Raises:
            EOFError: If the child process stops producing output.
        """
        for line in iterable:
            self._send(line)
            yield self._receive().split()

    def default_parse(self, iterable):
        """Yield, for each input line, the list of ``Morph`` objects parsed.

        Output lines are collected until a line consisting solely of ``EOS``
        is seen.  The comparison is exact so that a token whose surface form
        merely starts with "EOS" is not mistaken for the terminator.

        Args:
            iterable: Iterable of single-line strings to analyse.

        Yields:
            list of Morph: One entry per output line before the ``EOS`` line.

        Raises:
            EOFError: If the child process stops producing output.
        """
        for line in iterable:
            self._send(line)
            morphs = []
            while True:
                output = self._receive().strip()
                if output == 'EOS':
                    break
                morphs.append(Morph.from_mecab_format(output))
            yield morphs
class Morph():
    """A single morpheme: its surface form plus the analyser's feature list.

    Attributes are plain and public:
      surface -- the token text as it appeared in the input
      attrs   -- list of feature strings (the comma-separated fields)
      pos     -- first feature field (``attrs[0]``), '' until populated
    """

    def __init__(self):
        self.surface = ''
        self.attrs = []
        # Initialised here so that a freshly built Morph always has ``pos``.
        self.pos = ''

    def __repr__(self):
        return '{}(surface={!r}, pos={!r})'.format(
            type(self).__name__, self.surface, self.pos)

    @classmethod
    def from_mecab_format(cls, line):
        """Build an instance from one ``surface<TAB>feature,feature,...`` line.

        The line is split on the first tab so that spaces inside the surface
        form or inside a feature do not break parsing.  Lines without a tab
        fall back to splitting on whitespace and must then consist of exactly
        two fields.

        Args:
            line: One output line; surrounding whitespace is ignored.

        Returns:
            An instance of ``cls`` with ``surface``, ``attrs`` and ``pos`` set.

        Raises:
            ValueError: If the line cannot be split into surface and features.
        """
        text = line.strip()
        surface, tab, features = text.partition('\t')
        if not tab:
            fields = text.split()
            if len(fields) != 2:
                raise ValueError(
                    'cannot split line into surface and features: %r' % (line,))
            surface, features = fields
        attrs = features.split(',')
        # cls() rather than Morph() so subclasses get instances of themselves.
        morph = cls()
        morph.surface = surface
        morph.attrs = attrs
        morph.pos = attrs[0]
        return morph
def test_mecab():
    """Smoke test: wakati segmentation of a well-known tongue twister.

    Needs the ``mecab`` executable on PATH; the expected tokens depend on the
    dictionary installed alongside it.
    """
    expected = ['すもも', 'も', 'もも', 'も', 'もも', 'の', 'うち']
    tokenizer = MeCab(['-Owakati'])
    results = tokenizer.parse(['すもももももももものうち'])
    first = next(results)
    assert first == expected


# Run the smoke test when the file is executed directly as a script.
if __name__ == '__main__':
    test_mecab()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment