Using MeCab's constrained parsing from Python (ref: http://qiita.com/yukinoi/items/4e7afb5e72b3a46da0f2)
# -*- coding: utf-8 -*-
import re

import MeCab
from MeCab import MECAB_ANY_BOUNDARY, MECAB_INSIDE_TOKEN, MECAB_TOKEN_BOUNDARY

DICINFO_KEYS = ('charset', 'filename', 'lsize', 'rsize', 'size', 'type', 'version')


class Tagger(MeCab.Tagger):

    def dictionary_info(self):
        # Expose the SWIG dictionary-info struct as a plain dict
        info = MeCab._MeCab.Tagger_dictionary_info(self)
        return {key: getattr(info, key) for key in DICINFO_KEYS}
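    # Illustrative only: with mecab-ipadic the returned dict looks roughly like
    #   {'charset': 'utf-8', 'filename': '.../sys.dic', 'lsize': 1316,
    #    'rsize': 1316, 'size': 392126, 'type': 0, 'version': 102}
    # (all values depend on the dictionary actually installed)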
    def split_sentence(self, sentence, pattern):
        """Split sentence into spans that match / don't match the regex.

        Args:
            sentence (str)
            pattern (str): regex pattern
        Yields:
            (str, bool): a span of the sentence and whether it matched
        """
        last_found_position = 0
        for m in re.finditer(pattern, sentence):
            if last_found_position < m.start():
                # Yield the unmatched text before this match
                yield (sentence[last_found_position:m.start()], False)
                last_found_position = m.start()
            yield (sentence[last_found_position:m.end()], True)
            last_found_position = m.end()
        if last_found_position < len(sentence):
            # Trailing unmatched text
            yield (sentence[last_found_position:], False)
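    # Example (pure regex splitting, no MeCab involved):
    #   list(tagger.split_sentence('ポエム読むならQiita最高', '[a-zA-Z0-9]+'))
    #   yields [('ポエム読むなら', False), ('Qiita', True), ('最高', False)]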
    def boundary_constraint_parse(self, sentence, pattern='.', any_boundary=False):
        """Parse with morpheme boundary constraints.

        Args:
            sentence (str)
            pattern (str): regex; every match is forced to be one token
            any_boundary (bool): if True, MeCab may segment non-matching
                spans freely; if False, they too become single tokens
        Returns:
            result (str)
        """
        lattice = MeCab.Lattice()
        lattice.set_sentence(sentence)
        if any_boundary:
            default_boundary_constraint = MECAB_ANY_BOUNDARY
        else:
            default_boundary_constraint = MECAB_INSIDE_TOKEN
        # Boundary constraints are indexed by byte position in the
        # dictionary's charset, with position 0 at the sentence start
        byte_position = 0
        lattice.set_boundary_constraint(byte_position, MECAB_TOKEN_BOUNDARY)
        charset = self.dictionary_info()['charset']
        for (token, match) in self.split_sentence(sentence, pattern):
            byte_position += 1
            if match:
                boundary_constraint = MECAB_INSIDE_TOKEN
            else:
                boundary_constraint = default_boundary_constraint
            # Mark every byte inside the token, then close it with a boundary
            for i in range(1, len(token.encode(charset))):
                lattice.set_boundary_constraint(byte_position, boundary_constraint)
                byte_position += 1
            lattice.set_boundary_constraint(byte_position, MECAB_TOKEN_BOUNDARY)
        if self.parse(lattice):
            return lattice.toString()
if __name__ == '__main__':
    tagger = Tagger()
    text = 'ポエム読むならQiita最高'
    print('Morpheme boundary constrained parsing\n')
    print(tagger.boundary_constraint_parse(text, r'[a-zA-Z0-9\s\-]+', any_boundary=True))
Morpheme boundary constrained parsing
ポエム 名詞,一般,*,*,*,*,ポエム,ポエム,ポエム
読む 動詞,自立,*,*,五段・マ行,基本形,読む,ヨム,ヨム
なら 助動詞,*,*,*,特殊・ダ,仮定形,だ,ナラ,ナラ
Qiita 名詞,一般,*,*,*,*,*
最高 名詞,一般,*,*,*,*,最高,サイコウ,サイコー
EOS
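
For comparison, a minimal sketch (reusing the Tagger class defined above; the exact output depends on the installed dictionary) of the same call with any_boundary=False, which forces each non-matching span to come out as a single token as well:

# -*- coding: utf-8 -*-
# Minimal sketch, assuming the Tagger class above is in the same file.
# With any_boundary=False the inner byte positions of the non-matching
# span 'ポエム読むなら' are marked MECAB_INSIDE_TOKEN instead of
# MECAB_ANY_BOUNDARY, so it must come out as one token, not three.
tagger = Tagger()
print(tagger.boundary_constraint_parse('ポエム読むならQiita最高',
                                       r'[a-zA-Z0-9\s\-]+'))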
# -*- coding: utf-8 -*-
import MeCab

DICINFO_KEYS = ('charset', 'filename', 'lsize', 'rsize', 'size', 'type', 'version')


class Tagger(MeCab.Tagger):

    def dictionary_info(self):
        # Expose the SWIG dictionary-info struct as a plain dict
        info = MeCab._MeCab.Tagger_dictionary_info(self)
        return {key: getattr(info, key) for key in DICINFO_KEYS}
    def feature_constraint_parse(self, tokens):
        """Parse with part-of-speech (feature) constraints.

        Args:
            tokens (list of (str, str)): (surface, POS) pairs; a 1-element
                item means no constraint ('*') for that surface
        Yields:
            node: nodes on the best path, including the final BOS/EOS node
        """
        lattice = MeCab.Lattice()
        sentence = ''.join(token[0] for token in tokens)
        lattice.set_sentence(sentence)
        start_position = 0
        charset = self.dictionary_info()['charset']
        for x in tokens:
            if len(x) == 2:
                (token, pos) = x
            else:
                token = x[0]
                pos = '*'  # '*' accepts any feature
            # Feature constraints take byte offsets in the dictionary charset
            end_position = start_position + len(token.encode(charset))
            lattice.set_feature_constraint(start_position, end_position, pos)
            start_position = end_position
        if self.parse(lattice):
            node = lattice.begin_nodes(0)
            while node:
                yield node
                node = node.next
if __name__ == '__main__':
    tagger = Tagger()
    print('POS constrained parsing\n')
    labeled_tokens = [['くぅ〜', '感動詞'],
                      ['マミさん', '名詞'],
                      ['の'], ['紅茶'], ['めちゃウマ'], ['っす'], ['よ'], ['〜']]
    for node in tagger.feature_constraint_parse(labeled_tokens):
        print(node.surface, node.feature)
POS constrained parsing
くぅ〜 名詞,サ変接続,*,*,*,*,*
マミさん 名詞,一般,*,*,*,*,*
の 助詞,連体化,*,*,*,*,の,ノ,ノ
紅茶 名詞,一般,*,*,*,*,紅茶,コウチャ,コーチャ
めちゃウマ 名詞,一般,*,*,*,*,*
っす 助動詞,*,*,*,特殊・デス,基本形,っす,ッス,ッス
よ 助詞,終助詞,*,*,*,*,よ,ヨ,ヨ
〜 記号,一般,*,*,*,*,〜,〜,〜
BOS/EOS,*,*,*,*,*,*,*,*
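
For reference, a minimal sketch of the raw lattice calls that feature_constraint_parse wraps; the UTF-8 byte arithmetic is an assumption that holds for a utf-8 dictionary:

# -*- coding: utf-8 -*-
# Minimal sketch of the underlying lattice API used above (assumes a
# UTF-8 dictionary; positions are byte offsets into the encoded sentence).
import MeCab

tagger = MeCab.Tagger()
lattice = MeCab.Lattice()
lattice.set_sentence('マミさんの紅茶')
# Force the first four characters (12 bytes in UTF-8) to be one 名詞 token
lattice.set_feature_constraint(0, len('マミさん'.encode('utf-8')), '名詞')
if tagger.parse(lattice):
    print(lattice.toString())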