Skip to content

Instantly share code, notes, and snippets.

@shimizukawa
Forked from yono/extractword.py
Last active June 6, 2023 02:59
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save shimizukawa/5931218 to your computer and use it in GitHub Desktop.
Save shimizukawa/5931218 to your computer and use it in GitHub Desktop.
MeCabの出力結果を基に接頭辞や接尾辞を連結する。fork元のコードと機能は同じ。リファクタリングしました。
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import re
from collections import OrderedDict #python2.7 or later
import MeCab
class DetermineDescriptor(object):
    """Computed boolean flag: does the owner's ``feature`` start with a prefix?

    Declared as a class attribute; reading it on an instance returns True when
    ``instance.feature`` starts with any of the configured prefixes.

    Only ``__get__`` is defined (no ``__set__``), so assigning the attribute
    on an instance stores an ordinary value that takes precedence over the
    computed one for that instance.  ``Word.combine`` assigns flags this way,
    so this class must stay free of ``__set__``.
    """

    def __init__(self, *allowed_features):
        """Remember the feature prefixes that make the flag true."""
        self.allowed_features = allowed_features

    def __get__(self, instance, klass=None):
        """Return the flag for *instance*, or the descriptor on class access."""
        if instance is None:
            # Accessed on the class itself (e.g. ``Word.is_noun``): there is
            # no feature string to test, so expose the descriptor object.
            return self
        # str.startswith accepts a tuple of prefixes; an empty tuple is False.
        return instance.feature.startswith(self.allowed_features)
class Word(object):
    """A MeCab token: its surface text plus its comma-separated feature string.

    The ``is_*`` attributes declared with ``DetermineDescriptor`` are computed
    from ``feature`` on every read.  ``combine`` assigns some of them on the
    following word's instance, which replaces the computed value with the
    stored one for that instance.
    """

    # Feature prefixes of words that may be joined onto a preceding prefix.
    _CONNECTABLE_FEATURES = (
        u"名詞,一般",
        u"名詞,数",
        u"名詞,サ変接続",
        u"名詞,接尾,一般",
        u"名詞,接尾,サ変接続",
        u"名詞,固有名詞",
        u"名詞,形容動詞語幹",
        u"名詞,副詞可能",
        u"記号,アルファベット",
    )
    # Surfaces consisting of exactly one of these characters are never joined.
    _DISALLOWED_SYMBOLS = frozenset(u"()[]<>|\"';,")

    def __init__(self, surface, feature):
        """Store the token text (*surface*) and its MeCab *feature* string."""
        self.surface = surface
        self.feature = feature

    def __repr__(self):
        """Return ``<Word "surface", "feature">`` in the native string type."""
        text = u'<{0.__class__.__name__} "{0.surface}", "{0.feature}">'.format(self)
        if str is bytes:
            # Python 2: __repr__ has to return a byte string.
            return text.encode('utf-8')
        return text

    @property
    def is_connected(self):
        """True when the feature is connectable and the surface is not a
        disallowed symbol character."""
        return (self.feature.startswith(self._CONNECTABLE_FEATURES) and
                self.surface not in self._DISALLOWED_SYMBOLS)

    is_adjective = DetermineDescriptor(
        u"名詞,形容動詞語幹",
        u"名詞,ナイ形容詞語幹",
    )
    is_prefix = DetermineDescriptor(
        u"接頭詞,名詞接続",
    )
    is_postfix = DetermineDescriptor(
        u"名詞,接尾,形容動詞語幹",
        u"名詞,接尾,一般",
    )
    is_digit_prefix = DetermineDescriptor(
        u"接頭詞,数接続",
    )
    is_numerative = DetermineDescriptor(
        u"名詞,接尾,助数詞",
    )
    is_digit = DetermineDescriptor(
        u"名詞,数",
    )
    is_noun = DetermineDescriptor(
        u"名詞",
    )

    @property
    def is_digit_only(self):
        """Result of ``surface.isdigit()``."""
        return self.surface.isdigit()

    # Bound ``search`` of a pattern matching surfaces made only of ASCII
    # punctuation characters.
    _is_symbol_only = re.compile(r'^[!"#\$\%\&\'\(\)\*\+,\-\./:;\<\=\>\?\@\[\\\]\^\_\`\{\}\~\|]+$').search

    @property
    def is_symbol_only(self):
        """Match object (truthy) when the surface is only ASCII punctuation,
        otherwise None."""
        return self._is_symbol_only(self.surface)

    def combine(self, other):
        """Try to join *other* (the next token) onto this word.

        When one of the four joining rules applies, ``other.surface`` is
        appended to ``self.surface`` and flag overrides are stored on *other*.

        Returns:
            True only when nothing was joined and *other* is neither
            connected, adjective, postfix, prefix, digit-prefix nor noun
            (this word is not yet finalized).  False in every other case,
            including after a join.  The join branches used to fall off the
            end and return None; they now return False explicitly, which has
            the same truthiness.

        NOTE(review): the joined text goes to ``self`` while the flag
        overrides go to ``other`` -- confirm this asymmetry is intended.
        """
        if self.is_prefix and other.is_connected:
            # A prefix is joined to the following noun.
            self.surface += other.surface
            other.is_noun = True
            other.is_prefix = False
            return False

        if self.is_noun and other.is_postfix:
            # A suffix is joined when the preceding word is a noun.
            self.surface += other.surface
            other.is_noun = False
            other.is_prefix = False
            return False

        if self.is_digit_prefix and other.is_digit:
            # A number-connecting prefix followed by a number is joined.
            self.surface += other.surface
            other.is_digit = True
            other.is_digit_prefix = False
            return False

        if self.is_digit and other.is_numerative:
            # A number followed by a counter word is joined.
            self.surface += other.surface
            other.is_digit = False
            other.is_noun = False
            return False

        # Nothing joined.  Evaluate the test *before* overriding
        # ``other.is_noun`` below, because the test reads that flag.
        finalizes_self = bool(
            other.is_connected or other.is_adjective or other.is_postfix or
            other.is_prefix or other.is_digit_prefix or other.is_noun
        )
        other.is_noun = other.is_connected
        # False: self is finalized.  True: self is not yet finalized.
        return not finalizes_self
def mecab_parse_iterator(tagger, text):
    """Yield a ``Word`` for each node ``tagger.parseToNode`` returns for *text*.

    Args:
        tagger: object with a ``parseToNode`` method (a ``MeCab.Tagger``).
        text: the text to analyse.  On Python 2 a unicode string is encoded
            to UTF-8 before parsing; a byte string is passed through as is
            (presumably it must already be UTF-8 -- TODO confirm).

    Yields:
        ``Word(surface, feature)`` for every node in the linked list, with
        byte-string fields decoded as UTF-8 and text fields left untouched.
    """
    def as_text(value):
        # Decode only when the binding handed back bytes.
        return value.decode('utf-8') if isinstance(value, bytes) else value

    if str is bytes and not isinstance(text, bytes):
        # Python 2: hand the binding UTF-8 bytes, as this function always did.
        # NOTE(review): on Python 3 the text is passed unchanged, assuming
        # the binding accepts ``str`` there -- confirm against the binding.
        text = text.encode('utf-8')

    node = tagger.parseToNode(text)
    while node:
        yield Word(as_text(node.surface), as_text(node.feature))
        node = node.next
def extract_noun(text, tagger=None):
    """Count the words kept after joining prefixes/suffixes in *text*.

    Args:
        text: the text to analyse (passed to ``mecab_parse_iterator``).
        tagger: optional tagger to reuse across calls; a new
            ``MeCab.Tagger()`` is created when omitted.

    Returns:
        ``OrderedDict`` mapping surface -> occurrence count, in order of
        first appearance.  Words whose surface is digits only or ASCII
        punctuation only are left out.
    """
    if tagger is None:
        tagger = MeCab.Tagger()

    previous = Word('', '')
    kept = []
    for word in mecab_parse_iterator(tagger, text):
        if not previous.combine(word):
            kept.append(word)
            # NOTE(review): the previous word is replaced only when the
            # current one is kept -- confirm this matches the intended
            # grouping.
            previous = word

    # Count only after the loop: ``combine`` may still extend the surface of
    # a word that is already in ``kept``.
    worddic = OrderedDict()
    for word in kept:
        if word.is_digit_only or word.is_symbol_only:
            continue
        worddic[word.surface] = worddic.get(word.surface, 0) + 1
    return worddic
if __name__ == "__main__":
    import sys

    # Command-line entry point: analyse the first argument and print one
    # "surface count" pair per line.
    if len(sys.argv) < 2:
        sys.exit("usage: {0} TEXT".format(sys.argv[0]))

    text = sys.argv[1]
    if isinstance(text, bytes):
        # Python 2 delivers argv as byte strings, while the parser is
        # documented to take unicode; decode before handing it over.
        text = text.decode(sys.getfilesystemencoding() or 'utf-8')

    worddic = extract_noun(text)
    for word, num in worddic.items():
        # A single formatted string prints identically under the Python 2
        # print statement and the Python 3 print function.
        print(u"{0} {1}".format(word, num))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment