Skip to content

Instantly share code, notes, and snippets.

@incep
Last active April 11, 2019 02:28
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save incep/7923570 to your computer and use it in GitHub Desktop.
Save incep/7923570 to your computer and use it in GitHub Desktop.
Get only the preferred kinds of nouns from the output of MeCab.Tagger.
#coding: utf-8
import re
import MeCab
def mecab_only_content_nouns(text_u):
""" Return preferred tokens from result from MeCab.Tagger.
- preferred tokens = 名詞(一般), 名詞(サ変接続)
* exclude symbols
* exclude ones with only one hiragana or katakana character
"""
def condition(s, f):
return u"名詞" in f and \
not u"固有名詞" in f and \
re.match(u"[0-9a-zA-Z]+|[ぁ-んァ-ヶヲ-゚一-龠]{2,}|^[一-龠]$", s, re.U)
ret = []
add = ret.append
text = text_u.encode('utf-8') # Input must be a str object.
tagger = MeCab.Tagger()
node = tagger.parseToNode(text)
while node:
surface_u, feature_u = node.surface.decode('utf-8'), node.feature.decode('utf-8')
#print surface_u, feature_u #d
if condition(surface_u, feature_u):
add(surface_u)
node = node.next
return ret
# ur".." : Raw text in Unicode
text = ur"""
ですからぁ〜、マックではここに「こ」って書かないでください。
必要ない名詞(記号とか)が入っていると誤作動しちゃいます><
"""[1:-1] # <- 最初の改行文字と最後の改行文字を取り除く
ary = mecab_only_content_nouns(text)
for i, itm in enumerate(ary):
print i + 1, itm
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment