Last active
April 11, 2019 02:28
-
-
Save incep/7923570 to your computer and use it in GitHub Desktop.
Extract only the preferred kinds of nouns from MeCab.Tagger output.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#coding: utf-8 | |
import re | |
import MeCab | |
# Surface filter, applied with re.match (anchored at the start only):
#   * one or more ASCII letters/digits at the start, or
#   * two or more hiragana / katakana / half-width katakana / kanji at the
#     start, or
#   * exactly one kanji.
# The half-width katakana range is spelled with escapes (U+FF66-U+FF9F) so it
# cannot be corrupted by Unicode normalization of this source file into a
# reversed, invalid character range.
_CONTENT_SURFACE_RE = re.compile(
    u"[0-9a-zA-Z]+|[ぁ-んァ-ヶ\uff66-\uff9f一-龠]{2,}|^[一-龠]$", re.U)


def _to_native_str(text):
    """Return text as the interpreter's native ``str`` type.

    The original code passed a UTF-8 byte string to MeCab under Python 2;
    under Python 3 the native ``str`` is already text and is passed as is.
    """
    if isinstance(text, str):
        return text
    if str is bytes:
        # Python 2: unicode -> UTF-8 encoded byte string.
        return text.encode('utf-8')
    # Python 3: bytes -> text.
    return text.decode('utf-8')


def _to_unicode(value):
    """Decode UTF-8 bytes to text; return text values unchanged."""
    if isinstance(value, bytes):
        return value.decode('utf-8')
    return value


def _is_content_noun(surface, feature):
    """Tell whether a token (surface + comma-separated feature) is kept."""
    fields = feature.split(u",")
    # Compare the part-of-speech fields themselves rather than searching the
    # whole feature string, whose later fields (base form, reading) may
    # contain the same words by coincidence.
    if fields[0] != u"名詞":  # top-level part of speech must be "noun"
        return False
    if len(fields) > 1 and fields[1] == u"固有名詞":  # skip proper nouns
        return False
    return _CONTENT_SURFACE_RE.match(surface) is not None


def mecab_only_content_nouns(text_u, tagger=None):
    """Return the surface forms of the preferred nouns found in text_u.

    A token is kept when:
      * the first field of its feature string is the noun tag and the
        second field is not the proper-noun tag, and
      * its surface passes the surface filter: it starts with ASCII
        letters/digits, starts with at least two kana/kanji characters, or
        is exactly one kanji. This drops symbols and single kana characters.

    Args:
        text_u: The text to analyse (unicode text; native strings are
            accepted as well).
        tagger: Optional object with a ``parseToNode(text)`` method, e.g. an
            existing ``MeCab.Tagger``, to reuse instead of creating a new
            tagger on every call. Defaults to a fresh ``MeCab.Tagger()``.

    Returns:
        A list of unicode surface strings, in order of appearance.
    """
    if tagger is None:
        tagger = MeCab.Tagger()
    text = _to_native_str(text_u)
    nouns = []
    node = tagger.parseToNode(text)
    # Walk the linked list of nodes produced by the tagger.
    while node:
        surface = _to_unicode(node.surface)
        feature = _to_unicode(node.feature)
        if _is_content_noun(surface, feature):
            nouns.append(surface)
        node = node.next
    return nouns
# Sample input. A plain u"..." literal is used instead of ur"...": the text
# contains no backslashes, so the value is the same, and the ur prefix is a
# syntax error on Python 3.
text = u"""
ですからぁ〜、マックではここに「こ」って書かないでください。
必要ない名詞(記号とか)が入っていると誤作動しちゃいます><
"""[1:-1]  # <- strip the leading and the trailing newline character

ary = mecab_only_content_nouns(text)

# Print the extracted nouns, numbered from 1. A single pre-formatted argument
# keeps the output identical under the Python 2 print statement and the
# Python 3 print function.
for i, itm in enumerate(ary, 1):
    print(u"%d %s" % (i, itm))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment