Skip to content

Instantly share code, notes, and snippets.

@computingfreak
Created June 14, 2017 07:24
Show Gist options
  • Save computingfreak/d91dee31631afb7457714cf89fc458d4 to your computer and use it in GitHub Desktop.
Save computingfreak/d91dee31631afb7457714cf89fc458d4 to your computer and use it in GitHub Desktop.
# -*- coding: utf-8 -*-
from __future__ import print_function
from natto import MeCab
# Part-of-speech classes kept by default: noun, adjective, adjectival noun, verb.
_DEFAULT_POS_CLASSES = ('名詞', '形容詞', '形容動詞', '動詞')


def tokenize(text, pos_classes=_DEFAULT_POS_CLASSES):
    """Return the content words MeCab finds in *text*.

    Each node is rendered by MeCab as ``"<feature 0>,<feature 6>"``.  The
    first field is compared against *pos_classes*; when it matches, the
    second field is collected.

    NOTE(review): with the IPADIC dictionary, feature 0 is the part of
    speech and feature 6 is the base (dictionary) form -- this layout is
    dictionary-dependent, so confirm it for the dictionary in use.

    Args:
        text: The string to analyse.
        pos_classes: Iterable of first-field values to keep.  Defaults to
            noun, adjective, adjectival noun and verb.

    Returns:
        A list with the second field of every matching node, in the order
        the nodes were produced.
    """
    # Build the lookup once so the per-node membership test is O(1).
    wanted = frozenset(pos_classes)
    tokens = []
    with MeCab('-F%f[0],%f[6]') as nm:
        for n in nm.parse(text, as_nodes=True):
            # Skip end-of-sentence nodes and anything that is not a normal node.
            if n.is_eos() or not n.is_nor():
                continue
            # Split on the first comma only: the second field may contain commas.
            klass, word = n.feature.split(',', 1)
            if klass in wanted:
                tokens.append(word)
    return tokens
# Demo: tokenize a sample sentence and print one extracted word per line.
tkns = tokenize('私の名前は太郎です。')
if tkns:
    # Guarded so that an empty result prints nothing rather than a blank line.
    print('\n'.join(tkns))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment