Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save panzerstadt/44613bb7f725a649ac2389431130513c to your computer and use it in GitHub Desktop.
Save panzerstadt/44613bb7f725a649ac2389431130513c to your computer and use it in GitHub Desktop.
from nltk.stem.lancaster import LancasterStemmer
from nltk.corpus import stopwords
from nltk import word_tokenize
from many_stop_words import get_stop_words
import re
def normalize_sentences(sentence, language='en', debug=False):
    """Tokenize a sentence, drop stop words, and return the stemmed tokens.

    The sentence is first filtered so that only ASCII letters, spaces and
    Japanese text (plus a handful of symbols) survive; digits and ASCII
    punctuation are discarded. The remaining text is tokenized, stop words
    are removed, and every surviving token is lower-cased and passed through
    NLTK's Lancaster stemmer.

    Args:
        sentence: The raw text to normalize.
        language: ``'en'`` uses NLTK's English stop words and
            ``word_tokenize``. Any other value is passed to
            ``many_stop_words.get_stop_words`` and the text is tokenized
            with MeCab (ipadic dictionary) through ``JapaneseTokenizer``.
        debug: When true, print the stop-word collection in use.

    Returns:
        A list of stemmed, lower-cased tokens with stop words removed.

    Note:
        The non-English branch imports ``JapaneseTokenizer`` lazily and
        needs a working MeCab installation; the wrapper raises if MeCab
        cannot be initialised.
    """
    stemmer = LancasterStemmer()

    # Regex set to grab English and Japanese text:
    #   A-Za-z and space, CJK symbols/punctuation (U+3000-303F),
    #   hiragana (U+3040-309F), katakana (U+30A0-30FF),
    #   half/full-width forms (U+FF00-FFEF), kanji (U+4E00-9FAF),
    #   stars (U+2605-2606), arrows (U+2190-2195) and the reference mark
    #   (U+203B).
    # The pattern is compiled with re.VERBOSE, so the newlines and
    # indentation around it are ignored (whitespace inside [...] is kept).
    regex_ja = r"""
    [A-Za-z ]+|[\u3000-\u303F]+|[\u3040-\u309F]+|[\u30A0-\u30FF]+|[\uFF00-\uFFEF]+|[\u4E00-\u9FAF]+|[\u2605-\u2606]+|[\u2190-\u2195]+|\u203B
    """
    matches = re.finditer(
        regex_ja,
        sentence,
        re.MULTILINE | re.IGNORECASE | re.VERBOSE | re.UNICODE,
    )
    # Concatenate the accepted runs; everything between them is dropped.
    s = ''.join(match.group() for match in matches)

    # Set ignored words (overly common words) and tokenize.
    if language == 'en':
        ignore_words = set(stopwords.words('english'))  # english
        # nltk's word_tokenize for english
        words = word_tokenize(s.lower())
    else:
        ignore_words = get_stop_words(language)  # has japanese
        # NOT using kytea (Kyoto University) for word tokenization
        # (https://chezou.hatenablog.com/entry/20110715/1310699249)
        # using mecab for dictionary
        # with JapaneseTokenizer (https://pypi.org/project/JapaneseTokenizer/)
        import JapaneseTokenizer
        mecab_wrapper = JapaneseTokenizer.MecabWrapper(dictType='ipadic')
        words = mecab_wrapper.tokenize(s).convert_list_object()

    # Clean blanks. Compare by value: `is not ' '` tests object identity,
    # which only works by accident of string interning and triggers a
    # SyntaxWarning on Python 3.8+.
    words = [w for w in words if w != ' ']

    if debug:
        print('ignoring words: ', ignore_words)

    return [stemmer.stem(w.lower()) for w in words if w not in ignore_words]
@iRaM-sAgOr
Copy link

When I run your code, an exception is raised. I am sharing the exception and the error output below. Please help:
[Y/10/22 18:07:59]ERROR - mecab_wrapper.py#__CallMecab:169: ('',)
[Y/10/22 18:07:59]ERROR - mecab_wrapper.py#__CallMecab:170: Possibly Path to userdict is invalid. Check the path

RuntimeError Traceback (most recent call last)
~/anaconda3/envs/nlp_jupyter_env/lib/python3.7/site-packages/JapaneseTokenizer/mecab_wrapper/mecab_wrapper.py in __CallMecab(self)
166 try:
--> 167 mecabObj = MeCab.Tagger(cmMecabCall)
168 except Exception as e:

~/anaconda3/envs/nlp_jupyter_env/lib/python3.7/site-packages/MeCab/__init__.py in __init__(self, *args)
96 with _mecabrc_for_bundled_dictionary():
---> 97 super(Tagger, self).__init__(*args)
98

RuntimeError:

During handling of the above exception, another exception occurred:

CalledProcessError Traceback (most recent call last)
in
41 return [stemmer.stem(w.lower()) for w in words if w not in ignore_words]
42 word = normalize_sentences('東海道線が止まってる間、辻堂駅のまわりをIngressというゲームを起動したXperiaでうろうろ。iPhoneよりバージョンも進んでるし画面も大きいし捗る。まだLevel2ですけど、楽しくなってきた。これ時間潰しちゃうねぇ、だめだねぇ。家の近くでもうろうろしてしまった。',
---> 43 language='ja', debug=False)

in normalize_sentences(sentence, language, debug)
31 # with JapaneseTokenizer (https://pypi.org/project/JapaneseTokenizer/)
32 import JapaneseTokenizer
---> 33 mecab_wrapper = JapaneseTokenizer.MecabWrapper(dictType='ipadic')
34 words = mecab_wrapper.tokenize(s).convert_list_object()
35

~/anaconda3/envs/nlp_jupyter_env/lib/python3.7/site-packages/JapaneseTokenizer/mecab_wrapper/mecab_wrapper.py in __init__(self, dictType, pathUserDictCsv, path_mecab_config, path_dictionary, string_encoding)
62
63 logger.info("mecab dictionary path is detected under {}".format(self._mecab_dictionary_path))
---> 64 self.mecabObj = self.__CallMecab()
65
66 assert dictType in ["neologd", "all", "ipadic", "ipaddic", "user", "", "jumandic", "unidic", None], \

~/anaconda3/envs/nlp_jupyter_env/lib/python3.7/site-packages/JapaneseTokenizer/mecab_wrapper/mecab_wrapper.py in __CallMecab(self)
169 logger.error(e.args)
170 logger.error("Possibly Path to userdict is invalid. Check the path")
--> 171 raise subprocess.CalledProcessError(returncode=-1, cmd="Failed to initialize Mecab object")
172
173 return mecabObj

CalledProcessError: Command 'Failed to initialize Mecab object' died with <Signals.SIGHUP: 1>.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment