Created
May 30, 2018 09:46
-
-
Save panzerstadt/44613bb7f725a649ac2389431130513c to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from nltk.stem.lancaster import LancasterStemmer | |
from nltk.corpus import stopwords | |
from nltk import word_tokenize | |
from many_stop_words import get_stop_words | |
import re | |
def normalize_sentences(sentence, language='en', debug=False):
    """Tokenize and stem a sentence, dropping stop words.

    Keeps English letters plus Japanese Unicode blocks, tokenizes with
    nltk's word_tokenize (English) or MeCab via JapaneseTokenizer (other
    languages), removes stop words, and returns Lancaster-stemmed,
    lowercased tokens.

    Args:
        sentence: raw input string (may mix English and Japanese).
        language: 'en' for English; any other value selects
            many_stop_words stop lists and MeCab tokenization (e.g. 'ja').
        debug: if True, print the stop-word set being ignored.

    Returns:
        list[str]: stemmed, lowercased tokens with stop words removed.
    """
    stemmer = LancasterStemmer()
    # Keep only English letters/spaces and Japanese character ranges:
    # CJK symbols/punctuation, hiragana, katakana, half/fullwidth forms,
    # kanji, star symbols, arrows, and the reference mark (U+203B).
    # re.VERBOSE lets the pattern span lines; the space inside [A-Za-z ]
    # is still significant because it is within a character class.
    regex_ja = r"""
    [A-Za-z ]+|[\u3000-\u303F]+|[\u3040-\u309F]+|[\u30A0-\u30FF]+|[\uFF00-\uFFEF]+|[\u4E00-\u9FAF]+|[\u2605-\u2606]+|[\u2190-\u2195]+|\u203B
    """
    matches = re.finditer(regex_ja, sentence, re.MULTILINE | re.IGNORECASE | re.VERBOSE | re.UNICODE)
    # Join matched spans directly; no need to materialize an intermediate list.
    s = ''.join(match.group() for match in matches)

    # Pick stop words and a tokenizer per language.
    if language == 'en':
        ignore_words = set(stopwords.words('english'))  # English stop words
        # nltk's word_tokenize for English
        words = word_tokenize(s.lower())
    else:
        ignore_words = get_stop_words(language)  # has Japanese
        # NOT using kytea (Kyoto University) for word tokenization
        # (https://chezou.hatenablog.com/entry/20110715/1310699249);
        # using MeCab (ipadic dictionary) through JapaneseTokenizer
        # (https://pypi.org/project/JapaneseTokenizer/). Imported lazily so
        # the dependency is only required for non-English input.
        import JapaneseTokenizer
        mecab_wrapper = JapaneseTokenizer.MecabWrapper(dictType='ipadic')
        words = mecab_wrapper.tokenize(s).convert_list_object()

    # BUG FIX: the original used `w is not ' '`, an identity comparison with
    # a string literal — implementation-dependent and a SyntaxWarning on
    # modern CPython. Compare by value instead; `w.strip()` also drops empty
    # and whitespace-only tokens the tokenizer may emit.
    words = [w for w in words if w.strip()]

    if debug:
        print('ignoring words: ', ignore_words)
    # NOTE(review): the Lancaster stemmer is English-specific; on Japanese
    # tokens it is effectively a pass-through — confirm this is intended.
    return [stemmer.stem(w.lower()) for w in words if w not in ignore_words]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
When I run your code, an exception is raised. I am sharing the exception and the errors below. Please help:
[Y/10/22 18:07:59]ERROR - mecab_wrapper.py#__CallMecab:169: ('',)
[Y/10/22 18:07:59]ERROR - mecab_wrapper.py#__CallMecab:170: Possibly Path to userdict is invalid. Check the path
RuntimeError Traceback (most recent call last)
~/anaconda3/envs/nlp_jupyter_env/lib/python3.7/site-packages/JapaneseTokenizer/mecab_wrapper/mecab_wrapper.py in __CallMecab(self)
166 try:
--> 167 mecabObj = MeCab.Tagger(cmMecabCall)
168 except Exception as e:
~/anaconda3/envs/nlp_jupyter_env/lib/python3.7/site-packages/MeCab/init.py in init(self, *args)
96 with _mecabrc_for_bundled_dictionary():
---> 97 super(Tagger, self).init(*args)
98
RuntimeError:
During handling of the above exception, another exception occurred:
CalledProcessError Traceback (most recent call last)
in
41 return [stemmer.stem(w.lower()) for w in words if w not in ignore_words]
42 word = normalize_sentences('東海道線が止まってる間、辻堂駅のまわりをIngressというゲームを起動したXperiaでうろうろ。iPhoneよりバージョンも進んでるし画面も大きいし捗る。まだLevel2ですけど、楽しくなってきた。これ時間潰しちゃうねぇ、だめだねぇ。家の近くでもうろうろしてしまった。',
---> 43 language='ja', debug=False)
in normalize_sentences(sentence, language, debug)
31 # with JapaneseTokenizer (https://pypi.org/project/JapaneseTokenizer/)
32 import JapaneseTokenizer
---> 33 mecab_wrapper = JapaneseTokenizer.MecabWrapper(dictType='ipadic')
34 words = mecab_wrapper.tokenize(s).convert_list_object()
35
~/anaconda3/envs/nlp_jupyter_env/lib/python3.7/site-packages/JapaneseTokenizer/mecab_wrapper/mecab_wrapper.py in init(self, dictType, pathUserDictCsv, path_mecab_config, path_dictionary, string_encoding)
62
63 logger.info("mecab dictionary path is detected under {}".format(self._mecab_dictionary_path))
---> 64 self.mecabObj = self.__CallMecab()
65
66 assert dictType in ["neologd", "all", "ipadic", "ipaddic", "user", "", "jumandic", "unidic", None], \
~/anaconda3/envs/nlp_jupyter_env/lib/python3.7/site-packages/JapaneseTokenizer/mecab_wrapper/mecab_wrapper.py in __CallMecab(self)
169 logger.error(e.args)
170 logger.error("Possibly Path to userdict is invalid. Check the path")
--> 171 raise subprocess.CalledProcessError(returncode=-1, cmd="Failed to initialize Mecab object")
172
173 return mecabObj
CalledProcessError: Command 'Failed to initialize Mecab object' died with <Signals.SIGHUP: 1>.