Last active
May 28, 2019 05:45
-
-
Save knknkn1162/c81bcd15e0e4f20304559f25a58c38fc to your computer and use it in GitHub Desktop.
MeCabをブーストさせよう ref: https://qiita.com/knknkn1162/items/8c12f42dd167aae01c02
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# 新語辞書 mecab-ipadic-neologd の追加(2回目以降は辞書更新)
./bin/install-mecab-ipadic-neologd -n -a
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# terminal上では、これで使える。
mecab -d /usr/local/lib/mecab/dic/mecab-ipadic-neologd
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# chasen形式の新語のcsvが存在しているディレクトリに飛ぶ
cd /home/foo/bar
# GCP natural language APIで拾った語彙のコスト自動推定
## macの場合(linuxの場合 /usr/libexec/mecab/mecab-dict-index)
/usr/local/Cellar/mecab/0.996/libexec/mecab/mecab-dict-index -m ./mecab-ipadic-2.7.0-20070801.model \
  -d ./mecab-ipadic -f utf-8 -t utf-8 \
  -a test_dic.csv -u new_test_dic.csv
# 出力例:
#   ./mecab-ipadic-2.7.0-20070801.model is not a binary model. reopen it as text mode...
#   reading new_test_dic.csv ...
#   done!
# 再び辞書を作成
/usr/local/Cellar/mecab/0.996/libexec/mecab/mecab-dict-index -m ./mecab-ipadic-2.7.0-20070801.model \
  -d ./mecab-ipadic \
  -f utf-8 -t utf-8 \
  -u ./site.dic \
  ./new_test_dic.csv # 新規に追加するcsv
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
vim /usr/local/etc/mecabrc
# ユーザ辞書への追加
# 適当な行に付け足す
userdic = /home/foo/bar/site.dic
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# terminal上では、これで使える。
mecab -d /usr/local/lib/mecab/dic/mecab-ipadic-neologd
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import MeCab | |
class Morph(object):
    """Morphological-analysis helper wrapping two MeCab taggers.

    ``chasen`` (-Ochasen output) drives per-morpheme extraction and
    ``wakati`` (-Owakati output) drives plain word segmentation.

    Class attributes (shared across all instances):
      kinds     -- tuple of part-of-speech prefixes extract() keeps,
                   or None to keep everything
      log_stamp -- extract() prints progress every ``log_stamp`` lines
    """
    kinds = None
    log_stamp = 1000

    @classmethod
    def set_log(cls, num):
        """Make extract() print progress every ``num`` input lines."""
        cls.log_stamp = num

    @classmethod
    def set_kind(cls, *args):
        """Keep only morphemes whose POS starts with one of ``args``."""
        cls.kinds = args
        return cls

    @classmethod
    def reset_kind(cls):
        """Drop the POS filter installed by set_kind()."""
        cls.kinds = None
        return cls

    def set_file(self, file, up_to=100000000):
        """Stream input lines from ``file`` (UTF-8, decode errors ignored).

        ``up_to`` caps the number of lines consumed; the default is large
        enough to behave like "no limit" (the original accepted but
        silently ignored this parameter).
        """
        # Bug fix: ``codecs`` was referenced here without ever being
        # imported anywhere in the file (only MeCab was imported).
        import codecs
        import itertools
        fi = codecs.open(file, 'r', 'utf-8', 'ignore')
        # NOTE(review): the handle is only closed on garbage collection;
        # callers consume it lazily via extract()/wakatigaki().
        self.lines = itertools.islice(fi, up_to)
        return self

    def set_sentence(self, sentence):
        """Use an in-memory string, split on newlines, as the input."""
        self.lines = sentence.split('\n')
        return self

    def set_chasen(self, s):
        """Replace the chasen tagger with ``MeCab.Tagger(s)``."""
        import MeCab
        self.chasen = MeCab.Tagger(s)
        return self

    def set_wakati(self, s):
        """Replace the wakati tagger with ``MeCab.Tagger(s)``."""
        import MeCab
        self.wakati = MeCab.Tagger(s)
        return self

    def __init__(self, dic_path=None):
        """Build both taggers against ``dic_path``; if that fails (e.g. an
        invalid dictionary path) fall back to MeCab's default dictionary."""
        try:
            self.chasen = MeCab.Tagger('-Ochasen -d {}'.format(dic_path))
            self.wakati = MeCab.Tagger('-Owakati -d {}'.format(dic_path))
        except Exception:  # narrowed from bare except; MeCab raises RuntimeError on a bad -d
            self.chasen = MeCab.Tagger('-Ochasen')
            self.wakati = MeCab.Tagger('-Owakati')

    def wakatigaki(self):
        """Return the whole input segmented into space-separated words."""
        res = ''
        for line in self.lines:
            res += self.wakati.parse(line)
        return res

    def extract(self, feature=True):
        """Generate morphemes from the input lines.

        Yields (base_form, pos) tuples when ``feature`` is true, bare base
        forms otherwise.  When Morph.kinds is set, only morphemes whose POS
        starts with one of those prefixes are yielded (pos is then the
        matching prefix itself).
        """
        feature_flag = feature
        tagger = self.chasen
        for i, line in enumerate(self.lines):
            line = line.strip()
            if (i + 1) % self.log_stamp == 0:
                print('line {}'.format(i + 1))
            # The last line of chasen output is the EOS marker -- skip it.
            chunks = tagger.parse(line).splitlines()[:-1]
            for chunk in chunks:
                try:
                    # chasen columns: surface \t reading \t base \t POS ...
                    # keep only (base form, POS).
                    _surface, _yomi, origin, feature = chunk.split('\t')[:4]
                except ValueError:  # malformed line: fewer than 4 columns
                    print('×', end="/")
                    continue
                origin = origin.lower()
                if Morph.kinds is None:
                    if feature_flag:
                        # top-level POS only, e.g. '名詞-一般' -> '名詞'
                        yield (origin, feature.split('-')[0])
                    else:
                        yield origin
                    continue
                for kind in Morph.kinds:
                    if feature.startswith(kind):
                        if feature_flag:
                            yield (origin, kind)
                        else:
                            yield origin
                        break
        return
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Prepare `txt` (the text to tokenize) and `path` (the MeCab dictionary
# directory) beforehand.
mp = Morph(dic_path=path).set_sentence(txt)
print(list(mp.extract()))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from google.cloud import language | |
class GCPNaturalLanguage(object):
    """Entity extraction via the Google Cloud Natural Language API."""

    def __init__(self, upper=10000):
        # One API client, reused for every request.
        self.client = language.Client()
        # get_entity() refuses texts longer than this many characters.
        self.upper = upper

    def get_entity(self, text):
        """Return {begin_offset: mention_text} for every entity mention
        found in ``text`` (analyzed as Japanese).

        Texts longer than ``self.upper`` characters are skipped and an
        empty dict is returned.
        """
        if len(text) > self.upper:
            print("{} .. too long".format(len(text)))
            return {}
        document = self.client.document_from_text(text, language='ja')
        res = document.analyze_entities()
        print("{} characters => done!".format(len(text)))
        offsets = {}
        for entity in res.entities:
            for mention in entity.mentions:
                offsets[mention.text.begin_offset] = mention.text.content
        return offsets
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# NOTE(review): the original `import GCPNaturalLanguage` bound a *module*
# and then called it as a class, which raises TypeError.  Import the class
# from the module instead -- confirm the actual module filename.
from GCPNaturalLanguage import GCPNaturalLanguage

# `txt` is the string obtained by scraping.
gcn = GCPNaturalLanguage()
dic = gcn.get_entity(txt)  # maps character offset (key) -> word (value)
words = dic.values()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# 頻度表 | |
import collections | |
import Morph | |
def create_word_hist(file, path):
    """Build a frequency table of the morphemes found in ``file``.

    ``path`` is the MeCab system-dictionary directory used for tokenizing.
    Returns a defaultdict(int) mapping each item yielded by Morph.extract()
    (normally an (origin, kind) tuple) to its occurrence count.
    """
    morph = Morph(dic_path=path).set_file(file)
    counts = collections.defaultdict(int)
    for token in morph.extract():
        counts[token] += 1
    return counts
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
from pandas import Series, DataFrame | |
# wordsはmecabの辞書に登録されていないword | |
def save_chasen_csv(words, file=None):
    """Build a ChaSen-format MeCab user-dictionary table for ``words``.

    Each word becomes a generic-noun entry (品詞=名詞, 品詞細分類1=一般)
    whose surface form equals its base form; the cost and context-ID
    columns are left empty so that mecab-dict-index can estimate them
    (the -a option).

    If ``file`` is given, the table is also written there as a
    header-less, index-less CSV, creating parent directories as needed.
    Returns the DataFrame in either case.
    """
    chasen_mapping = \
        ['表層形', '左文脈ID', '右文脈ID', 'コスト', '品詞',
         '品詞細分類1', '品詞細分類2', '品詞細分類3',
         '活用形', '活用型', '原形', '読み', '発音']
    word_series_list = []
    for w in words:
        word_series = Series([None] * len(chasen_mapping), index=chasen_mapping)
        word_series[['表層形', '品詞', '品詞細分類1', '原形']] = [w, '名詞', '一般', w]
        word_series[['品詞細分類2', '品詞細分類3', '活用形', '活用型']] = '*'
        word_series_list.append(word_series)
    new_word_df = DataFrame(word_series_list)
    if file is not None:
        parent = os.path.dirname(file)
        # Bug fix: os.makedirs('') raises FileNotFoundError when `file`
        # has no directory component -- only create dirs when one exists.
        if parent:
            os.makedirs(parent, exist_ok=True)
        new_word_df.to_csv(file, index=False, header=False)
    return new_word_df
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Frequency table.
# wiki_file .. prepare as described at http://eyepodtouch.net/?p=77
# path      .. a MeCab system-dictionary path such as
#              "/usr/local/lib/mecab/dic/mecab-ipadic-neologd"
# The wikipedia dump is about 2.5GB, so this takes a while.
counts = create_word_hist(wiki_file, path)
# From `words`, drop the vocabulary already appearing in `counts`.
# (extract_nonexist_words is left for you to implement.)
new_words = extract_nonexist_words(words, counts)
# new_words: list of words not yet registered in the MeCab dictionary.
save_chasen_csv(new_words, file="/home/foo/bar/test_dic.csv")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## http://qiita.com/wakisuke/items/d15b5defc1aad61cc910 を参考にした
# IPA辞書のモデルをutf-8に変換する
% bzip2 -d mecab-ipadic-2.7.0-20070801.model.bz2
% vi ./mecab-ipadic-2.7.0-20070801.model # 6行目を「charset: utf-8」に書き換えてください
% nkf -w --overwrite ./mecab-ipadic-2.7.0-20070801.model # 文字コードをutf-8へ変換
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment