Let's boost MeCab ref: https://qiita.com/knknkn1162/items/8c12f42dd167aae01c02
# Add the new-word dictionary mecab-ipadic-neologd (from the second run onward this updates the dictionary)
./bin/install-mecab-ipadic-neologd -n -a
# On the terminal it can now be used like this:
mecab -d /usr/local/lib/mecab/dic/mecab-ipadic-neologd
# Move to the directory that holds the ChaSen-format CSV of new words
cd /home/foo/bar
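# A quick check from Python that the neologd dictionary loads (a minimal sketch,
# assuming the mecab-python3 binding is installed):
import MeCab
tagger = MeCab.Tagger('-d /usr/local/lib/mecab/dic/mecab-ipadic-neologd')
print(tagger.parse('テスト'))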
# Automatic cost estimation for the vocabulary picked up via the GCP Natural Language API
## On macOS (on Linux use /usr/libexec/mecab/mecab-dict-index)
/usr/local/Cellar/mecab/0.996/libexec/mecab/mecab-dict-index -m ./mecab-ipadic-2.7.0-20070801.model \
  -d ./mecab-ipadic -f utf-8 -t utf-8 \
  -a test_dic.csv -u new_test_dic.csv
# Expected output:
#   ./mecab-ipadic-2.7.0-20070801.model is not a binary model. reopen it as text mode...
#   reading new_test_dic.csv ...
#   done!
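# To sanity-check the estimated costs, a small sketch (an assumption: the output
# CSV keeps the ChaSen column order used by save_chasen_csv below, surface form
# first and the cost in the fourth column):
import csv
with open('new_test_dic.csv', encoding='utf-8') as f:
    for row in csv.reader(f):
        print(row[0], row[3])  # surface form and its estimated cost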
# Build the dictionary again
/usr/local/Cellar/mecab/0.996/libexec/mecab/mecab-dict-index -m ./mecab-ipadic-2.7.0-20070801.model \
  -d ./mecab-ipadic \
  -f utf-8 -t utf-8 \
  -u ./site.dic \
  ./new_test_dic.csv # the CSV to be newly added
vim /usr/local/etc/mecabrc
# Register the user dictionary
# Add this on any suitable line
userdic = /home/foo/bar/site.dic
# On the terminal it can now be used like this:
mecab -d /usr/local/lib/mecab/dic/mecab-ipadic-neologd
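# The same check from Python; -u points MeCab at the compiled user dictionary
# (a minimal sketch, assuming the mecab-python3 binding):
import MeCab
tagger = MeCab.Tagger('-d /usr/local/lib/mecab/dic/mecab-ipadic-neologd -u /home/foo/bar/site.dic')
print(tagger.parse('テスト'))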
import codecs
import MeCab


class Morph(object):
    kinds = None
    log_stamp = 1000

    @classmethod
    def set_log(cls, num):
        cls.log_stamp = num

    @classmethod
    def set_kind(cls, *args):
        cls.kinds = args
        return cls

    @classmethod
    def reset_kind(cls):
        cls.kinds = None
        return cls

    def set_file(self, file, up_to=100000000):
        # up_to is accepted but unused in the original code
        fi = codecs.open(file, 'r', 'utf-8', 'ignore')
        self.lines = fi
        return self

    def set_sentence(self, sentence):
        self.lines = sentence.split('\n')
        return self

    def set_chasen(self, s):
        self.chasen = MeCab.Tagger(s)
        return self

    def set_wakati(self, s):
        self.wakati = MeCab.Tagger(s)
        return self

    def __init__(self, dic_path=None):
        try:
            self.chasen = MeCab.Tagger('-Ochasen -d {}'.format(dic_path))
            self.wakati = MeCab.Tagger('-Owakati -d {}'.format(dic_path))
        except RuntimeError:
            # Fall back to the system dictionary when dic_path is missing or invalid
            self.chasen = MeCab.Tagger('-Ochasen')
            self.wakati = MeCab.Tagger('-Owakati')

    def wakatigaki(self):
        res = ''
        for line in self.lines:
            res += self.wakati.parse(line)
        return res

    def extract(self, feature=True):
        '''Generator of (base form, POS) tuples, or base forms only when feature=False.'''
        feature_flag = feature
        tagger = self.chasen
        for i, line in enumerate(self.lines):
            line = line.strip()
            if (i + 1) % self.log_stamp == 0:
                print('line {}'.format(i + 1))
            # The last line is EOS, so skip it
            chunks = tagger.parse(line).splitlines()[:-1]
            for idx, chunk in enumerate(chunks):
                try:
                    # Each chunk is surface\treading\tbase form\tPOS; keep base form and POS
                    _surface, _yomi, origin, feature = chunk.split('\t')[:4]
                except ValueError:
                    print('×', end="/")
                    continue
                origin = origin.lower()
                if Morph.kinds is None:
                    if feature_flag:
                        yield (origin, feature.split('-')[0])
                    else:
                        yield origin
                    continue
                for kind in Morph.kinds:
                    if feature.startswith(kind):
                        if feature_flag:
                            yield (origin, kind)
                        else:
                            yield origin
                        break
# Prepare txt (the target text) in advance
mp = Morph(dic_path=path)
mp.set_sentence(txt)
print(list(mp.extract()))
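# set_kind/reset_kind defined above can restrict extraction to given POS prefixes,
# e.g. nouns (名詞) and verbs (動詞) only; a hypothetical usage:
Morph.set_kind('名詞', '動詞')
mp = Morph(dic_path=path)
mp.set_sentence(txt)
print(list(mp.extract()))  # only (base form, kind) pairs whose POS matches
Morph.reset_kind()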
from google.cloud import language


class GCPNaturalLanguage(object):
    def __init__(self, upper=10000):
        # Instantiate a client
        self.client = language.Client()
        self.upper = upper

    def get_entity(self, text):
        length = len(text)
        if length > self.upper:
            print("{} .. too long".format(length))
            return {}
        document = self.client.document_from_text(text, language='ja')
        # Detect the entities in the text
        res = document.analyze_entities()
        print("{} characters => done!".format(len(text)))
        dic = {}
        for entity in res.entities:
            for m in entity.mentions:
                dic.update({m.text.begin_offset: m.text.content})
        return dic
# Assuming the GCPNaturalLanguage class above is saved as gcp_natural_language.py
from gcp_natural_language import GCPNaturalLanguage

# txt is a string obtained by scraping
gcn = GCPNaturalLanguage()
dic = gcn.get_entity(txt)  # keys are word offsets, values are the words themselves
words = dic.values()
# Frequency table
import collections
from morph import Morph  # assuming the Morph class above is saved as morph.py


def create_word_hist(file, path):
    mp = Morph(dic_path=path)
    mp.set_file(file)
    # mp.extract() returns a generator of tokenized (origin, kind) tuples
    counts = collections.defaultdict(int)
    for tup in mp.extract():
        counts[tup] += 1
    return counts
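# Hypothetical usage: list the ten most frequent (origin, kind) pairs, assuming a
# plain-text corpus wiki.txt and the neologd dictionary path from above
counts = create_word_hist('wiki.txt', '/usr/local/lib/mecab/dic/mecab-ipadic-neologd')
for tup, n in sorted(counts.items(), key=lambda kv: kv[1], reverse=True)[:10]:
    print(tup, n)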
import os
from pandas import Series, DataFrame


# words are words that are not yet registered in the MeCab dictionary
def save_chasen_csv(words, file=None):
    # ChaSen CSV columns: surface form, left context ID, right context ID, cost,
    # POS, POS subcategories 1-3, conjugation form, conjugation type,
    # base form, reading, pronunciation
    chasen_mapping = \
        ['表層形', '左文脈ID', '右文脈ID', 'コスト', '品詞',
         '品詞細分類1', '品詞細分類2', '品詞細分類3',
         '活用形', '活用型', '原形', '読み', '発音']
    word_series_list = []
    for w in words:
        word_series = Series([None] * len(chasen_mapping), index=chasen_mapping)
        # Register every new word as a common noun (名詞, 一般)
        word_series[['表層形', '品詞', '品詞細分類1', '原形']] = [w, '名詞', '一般', w]
        word_series[['品詞細分類2', '品詞細分類3', '活用形', '活用型']] = '*'
        word_series_list.append(word_series)
    new_word_df = DataFrame(word_series_list)
    if file is not None:
        os.makedirs(os.path.dirname(file), exist_ok=True)
        new_word_df.to_csv(file, index=False, header=False)
    return new_word_df
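# A hypothetical call: one unregistered word becomes one ChaSen-format row whose
# context IDs, cost, reading and pronunciation stay empty, ready for
# mecab-dict-index -a to estimate the cost later
df = save_chasen_csv(['新語'], file='/home/foo/bar/test_dic.csv')
print(df.to_csv(index=False, header=False))  # 新語,,,,名詞,一般,*,*,*,*,新語,,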
# Frequency table
# wiki_file .. prepare it by following http://eyepodtouch.net/?p=77
# path .. path of MeCab's system dictionary, e.g. "/usr/local/lib/mecab/dic/mecab-ipadic-neologd"
## The Wikipedia file is about 2.5 GB, so this takes a while
counts = create_word_hist(wiki_file, path)
# From the given words, remove the vocabulary that already appears in counts
## Provide this function yourself (one possible sketch follows below)
new_words = extract_nonexist_words(words, counts)
# new_words is the list of words not registered in the MeCab dictionary
save_chasen_csv(new_words, file="/home/foo/bar/test_dic.csv")
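# One possible extract_nonexist_words, as mentioned above (the gist leaves the
# implementation to the reader; this is only a sketch):
def extract_nonexist_words(words, counts):
    # counts is keyed by (origin, kind) tuples from Morph.extract(), with origin
    # lowercased, so compare against the lowercased candidates
    known = {origin for origin, _kind in counts.keys()}
    return [w for w in set(words) if w.lower() not in known]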
## Based on http://qiita.com/wakisuke/items/d15b5defc1aad61cc910
# Convert the IPA dictionary model to utf-8
% bzip2 -d mecab-ipadic-2.7.0-20070801.model.bz2
% vi ./mecab-ipadic-2.7.0-20070801.model # rewrite line 6 to "charset: utf-8"
% nkf -w --overwrite ./mecab-ipadic-2.7.0-20070801.model # convert the file encoding to utf-8
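# To confirm the rewrite took, a small Python sketch that prints the header;
# line 6 should now read "charset: utf-8"
with open('./mecab-ipadic-2.7.0-20070801.model', encoding='utf-8') as f:
    for _ in range(6):
        print(f.readline(), end='')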