Let's boost MeCab ref: https://qiita.com/knknkn1162/items/8c12f42dd167aae01c02
# Add the new-word dictionary mecab-ipadic-neologd (from the second run onward this updates the dictionary)
./bin/install-mecab-ipadic-neologd -n -a
# On the terminal it can now be used like this:
mecab -d /usr/local/lib/mecab/dic/mecab-ipadic-neologd
# Move to the directory that holds the ChaSen-format CSV of new words
cd /home/foo/bar
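# A quick check from Python that the neologd dictionary loads (a minimal sketch,
# assuming the mecab-python3 binding is installed):
import MeCab
tagger = MeCab.Tagger('-d /usr/local/lib/mecab/dic/mecab-ipadic-neologd')
print(tagger.parse('テスト'))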
# Automatic cost estimation for the vocabulary picked up via the GCP Natural Language API
## On macOS (on Linux use /usr/libexec/mecab/mecab-dict-index)
/usr/local/Cellar/mecab/0.996/libexec/mecab/mecab-dict-index -m ./mecab-ipadic-2.7.0-20070801.model \
  -d ./mecab-ipadic -f utf-8 -t utf-8 \
  -a test_dic.csv -u new_test_dic.csv
# Expected output:
#   ./mecab-ipadic-2.7.0-20070801.model is not a binary model. reopen it as text mode...
#   reading new_test_dic.csv ...
#   done!
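# To sanity-check the estimated costs, a small sketch (an assumption: the output
# CSV keeps the ChaSen column order used by save_chasen_csv below, surface form
# first and the cost in the fourth column):
import csv
with open('new_test_dic.csv', encoding='utf-8') as f:
    for row in csv.reader(f):
        print(row[0], row[3])  # surface form and its estimated cost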
# Build the dictionary again
/usr/local/Cellar/mecab/0.996/libexec/mecab/mecab-dict-index -m ./mecab-ipadic-2.7.0-20070801.model \
  -d ./mecab-ipadic \
  -f utf-8 -t utf-8 \
  -u ./site.dic \
  ./new_test_dic.csv # the CSV to be newly added
vim /usr/local/etc/mecabrc
# Register the user dictionary
# Add this on any suitable line
userdic = /home/foo/bar/site.dic
# On the terminal it can now be used like this:
mecab -d /usr/local/lib/mecab/dic/mecab-ipadic-neologd
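# The same check from Python; -u points MeCab at the compiled user dictionary
# (a minimal sketch, assuming the mecab-python3 binding):
import MeCab
tagger = MeCab.Tagger('-d /usr/local/lib/mecab/dic/mecab-ipadic-neologd -u /home/foo/bar/site.dic')
print(tagger.parse('テスト'))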
import codecs
import MeCab


class Morph(object):
    kinds = None
    log_stamp = 1000

    @classmethod
    def set_log(cls, num):
        cls.log_stamp = num

    @classmethod
    def set_kind(cls, *args):
        cls.kinds = args
        return cls

    @classmethod
    def reset_kind(cls):
        cls.kinds = None
        return cls

    def set_file(self, file, up_to=100000000):
        # up_to is accepted but unused in the original code
        fi = codecs.open(file, 'r', 'utf-8', 'ignore')
        self.lines = fi
        return self

    def set_sentence(self, sentence):
        self.lines = sentence.split('\n')
        return self

    def set_chasen(self, s):
        self.chasen = MeCab.Tagger(s)
        return self

    def set_wakati(self, s):
        self.wakati = MeCab.Tagger(s)
        return self

    def __init__(self, dic_path=None):
        try:
            self.chasen = MeCab.Tagger('-Ochasen -d {}'.format(dic_path))
            self.wakati = MeCab.Tagger('-Owakati -d {}'.format(dic_path))
        except RuntimeError:
            # Fall back to the system dictionary when dic_path is missing or invalid
            self.chasen = MeCab.Tagger('-Ochasen')
            self.wakati = MeCab.Tagger('-Owakati')

    def wakatigaki(self):
        res = ''
        for line in self.lines:
            res += self.wakati.parse(line)
        return res

    def extract(self, feature=True):
        '''Generator of (base form, POS) tuples, or base forms only when feature=False.'''
        feature_flag = feature
        tagger = self.chasen
        for i, line in enumerate(self.lines):
            line = line.strip()
            if (i + 1) % self.log_stamp == 0:
                print('line {}'.format(i + 1))
            # The last line is EOS, so skip it
            chunks = tagger.parse(line).splitlines()[:-1]
            for idx, chunk in enumerate(chunks):
                try:
                    # Each chunk is surface\treading\tbase form\tPOS; keep base form and POS
                    _surface, _yomi, origin, feature = chunk.split('\t')[:4]
                except ValueError:
                    print('×', end="/")
                    continue
                origin = origin.lower()
                if Morph.kinds is None:
                    if feature_flag:
                        yield (origin, feature.split('-')[0])
                    else:
                        yield origin
                    continue
                for kind in Morph.kinds:
                    if feature.startswith(kind):
                        if feature_flag:
                            yield (origin, kind)
                        else:
                            yield origin
                        break
# Prepare txt (the target text) in advance
mp = Morph(dic_path=path)
mp.set_sentence(txt)
print(list(mp.extract()))
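# set_kind/reset_kind defined above can restrict extraction to given POS prefixes,
# e.g. nouns (名詞) and verbs (動詞) only; a hypothetical usage:
Morph.set_kind('名詞', '動詞')
mp = Morph(dic_path=path)
mp.set_sentence(txt)
print(list(mp.extract()))  # only (base form, kind) pairs whose POS matches
Morph.reset_kind()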
from google.cloud import language


class GCPNaturalLanguage(object):
    def __init__(self, upper=10000):
        # Instantiate a client
        self.client = language.Client()
        self.upper = upper

    def get_entity(self, text):
        length = len(text)
        if length > self.upper:
            print("{} .. too long".format(length))
            return {}
        document = self.client.document_from_text(text, language='ja')
        # Detect the entities in the text
        res = document.analyze_entities()
        print("{} characters => done!".format(len(text)))
        dic = {}
        for entity in res.entities:
            for m in entity.mentions:
                dic.update({m.text.begin_offset: m.text.content})
        return dic
# Assuming the GCPNaturalLanguage class above is saved as gcp_natural_language.py
from gcp_natural_language import GCPNaturalLanguage

# txt is a string obtained by scraping
gcn = GCPNaturalLanguage()
dic = gcn.get_entity(txt)  # keys are word offsets, values are the words themselves
words = dic.values()
# Frequency table
import collections
from morph import Morph  # assuming the Morph class above is saved as morph.py


def create_word_hist(file, path):
    mp = Morph(dic_path=path)
    mp.set_file(file)
    # mp.extract() returns a generator of tokenized (origin, kind) tuples
    counts = collections.defaultdict(int)
    for tup in mp.extract():
        counts[tup] += 1
    return counts
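# Hypothetical usage: list the ten most frequent (origin, kind) pairs, assuming a
# plain-text corpus wiki.txt and the neologd dictionary path from above
counts = create_word_hist('wiki.txt', '/usr/local/lib/mecab/dic/mecab-ipadic-neologd')
for tup, n in sorted(counts.items(), key=lambda kv: kv[1], reverse=True)[:10]:
    print(tup, n)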
import os
from pandas import Series, DataFrame


# words are words that are not yet registered in the MeCab dictionary
def save_chasen_csv(words, file=None):
    # ChaSen CSV columns: surface form, left context ID, right context ID, cost,
    # POS, POS subcategories 1-3, conjugation form, conjugation type,
    # base form, reading, pronunciation
    chasen_mapping = \
        ['表層形', '左文脈ID', '右文脈ID', 'コスト', '品詞',
         '品詞細分類1', '品詞細分類2', '品詞細分類3',
         '活用形', '活用型', '原形', '読み', '発音']
    word_series_list = []
    for w in words:
        word_series = Series([None] * len(chasen_mapping), index=chasen_mapping)
        # Register every new word as a common noun (名詞, 一般)
        word_series[['表層形', '品詞', '品詞細分類1', '原形']] = [w, '名詞', '一般', w]
        word_series[['品詞細分類2', '品詞細分類3', '活用形', '活用型']] = '*'
        word_series_list.append(word_series)
    new_word_df = DataFrame(word_series_list)
    if file is not None:
        os.makedirs(os.path.dirname(file), exist_ok=True)
        new_word_df.to_csv(file, index=False, header=False)
    return new_word_df
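# A hypothetical call: one unregistered word becomes one ChaSen-format row whose
# context IDs, cost, reading and pronunciation stay empty, ready for
# mecab-dict-index -a to estimate the cost later
df = save_chasen_csv(['新語'], file='/home/foo/bar/test_dic.csv')
print(df.to_csv(index=False, header=False))  # 新語,,,,名詞,一般,*,*,*,*,新語,,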
# Frequency table
# wiki_file .. prepare it by following http://eyepodtouch.net/?p=77
# path .. path of MeCab's system dictionary, e.g. "/usr/local/lib/mecab/dic/mecab-ipadic-neologd"
## The Wikipedia file is about 2.5 GB, so this takes a while
counts = create_word_hist(wiki_file, path)
# From the given words, remove the vocabulary that already appears in counts
## Provide this function yourself (one possible sketch follows below)
new_words = extract_nonexist_words(words, counts)
# new_words is the list of words not registered in the MeCab dictionary
save_chasen_csv(new_words, file="/home/foo/bar/test_dic.csv")
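# One possible extract_nonexist_words, as mentioned above (the gist leaves the
# implementation to the reader; this is only a sketch):
def extract_nonexist_words(words, counts):
    # counts is keyed by (origin, kind) tuples from Morph.extract(), with origin
    # lowercased, so compare against the lowercased candidates
    known = {origin for origin, _kind in counts.keys()}
    return [w for w in set(words) if w.lower() not in known]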
## Based on http://qiita.com/wakisuke/items/d15b5defc1aad61cc910
# Convert the IPA dictionary model to utf-8
% bzip2 -d mecab-ipadic-2.7.0-20070801.model.bz2
% vi ./mecab-ipadic-2.7.0-20070801.model # rewrite line 6 to "charset: utf-8"
% nkf -w --overwrite ./mecab-ipadic-2.7.0-20070801.model # convert the file encoding to utf-8
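# To confirm the rewrite took, a small Python sketch that prints the header;
# line 6 should now read "charset: utf-8"
with open('./mecab-ipadic-2.7.0-20070801.model', encoding='utf-8') as f:
    for _ in range(6):
        print(f.readline(), end='')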