Created
January 12, 2020 11:39
-
-
Save ftnext/2e14fb57c96ca276ee4d28eccfcecd96 to your computer and use it in GitHub Desktop.
「Janome ではじめるテキストマイニング」02, 03の写経(WordCloud) ref:https://github.com/mocobeta/janome-tutorial
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from janome.tokenizer import Tokenizer | |
import matplotlib.pyplot as plt | |
from wordcloud import WordCloud | |
def split_text(src, out):
    """Tokenize the Japanese text in *src* and write it to *out*.

    Each input line is split into surface forms with Janome and written as
    one space-separated line, so downstream tools (e.g. WordCloud) can treat
    the output like whitespace-delimited western text.
    """
    tokenizer = Tokenizer()
    with open(src, encoding='utf8') as reader, \
            open(out, 'w', encoding='utf8') as writer:
        for line in reader:
            # wakati=True makes tokenize() yield plain surface strings
            surfaces = tokenizer.tokenize(line, wakati=True)
            writer.write(' '.join(surfaces) + '\n')
def show_wordcloud(file):
    """Render a word cloud from the (pre-tokenized) text in *file*.

    The image is written to ``wordcloud.png`` in the working directory.
    A Japanese-capable font (ipagp.ttf) is required so the glyphs render.
    """
    with open(file, encoding='utf8') as f:
        contents = f.read()
    cloud = WordCloud(
        font_path='ipagp.ttf',
        background_color='white',
        width=1024,
        height=674,
    )
    cloud.generate(contents)
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis('off')
    plt.savefig('wordcloud.png')
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from janome.analyzer import Analyzer | |
from janome.tokenfilter import ( | |
# CompoundNounFilter, | |
ExtractAttributeFilter, | |
POSKeepFilter, | |
POSStopFilter, | |
TokenCountFilter, | |
TokenFilter | |
) | |
from janome.tokenizer import Tokenizer | |
import matplotlib.pyplot as plt | |
from wordcloud import WordCloud | |
# Upper bound on the number of (word, count) pairs returned by wc().
MAX_WORDS = 20
class StopWordFilter(TokenFilter):
    """Janome token filter that drops tokens whose base form is a stop word.

    Stop words can be supplied directly as a list and/or loaded from a file
    containing one word per line (trailing newlines are stripped).
    """

    def __init__(self, word_list=None, word_list_file=''):
        """
        Args:
            word_list: optional list of stop words.
            word_list_file: optional path to a UTF-8 stop-word file,
                one word per line.
        """
        if word_list is None:
            word_list = []
        assert isinstance(word_list, list), 'ストップワードがlistで渡されていません'
        # Store stop words in a set: apply() tests membership once per token,
        # and a set makes that O(1) instead of O(n) on a list.
        self._word_list = set(word_list)
        if word_list_file:
            with open(word_list_file, encoding='utf8') as f:
                # strip() removes the newline each line of the file carries
                self._word_list.update(word.strip() for word in f)

    def apply(self, tokens):
        """Yield only the tokens whose base_form is not a stop word."""
        for token in tokens:
            if token.base_form not in self._word_list:
                yield token
def split_text(src, out):
    """Morphologically analyze *src* and write filtered base forms to *out*.

    Keeps nouns, custom nouns, adjectives, adjectival nouns and
    interjections, then drops dependent and pronoun noun sub-categories,
    and writes the base form of each surviving token, space-separated,
    one output line per input line.
    """
    filters = [
        POSKeepFilter(['名詞', 'カスタム名詞', '形容詞', '形容動詞', '感動詞']),
        # Dropping these POS sub-categories has the same effect as a
        # stop-word list, without enumerating the words themselves.
        POSStopFilter(['名詞,非自立', '名詞,代名詞']),
        ExtractAttributeFilter('base_form'),
    ]
    analyzer = Analyzer(
        tokenizer=Tokenizer('udic.csv', udic_type='simpledic'),
        token_filters=filters)
    with open(src, encoding='utf8') as reader, \
            open(out, 'w', encoding='utf8') as writer:
        for line in reader:
            # analyze() yields the post-filter base forms as strings
            writer.write(' '.join(analyzer.analyze(line)) + '\n')
def wc(file, pos=None, word_list_file=''):
    """Count word frequencies in *file* and return the top MAX_WORDS pairs.

    Args:
        pos: list of part-of-speech prefixes to keep (None/empty keeps all).
        word_list_file: optional stop-word file passed to StopWordFilter.

    Returns:
        List of (base_form, count) tuples, most frequent first.
    """
    keep_pos = [] if pos is None else pos
    analyzer = Analyzer(
        tokenizer=Tokenizer('udic.csv', udic_type='simpledic'),
        token_filters=[
            POSKeepFilter(keep_pos),
            StopWordFilter(word_list_file=word_list_file),
            # sorted=True yields (base_form, count) pairs, highest count first
            TokenCountFilter('base_form', sorted=True),
        ])
    with open(file, encoding='utf8') as f:
        contents = f.read()
    return list(analyzer.analyze(contents))[:MAX_WORDS]
# 3-1の状態 | |
# def wc(file): | |
# token_filters = [TokenCountFilter()] # sorted=Trueの指定 | |
# a = Analyzer(tokenizer=Tokenizer(), token_filters=token_filters) | |
# word_count_dict = {} | |
# with open(file, encoding='utf8') as f: | |
# for line in f: | |
# for key, value in a.analyze(line): | |
# if key not in word_count_dict: | |
# word_count_dict[key] = 0 | |
# word_count_dict[key] += value | |
# word_count_list = [(key, value) for key, value in word_count_dict.items()] | |
# sorted_word_count = sorted( | |
# word_count_list, key=lambda x: x[1], reverse=True) | |
# return sorted_word_count[:MAX_WORDS] | |
# text = 'すもももももももものうち' | |
# token_filters = [TokenCountFilter()] | |
# a = Analyzer(tokenizer=Tokenizer(), token_filters=token_filters) | |
# for key, value in a.analyze(text): | |
# print(f'{key}: {value}') | |
def show_wordcloud(file):
    """Build a word cloud from *file* and save it to ``wordcloud.png``.

    Requires a Japanese-capable font (ipagp.ttf) in the working directory.
    """
    with open(file, encoding='utf8') as f:
        text = f.read()
    # collocations=False keeps repeated words from being merged into bigrams
    cloud = WordCloud(
        font_path='ipagp.ttf',
        background_color='white',
        width=1024,
        height=674,
        collocations=False,
    )
    cloud.generate(text)
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis('off')
    plt.savefig('wordcloud.png')
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
する | |
いる | |
なる | |
言う | |
れる | |
てる | |
ある | |
おる | |
がる | |
できる | |
られる | |
いう | |
思う | |
の | |
よう | |
それ | |
ほう | |
人 | |
一 | |
二 | |
三 | |
中 | |
ん | |
ところ | |
そっち | |
こっち | |
とき | |
こと | |
そこ | |
さん | |
だれ | |
なん | |
そう | |
ここ | |
みんな | |
来る | |
行く |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
又三郎 | カスタム名詞 | マタサブロウ | |
嘉助 | カスタム名詞 | カスケ | |
佐太郎 | カスタム名詞 | サタロウ | |
耕助 | カスタム名詞 | コウスケ | |
一郎 | カスタム名詞 | イチロウ | |
雨三郎 | カスタム名詞 | アメサブロウ | |
一年生 | カスタム名詞 | イチネンセイ | |
二年生 | カスタム名詞 | ニネンセイ | |
三年生 | カスタム名詞 | サンネンセイ | |
四年生 | カスタム名詞 | ヨネンセイ | |
五年生 | カスタム名詞 | ゴネンセイ | |
六年生 | カスタム名詞 | ロクネンセイ | |
木ペン | カスタム名詞 | キペン |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment