Navigation Menu

Skip to content

Instantly share code, notes, and snippets.

@ftnext
Created January 12, 2020 11:39
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ftnext/2e14fb57c96ca276ee4d28eccfcecd96 to your computer and use it in GitHub Desktop.
「Janome ではじめるテキストマイニング」02, 03の写経(WordCloud) ref:https://github.com/mocobeta/janome-tutorial
from janome.tokenizer import Tokenizer
import matplotlib.pyplot as plt
from wordcloud import WordCloud
def split_text(src, out):
    """Tokenize the Japanese text in *src* and write it to *out* wakati-gaki style.

    Each input line becomes one output line whose token surface forms are
    joined by single spaces.
    """
    tokenizer = Tokenizer()
    with open(src, encoding='utf8') as fin, \
            open(out, 'w', encoding='utf8') as fout:
        for line in fin:
            # wakati=True yields surface-form strings directly
            # instead of Token objects.
            surfaces = tokenizer.tokenize(line, wakati=True)
            fout.write(' '.join(surfaces) + '\n')
def show_wordcloud(file):
    """Render a word cloud from the whitespace-separated text in *file*.

    The rendered figure is written to ``wordcloud.png``.
    """
    with open(file, encoding='utf8') as f:
        text = f.read()
    cloud = WordCloud(
        font_path='ipagp.ttf',  # font file that can render Japanese glyphs
        background_color='white',
        width=1024,
        height=674,
    ).generate(text)
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis('off')
    plt.savefig('wordcloud.png')
from janome.analyzer import Analyzer
from janome.tokenfilter import (
# CompoundNounFilter,
ExtractAttributeFilter,
POSKeepFilter,
POSStopFilter,
TokenCountFilter,
TokenFilter
)
from janome.tokenizer import Tokenizer
import matplotlib.pyplot as plt
from wordcloud import WordCloud
# Maximum number of (word, count) pairs returned by wc().
MAX_WORDS = 20
class StopWordFilter(TokenFilter):
    """Janome token filter that drops tokens whose base form is a stop word.

    Stop words may be supplied as a list and/or loaded from a text file
    (one word per line; surrounding whitespace is stripped).
    """

    def __init__(self, word_list=None, word_list_file=''):
        if word_list is None:
            word_list = []
        assert isinstance(word_list, list), 'ストップワードがlistで渡されていません'
        self._word_list = []
        self._word_list += word_list
        if word_list_file:
            with open(word_list_file, encoding='utf8') as f:
                # strip() removes the trailing newline each file line carries.
                self._word_list += [word.strip() for word in f]
        # A set gives O(1) membership tests in apply(); the list is kept so
        # any existing code inspecting _word_list keeps working.
        self._word_set = set(self._word_list)

    def apply(self, tokens):
        """Yield only the tokens whose base_form is not a stop word."""
        for token in tokens:
            if token.base_form not in self._word_set:
                yield token
def split_text(src, out):
    """Tokenize *src* using a custom dictionary, filter by part of speech,
    and write space-separated base forms to *out*, one output line per
    input line.
    """
    filters = [
        # Keep content words only (nouns, custom nouns, adjectives, ...).
        POSKeepFilter(['名詞', 'カスタム名詞', '形容詞', '形容動詞', '感動詞']),
        # Dropping dependent/pronoun nouns has the same effect as listing
        # them as stop words, with less upkeep.
        POSStopFilter(['名詞,非自立','名詞,代名詞']),
        ExtractAttributeFilter('base_form'),
    ]
    analyzer = Analyzer(
        tokenizer=Tokenizer('udic.csv', udic_type='simpledic'),
        token_filters=filters)
    with open(src, encoding='utf8') as fin, \
            open(out, 'w', encoding='utf8') as fout:
        for line in fin:
            # analyze() yields the filtered base-form strings.
            base_forms = list(analyzer.analyze(line))
            fout.write(' '.join(base_forms) + '\n')
def wc(file, pos=None, word_list_file=''):
    """Count word occurrences in *file* and return the top MAX_WORDS pairs.

    Tokens are restricted to the parts of speech in *pos*, filtered against
    the stop words in *word_list_file*, and counted by base form;
    TokenCountFilter(sorted=True) emits the pairs in descending count order.
    """
    if pos is None:
        pos = []
    analyzer = Analyzer(
        tokenizer=Tokenizer('udic.csv', udic_type='simpledic'),
        token_filters=[
            POSKeepFilter(pos),
            StopWordFilter(word_list_file=word_list_file),
            TokenCountFilter('base_form', sorted=True),
        ])
    with open(file, encoding='utf8') as f:
        text = f.read()
    counts = list(analyzer.analyze(text))
    return counts[:MAX_WORDS]
# 3-1の状態
# def wc(file):
# token_filters = [TokenCountFilter()] # sorted=Trueの指定
# a = Analyzer(tokenizer=Tokenizer(), token_filters=token_filters)
# word_count_dict = {}
# with open(file, encoding='utf8') as f:
# for line in f:
# for key, value in a.analyze(line):
# if key not in word_count_dict:
# word_count_dict[key] = 0
# word_count_dict[key] += value
# word_count_list = [(key, value) for key, value in word_count_dict.items()]
# sorted_word_count = sorted(
# word_count_list, key=lambda x: x[1], reverse=True)
# return sorted_word_count[:MAX_WORDS]
# text = 'すもももももももものうち'
# token_filters = [TokenCountFilter()]
# a = Analyzer(tokenizer=Tokenizer(), token_filters=token_filters)
# for key, value in a.analyze(text):
# print(f'{key}: {value}')
def show_wordcloud(file):
    """Render a word cloud from the whitespace-separated text in *file*.

    The rendered figure is written to ``wordcloud.png``.
    """
    with open(file, encoding='utf8') as f:
        text = f.read()
    cloud = WordCloud(
        font_path='ipagp.ttf',  # font file that can render Japanese glyphs
        background_color='white',
        width=1024,
        height=674,
        # Without this, WordCloud pairs adjacent words into bigrams.
        collocations=False,
    ).generate(text)
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis('off')
    plt.savefig('wordcloud.png')
する
いる
なる
言う
れる
てる
ある
おる
がる
できる
られる
いう
思う
よう
それ
ほう
ところ
そっち
こっち
とき
こと
そこ
さん
だれ
なん
そう
ここ
みんな
来る
行く
又三郎 カスタム名詞 マタサブロウ
嘉助 カスタム名詞 カスケ
佐太郎 カスタム名詞 サタロウ
耕助 カスタム名詞 コウスケ
一郎 カスタム名詞 イチロウ
雨三郎 カスタム名詞 アメサブロウ
一年生 カスタム名詞 イチネンセイ
二年生 カスタム名詞 ニネンセイ
三年生 カスタム名詞 サンネンセイ
四年生 カスタム名詞 ヨネンセイ
五年生 カスタム名詞 ゴネンセイ
六年生 カスタム名詞 ロクネンセイ
木ペン カスタム名詞 キペン
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment