Skip to content

Instantly share code, notes, and snippets.

@hwada

hwada/wordcloud.py

Created Feb 11, 2019
Embed
What would you like to do?
#coding:utf-8
import csv
from janome.tokenizer import Tokenizer
from wordcloud import WordCloud
from collections import defaultdict
import datetime
# Words handed to WordCloud as stopwords so they never appear in the image.
exclude_words = [
    # Japanese filler words / formal nouns
    'これ', 'こと', 'そう', 'それ', 'ところ', 'みたい', 'よう', 'さん',
    # URL fragments
    'http', 'https', 'www', 'to', 'htn', 'co', 'com', 'jp', 'ly', 'tw',
    # Retweet marker and link-service / bookmarking boilerplate
    'RT', 'tinyurl', 'Bookmarking',
]
# Fetch the tweets posted in a given year (reads the tweets.csv downloaded from Twitter).
def get_tweets(year, path='./tweets/tweets.csv'):
    """Yield the text of every tweet in the CSV archive posted in ``year``.

    Args:
        year: Calendar year to keep; compared with the year of the timestamp
            stored in column 3 of each row.
        path: Location of the CSV export. Defaults to the path this script
            has always read, so existing callers are unaffected.

    Yields:
        The content of column 5 (the tweet text) for each matching row, in
        file order.

    Raises:
        ValueError: If a row's timestamp does not match
            ``YYYY-MM-DD HH:MM:SS +0000``.
    """
    # newline='' leaves line-break handling to the csv module, so line breaks
    # embedded in quoted tweet text are preserved as written.
    with open(path, 'r', encoding='utf-8', newline='') as f:
        reader = csv.reader(f, delimiter=',')
        # Skip the header row. The default keeps an empty file from raising
        # StopIteration, which inside a generator becomes RuntimeError.
        next(reader, None)
        for row in reader:
            # Rows too short to hold both timestamp and text are ignored.
            if len(row) <= 5:
                continue
            dt = datetime.datetime.strptime(row[3], '%Y-%m-%d %H:%M:%S +0000')
            if dt.year == year:
                yield row[5]
# Extract nouns only.
def get_words(texts, tokenizer=None):
    """Return the base form of every noun found in ``texts``.

    Args:
        texts: Iterable of strings to tokenize.
        tokenizer: Optional object with a ``tokenize(text)`` method whose
            tokens expose ``part_of_speech`` and ``base_form``. Pass an
            existing tokenizer to reuse it across calls; when omitted, a new
            janome ``Tokenizer`` is created, exactly as before.

    Returns:
        A list of stripped base-form strings, one per noun token, in the
        order the tokens were produced.
    """
    if tokenizer is None:
        tokenizer = Tokenizer()
    words = []
    for text in texts:
        words.extend(
            token.base_form.strip()
            for token in tokenizer.tokenize(text)
            # part_of_speech is comma-separated; the first field is the
            # top-level category, and '名詞' means noun.
            if token.part_of_speech.split(',')[0] == '名詞'
        )
    return words
# Render one word-cloud PNG per year, 2007 through 2017, into the current directory.
for year in range(2007, 2018):
    words = get_words(get_tweets(year))
    if not words:
        # WordCloud.generate() raises ValueError when it has no words to
        # draw, which would abort the remaining years; skip empty years.
        print(f'{year}: no nouns found, skipping')
        continue
    text = ' '.join(words)
    # collocations=False keeps the cloud to single words rather than bigrams.
    wordcloud = WordCloud(background_color='white', font_path='NotoSansCJKjp-Regular.otf',
                          stopwords=exclude_words, collocations=False,
                          width=800, height=800).generate(text)
    wordcloud.to_file(f'./{year}.png')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment