Created
May 2, 2015 05:14
-
-
Save matsuken92/2ae497699c7b8f9bde43 to your computer and use it in GitHub Desktop.
word cloud
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# coding:utf-8 | |
%matplotlib inline | |
import matplotlib.pyplot as plt | |
from wordcloud import WordCloud | |
from bs4 import BeautifulSoup | |
import requests | |
import MeCab as mc | |
def mecab_analysis(text):
    """Tokenize Japanese *text* with MeCab and return its content words.

    Only tokens whose first feature field (the part of speech) is adjective,
    verb, noun or adverb are kept; every other token is dropped.

    Args:
        text: The text to analyse, as text or as UTF-8 encoded bytes.

    Returns:
        A list of token surface forms, in the order they occur in *text*.
    """
    # NOTE: the dictionary path is specific to one machine's MeCab install;
    # adjust it to wherever mecab-ipadic-neologd lives locally.
    t = mc.Tagger('-Ochasen -d /usr/local/Cellar/mecab/0.996/lib/mecab/dic/mecab-ipadic-neologd/')

    # The Python 2 binding wants UTF-8 byte strings, the Python 3 binding
    # wants str.  Convert only when the input is not already in that form.
    if str is bytes:
        # Python 2: encode unicode, pass byte strings through untouched.
        enc_text = text if isinstance(text, bytes) else text.encode('utf-8')
    else:
        # Python 3: decode bytes, pass str through untouched.
        enc_text = text.decode('utf-8') if isinstance(text, bytes) else text

    # Parts of speech to keep: adjective, verb, noun, adverb.
    wanted_types = ("形容詞", "動詞", "名詞", "副詞")

    output = []
    # enc_text stays bound for the whole walk, as in the original code
    # (presumably the nodes refer back to the input buffer -- TODO confirm).
    node = t.parseToNode(enc_text)
    while node:
        surface = node.surface
        # Nodes with an empty surface are the header and footer; skip them.
        if surface != "":
            word_type = node.feature.split(",")[0]
            if word_type in wanted_types:
                output.append(surface)
        node = node.next
    return output
def get_wordlist_from_QiitaURL(url):
    """Download a page and return the content words of its first section.

    Args:
        url: Address of the article page to fetch.

    Returns:
        The list produced by ``mecab_analysis`` for the text of the first
        ``<section>`` element under ``<body>``, with newlines and tabs
        removed.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
        ValueError: If the page has no ``<section>`` inside ``<body>``.
    """
    # Without a timeout a stalled server would block this call forever.
    res = requests.get(url, timeout=30)
    # Fail loudly on 4xx/5xx instead of analysing an error page.
    res.raise_for_status()

    # Name the parser explicitly so results do not depend on which optional
    # parser libraries happen to be installed.
    soup = BeautifulSoup(res.text, "html.parser")

    body = soup.body
    section = body.section if body is not None else None
    if section is None:
        raise ValueError("no <section> element found in the body of %s" % url)

    text = section.get_text().replace('\n', '').replace('\t', '')
    return mecab_analysis(text)
def create_wordcloud(text):
    """Draw a word cloud for *text* and show it with matplotlib.

    Args:
        text: The text handed to ``WordCloud.generate``; the caller in this
            file passes space-separated words.
    """
    # Set the font path to match your environment.
    # Alternative: "/System/Library/Fonts/HelveticaNeue-UltraLight.otf"
    fpath = "/Library/Fonts/ヒラギノ角ゴ Pro W3.otf"

    # Stop words: tokens that are never drawn in the cloud.
    stop_words = {
        u'てる', u'いる', u'なる', u'れる', u'する', u'ある', u'こと',
        u'これ', u'さん', u'して', u'くれる', u'やる', u'くださる',
        u'そう', u'せる', u'した', u'思う', u'それ', u'ここ', u'ちゃん',
        u'くん', u'', u'て', u'に', u'を', u'は', u'の', u'が', u'と',
        u'た', u'し', u'で', u'ない', u'も', u'な', u'い', u'か', u'ので',
        u'よう',
    }

    cloud = WordCloud(
        background_color="white",
        font_path=fpath,
        width=900,
        height=500,
        stopwords=stop_words,
    )
    rendered = cloud.generate(text)

    plt.figure(figsize=(15, 12))
    plt.imshow(rendered)
    plt.axis("off")
    plt.show()
# Fetch one Qiita article, extract its content words and show the word cloud.
url = "http://qiita.com/t_saeko/items/2b475b8657c826abc114"
wordlist = get_wordlist_from_QiitaURL(url)

cloud_text = " ".join(wordlist)
# Byte strings (the Python 2 case) are decoded from UTF-8 as before; text
# strings (Python 3) have no .decode() and are passed through unchanged.
if isinstance(cloud_text, bytes):
    cloud_text = cloud_text.decode('utf-8')
create_wordcloud(cloud_text)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment