Minutes analysis: scrape a House of Representatives (Shugiin) meeting record, group the remarks by speaker, tokenize them with MeCab, and render one word cloud per speaker (saved to wc.png).
import os
import math
from collections import Counter

import requests
from bs4 import BeautifulSoup
import MeCab
import matplotlib.pyplot as plt
from wordcloud import WordCloud
def get_content(url):
    # Fetch the minutes page; the proceedings body sits after the
    # <a name="p_honbun"> anchor in a <span class="txt03"> element.
    c = requests.get(url).content
    soup = BeautifulSoup(c, "html.parser")
    anchor = soup.find("a", attrs={"name": "p_honbun"})
    body = anchor.find_next_sibling("span", class_="txt03")
    speakers = body.find_all("strong")
    contents = []
    for s in speakers:
        # Each <strong> holds a speaker's name; the text nodes and <br>
        # tags that follow it (up to the next tag) are that speaker's remarks.
        who = s.text
        says = []
        say = s.next_sibling
        while say is not None and (say.name is None or say.name == "br"):
            if say.name is None and trim(say):
                says.append(trim(say))
            say = say.next_sibling
        contents.append((who, " ".join(says)))
    return contents
def trim(say):
    # Strip whitespace and the "○" marker that prefixes each speaker's turn.
    s = say.strip()
    s = s.replace("○", "")
    return s
def tokenize(s):
    mecab = MeCab.Tagger("-Owakati")
    mecab.parse("")  # work around a known mecab-python issue with node.surface
    stop_words_p = os.path.join(os.path.dirname(__file__), "stop_words.txt")
    with open(stop_words_p, encoding="utf-8") as f:
        stop_words = [ln.strip() for ln in f]

    node = mecab.parseToNode(s)
    result = []
    while node:
        if node.surface != "":
            # IPADIC feature string: POS, subtypes 1-3, conjugation type, ...
            pos_info = node.feature.split(",")
            pos = pos_info[0]
            pos_type = pos_info[4] if len(pos_info) > 4 else ""
            # Keep content words (nouns, verbs, adjectives, adverbs), but
            # drop サ変 (suru-type) verbs, which carry little meaning alone.
            if pos in ["名詞", "動詞", "形容詞", "副詞"]:
                if not (pos == "動詞" and "サ変" in pos_type):
                    result.append(node.surface)
        node = node.next
    result = [w.strip() for w in result]
    result = [w for w in result if w and w not in stop_words]
    return result
def main(url):
    contents = get_content(url)
    who_said = {}
    for who, say in contents:
        if who not in who_said:
            who_said[who] = []
        who_said[who].append(say)

    plt.figure(figsize=(16, 10))
    # Japanese-capable font (macOS Hiragino); adjust on other platforms.
    font_path = "/System/Library/Fonts/ヒラギノ角ゴシック W3.ttc"
    total = len(who_said)
    cols = 5
    rows = int(math.ceil(total / cols))
    for i, w in enumerate(who_said):
        all_speeches = "".join(who_said[w])
        result = tokenize(all_speeches)
        freq = dict(Counter(result))
        wordcloud = WordCloud(
            background_color="white", font_path=font_path,
            width=300, height=150).generate_from_frequencies(freq)
        ax = plt.subplot(rows, cols, i + 1)
        ax.imshow(wordcloud)
        ax.axis("off")
        ax.set_title(w)
    plt.tight_layout()
    plt.savefig("wc.png")
    plt.show()
if __name__ == "__main__":
    sample_url = "http://www.shugiin.go.jp/internet/itdb_kaigiroku.nsf/html/kaigiroku/023319220161124003.htm"
    main(sample_url)
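Note that tokenize() reads a stop_words.txt placed next to the script, one stop word per line. That file is not included in the gist, so the entries below are only an illustrative sketch of the expected format, not the author's actual list:

こと
もの
ため
よう
これ
それ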