@icoxfog417
Created November 8, 2017 06:04
minutes analysis — per-speaker word clouds from a Diet committee minutes page
import os
import math
from collections import Counter
import requests
from bs4 import BeautifulSoup
import MeCab
import matplotlib.pyplot as plt
from wordcloud import WordCloud


def get_content(url):
    """Scrape the minutes page and return a list of (speaker, speech) pairs."""
    c = requests.get(url).content
    soup = BeautifulSoup(c, "html.parser")
    anchor = soup.find("a", attrs={"name": "p_honbun"})
    body = anchor.find_next_sibling("span", class_="txt03")
    speakers = body.find_all("strong")
    contents = []
    for s in speakers:
        who = s.text
        says = []
        # Collect the text nodes that follow the speaker's name,
        # stopping at the next tag that is not a <br>.
        say = s.next_sibling
        while say is not None and (say.name is None or say.name == "br"):
            if say.name is None and trim(say):
                says.append(trim(say))
            say = say.next_sibling
        c = (who, " ".join(says))
        contents.append(c)
    return contents


def trim(say):
    """Strip whitespace and drop the leading ○ marker."""
    s = say.strip()
    s = s.replace("○", "")
    return s


def tokenize(s):
    mecab = MeCab.Tagger("-Owakati")
    # Stop words are read from a stop_words.txt placed next to this script.
    stop_words_p = os.path.join(os.path.dirname(__file__), "stop_words.txt")
    with open(stop_words_p, encoding="utf-8") as f:
        stop_words = f.readlines()
        stop_words = [ln.strip() for ln in stop_words]

    node = mecab.parseToNode(s)
    result = []
    while node:
        if node.surface != "":
            pos_info = node.feature.split(",")
            pos = pos_info[0]
            pos_type = pos_info[4]
            # Keep content words (nouns, verbs, adjectives, adverbs),
            # but drop verbs of the サ変 conjugation type.
            if pos in ["名詞", "動詞", "形容詞", "副詞"]:
                if node.surface == "ます":
                    print(pos_info)  # debug: dump the feature string for ます
                if not (pos == "動詞" and "サ変" in pos_type):
                    result.append(node.surface)
        node = node.next

    result = [w.strip() for w in result]
    result = [w for w in result if w and w not in stop_words]
    return result


def main(url):
    contents = get_content(url)
    # Group speeches by speaker.
    who_said = {}
    for who, say in contents:
        if who not in who_said:
            who_said[who] = []
        who_said[who].append(say)

    plt.figure(figsize=(16, 10))
    # Japanese-capable font for the word clouds (Hiragino path on macOS).
    font_path = "/System/Library/Fonts/ヒラギノ角ゴシック W3.ttc"
    total = len(who_said)
    cols = 5
    rows = int(math.ceil(total / cols))
    for i, w in enumerate(who_said):
        all_speeches = "".join(who_said[w])
        result = tokenize(all_speeches)
        counter = Counter(result)
        freq = dict((k, f) for k, f in counter.most_common())
        wordcloud = WordCloud(
            background_color="white", font_path=font_path,
            width=300, height=150).generate_from_frequencies(freq)
        ax = plt.subplot(rows, cols, i + 1)
        ax.imshow(wordcloud)
        ax.axis("off")
        ax.set_title(w)

    plt.tight_layout()
    plt.savefig("wc.png")
    plt.show()
if __name__ == "__main__":
sample_url = "http://www.shugiin.go.jp/internet/itdb_kaigiroku.nsf/html/kaigiroku/023319220161124003.htm"
main(sample_url)
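
The script relies on two external resources that are not part of the gist: a UTF-8 stop_words.txt (one stop word per line) next to the script, and the macOS Hiragino font path hard-coded for WordCloud. The imports additionally assume requests, beautifulsoup4, a MeCab binding such as mecab-python3 with an IPADIC-style dictionary, matplotlib, and wordcloud are installed. Below is a minimal pre-flight sketch; the helper name check_resources is an assumption for illustration, not part of the gist.

# Sketch only: check_resources is a hypothetical helper, not part of the gist.
import os

def check_resources(script_dir,
                    font_path="/System/Library/Fonts/ヒラギノ角ゴシック W3.ttc"):
    """Return the external files the script needs but cannot find."""
    missing = []
    stop_words_p = os.path.join(script_dir, "stop_words.txt")  # one stop word per line, UTF-8
    if not os.path.isfile(stop_words_p):
        missing.append(stop_words_p)
    if not os.path.isfile(font_path):  # any Japanese-capable .ttf/.ttc would do
        missing.append(font_path)
    return missing

# Example: check_resources(os.path.dirname(os.path.abspath(__file__)))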
@icoxfog417 (author) commented with the generated output image, wc.png.