Minutes analysis: scrape a House of Representatives (Shugiin) meeting record, group the remarks by speaker, tokenize them with MeCab, and render one word cloud per speaker (saved to wc.png).
import os
import math
from collections import Counter

import requests
from bs4 import BeautifulSoup
import MeCab
import matplotlib.pyplot as plt
from wordcloud import WordCloud
def get_content(url):
    # Fetch the minutes page; the proceedings body sits after the
    # <a name="p_honbun"> anchor in a <span class="txt03"> element.
    c = requests.get(url).content
    soup = BeautifulSoup(c, "html.parser")
    anchor = soup.find("a", attrs={"name": "p_honbun"})
    body = anchor.find_next_sibling("span", class_="txt03")
    speakers = body.find_all("strong")
    contents = []
    for s in speakers:
        # Each <strong> holds a speaker's name; the text nodes and <br>
        # tags that follow it (up to the next tag) are that speaker's remarks.
        who = s.text
        says = []
        say = s.next_sibling
        while say is not None and (say.name is None or say.name == "br"):
            if say.name is None and trim(say):
                says.append(trim(say))
            say = say.next_sibling
        contents.append((who, " ".join(says)))
    return contents
def trim(say):
    # Strip whitespace and the "○" marker that prefixes each speaker's turn.
    s = say.strip()
    s = s.replace("○", "")
    return s
def tokenize(s):
    mecab = MeCab.Tagger("-Owakati")
    mecab.parse("")  # work around a known mecab-python issue with node.surface
    stop_words_p = os.path.join(os.path.dirname(__file__), "stop_words.txt")
    with open(stop_words_p, encoding="utf-8") as f:
        stop_words = [ln.strip() for ln in f]

    node = mecab.parseToNode(s)
    result = []
    while node:
        if node.surface != "":
            # IPADIC feature string: POS, subtypes 1-3, conjugation type, ...
            pos_info = node.feature.split(",")
            pos = pos_info[0]
            pos_type = pos_info[4] if len(pos_info) > 4 else ""
            # Keep content words (nouns, verbs, adjectives, adverbs), but
            # drop サ変 (suru-type) verbs, which carry little meaning alone.
            if pos in ["名詞", "動詞", "形容詞", "副詞"]:
                if not (pos == "動詞" and "サ変" in pos_type):
                    result.append(node.surface)
        node = node.next
    result = [w.strip() for w in result]
    result = [w for w in result if w and w not in stop_words]
    return result
def main(url):
    contents = get_content(url)
    who_said = {}
    for who, say in contents:
        if who not in who_said:
            who_said[who] = []
        who_said[who].append(say)

    plt.figure(figsize=(16, 10))
    # Japanese-capable font (macOS Hiragino); adjust on other platforms.
    font_path = "/System/Library/Fonts/ヒラギノ角ゴシック W3.ttc"
    total = len(who_said)
    cols = 5
    rows = int(math.ceil(total / cols))
    for i, w in enumerate(who_said):
        all_speeches = "".join(who_said[w])
        result = tokenize(all_speeches)
        freq = dict(Counter(result))
        wordcloud = WordCloud(
            background_color="white", font_path=font_path,
            width=300, height=150).generate_from_frequencies(freq)
        ax = plt.subplot(rows, cols, i + 1)
        ax.imshow(wordcloud)
        ax.axis("off")
        ax.set_title(w)
    plt.tight_layout()
    plt.savefig("wc.png")
    plt.show()
if __name__ == "__main__":
    sample_url = "http://www.shugiin.go.jp/internet/itdb_kaigiroku.nsf/html/kaigiroku/023319220161124003.htm"
    main(sample_url)
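Note that tokenize() reads a stop_words.txt placed next to the script, one stop word per line. That file is not included in the gist, so the entries below are only an illustrative sketch of the expected format, not the author's actual list:

こと
もの
ため
よう
これ
それ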