Created
May 12, 2017 05:09
-
-
Save njwilson23/e13f1b8f48a6ce579e8124db71b9f346 to your computer and use it in GitHub Desktop.
Quick scraping code for BC MLA voting records
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
import bs4 | |
import json | |
import re | |
from concurrent.futures import ThreadPoolExecutor, as_completed | |
def parse_index():
    """Return the set of sub-page filenames that contain voting data.

    Scrapes the 2015 votes index page of the BC Legislature Hansard site
    and collects every link matching the 2015VOTES pattern, with any
    ``#fragment`` stripped so each page appears once.
    """
    index_url = "https://www.leg.bc.ca/content-hansard/Index/40th4th/2015votesmhds.htm"
    resp = requests.get(index_url)
    if resp.status_code != 200:
        raise IOError(resp.status_code)
    # Pages are served as UTF-16; decode before handing to the parser.
    doc = bs4.BeautifulSoup(resp.content.decode("utf-16"), "html5lib")
    vote_link = re.compile(r"2015VOTES.\.htm.*")
    anchors = doc.find_all("a", attrs={"href": vote_link})
    return {anchor.attrs["href"].split("#")[0] for anchor in anchors}
def parse_page(page):
    """Fetch one voting-record sub-page and parse every MLA section on it.

    Returns a list of per-MLA dicts produced by ``parse_mla_section``.
    Raises IOError on a non-200 HTTP response.
    """
    resp = requests.get("https://www.leg.bc.ca/content-hansard/Index/40th4th/{}".format(page))
    if resp.status_code != 200:
        raise IOError(resp.status_code)
    # Pages are served as UTF-16; decode before handing to the parser.
    doc = bs4.BeautifulSoup(resp.content.decode("utf-16"), "html5lib")
    titled_divs = doc.find_all("div", attrs={"title": re.compile(".*")})
    # Only divs whose title looks like "<MLA name>; <section>" carry vote data.
    return [parse_mla_section(div) for div in titled_divs if ";" in div["title"]]
def parse_mla_section(div):
    """Parse one MLA's section div into a vote record.

    The div's ``title`` attribute has the form "<MLA name>; <section name>".
    Each ``li`` inside lists a motion followed by "Yea" or "Nay"; the motion
    text (with trailing commas/spaces trimmed) maps to "Y" or "N".

    Returns a dict with keys ``mla_name``, ``section``, and ``motions``.
    """
    mla_name, section_name = div["title"].split(";", 1)
    motions = {}
    record = {"mla_name": mla_name, "section": section_name.strip(), "motions": motions}
    for item in div.find_all("li"):
        text = " ".join(list(item.strings)).strip()
        # "Yea" is checked first, mirroring a Yea-then-Nay precedence.
        for marker, vote in (("Yea", "Y"), ("Nay", "N")):
            if marker in text:
                motions[text[:text.index(marker)].strip(" ,")] = vote
                break
    return record
# Let 'er rip: fetch every voting sub-page concurrently and dump the
# combined records to vote-data.json.
pages = parse_index()

votes = []
# Use the executor as a context manager so its worker threads are always
# shut down, even if a page fetch raises (the bare executor in the original
# was never shut down).
with ThreadPoolExecutor(max_workers=8) as executor:
    futures = [executor.submit(parse_page, page) for page in pages]
    for future in as_completed(futures):
        # .result() re-raises any exception from the worker thread.
        votes.extend(future.result())

with open("vote-data.json", "w") as f:
    json.dump(votes, f)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment