Created
May 12, 2017 05:09
-
-
Save njwilson23/e13f1b8f48a6ce579e8124db71b9f346 to your computer and use it in GitHub Desktop.
Quick scraping code for BC MLA voting records
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
import bs4 | |
import json | |
import re | |
from concurrent.futures import ThreadPoolExecutor, as_completed | |
def parse_index():
    """Return the set of sub-page filenames that contain voting data.

    Scrapes the 2015 votes index page of the BC Legislature Hansard site
    and collects every link matching the 2015VOTES pattern, with any
    ``#fragment`` stripped so each page appears once.
    """
    index_url = "https://www.leg.bc.ca/content-hansard/Index/40th4th/2015votesmhds.htm"
    resp = requests.get(index_url)
    if resp.status_code != 200:
        raise IOError(resp.status_code)
    # Pages are served as UTF-16; decode before handing to the parser.
    doc = bs4.BeautifulSoup(resp.content.decode("utf-16"), "html5lib")
    vote_link = re.compile(r"2015VOTES.\.htm.*")
    anchors = doc.find_all("a", attrs={"href": vote_link})
    return {anchor.attrs["href"].split("#")[0] for anchor in anchors}
def parse_page(page):
    """Fetch one voting-record sub-page and parse every MLA section on it.

    Returns a list of per-MLA dicts produced by ``parse_mla_section``.
    Raises IOError on a non-200 HTTP response.
    """
    resp = requests.get("https://www.leg.bc.ca/content-hansard/Index/40th4th/{}".format(page))
    if resp.status_code != 200:
        raise IOError(resp.status_code)
    # Pages are served as UTF-16; decode before handing to the parser.
    doc = bs4.BeautifulSoup(resp.content.decode("utf-16"), "html5lib")
    titled_divs = doc.find_all("div", attrs={"title": re.compile(".*")})
    # Only divs whose title looks like "<MLA name>; <section>" carry vote data.
    return [parse_mla_section(div) for div in titled_divs if ";" in div["title"]]
def parse_mla_section(div):
    """Parse one MLA's section div into a vote record.

    The div's ``title`` attribute has the form "<MLA name>; <section name>".
    Each ``li`` inside lists a motion followed by "Yea" or "Nay"; the motion
    text (with trailing commas/spaces trimmed) maps to "Y" or "N".

    Returns a dict with keys ``mla_name``, ``section``, and ``motions``.
    """
    mla_name, section_name = div["title"].split(";", 1)
    motions = {}
    record = {"mla_name": mla_name, "section": section_name.strip(), "motions": motions}
    for item in div.find_all("li"):
        text = " ".join(list(item.strings)).strip()
        # "Yea" is checked first, mirroring a Yea-then-Nay precedence.
        for marker, vote in (("Yea", "Y"), ("Nay", "N")):
            if marker in text:
                motions[text[:text.index(marker)].strip(" ,")] = vote
                break
    return record
# Let 'er rip: fetch every voting sub-page concurrently and dump the
# combined records to vote-data.json.
pages = parse_index()

votes = []
# Use the executor as a context manager so its worker threads are always
# shut down, even if a page fetch raises (the bare executor in the original
# was never shut down).
with ThreadPoolExecutor(max_workers=8) as executor:
    futures = [executor.submit(parse_page, page) for page in pages]
    for future in as_completed(futures):
        # .result() re-raises any exception from the worker thread.
        votes.extend(future.result())

with open("vote-data.json", "w") as f:
    json.dump(votes, f)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment