Last active
February 28, 2023 04:56
-
-
Save ridgewell/7aa0b9f7c2960a019cc78a4f1f29e671 to your computer and use it in GitHub Desktop.
Scrape written parliamentary question data from House of Commons (Canada) Order Paper
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from typing import Optional, Tuple

import requests
from bs4 import BeautifulSoup
# Order Paper page whose written questions are scraped by default.
ORDER_PAPER_URL = 'https://www.ourcommons.ca/DocumentViewer/en/44-1/house/sitting-160/order-notice/page-9'

# Seconds to wait for the server; without a timeout requests can block forever.
REQUEST_TIMEOUT = 30

# Characters trimmed from both ends of the question body: whitespace
# (including non-breaking spaces), hyphens and em dashes left over from the
# field separators.
_BODY_PADDING = ' \t\r\n\xa0-—'


def parse_item_text(item_text: str, mp_name: str = '') -> Optional[Tuple[str, str]]:
    """Split the text of one question entry into its date and body.

    The entry text is treated as em-dash separated fields whose second field
    is the date. NOTE(review): the layout is presumably
    ``<number> — <date> — <MP name> — <body>``; confirm against the live page.

    Args:
        item_text: Full text of the entry's paragraph cell.
        mp_name: Name of the MP as it appears in the entry. When it occurs
            after the date, the body is the text that follows it, so em
            dashes inside the name (e.g. in a riding name) do not confuse
            the split. When empty or not found, the body is everything after
            the date field.

    Returns:
        ``(date_string, question_body)``, or ``None`` when the text contains
        no em dash at all. The body is an empty string when nothing follows
        the date.
    """
    _number, first_dash, after_number = item_text.partition('—')
    if not first_dash:
        return None

    date_part, _, after_date = after_number.partition('—')
    date_string = date_part.strip()

    name = mp_name.strip()
    if name and name in after_date:
        # Skip past the MP name so the body does not repeat it.
        body = after_date.partition(name)[2]
    else:
        body = after_date
    return date_string, body.strip(_BODY_PADDING)


def extract_questions(soup):
    """Yield ``(number, date, mp_name, body)`` for each question in *soup*.

    A question is a ``<table class="Item">`` containing a ``<b>`` element
    (question number), a ``<td class="JustifiedTop ItemPara">`` cell (entry
    text) and an ``<a class="parldata-widget-popup">`` link (MP name).
    Tables missing any of these, or whose text has no em dash, are skipped.
    """
    for table in soup.find_all('table', {'class': 'Item'}):
        number_elem = table.find('b')
        # The date and the body both come from this cell, so look it up once.
        text_elem = table.find('td', {'class': 'JustifiedTop ItemPara'})
        name_elem = table.find('a', {'class': 'parldata-widget-popup'})
        if number_elem is None or text_elem is None or name_elem is None:
            continue

        mp_name = name_elem.text.strip()
        parsed = parse_item_text(text_elem.text, mp_name)
        if parsed is None:
            continue

        date_string, question_body = parsed
        yield number_elem.text.strip(), date_string, mp_name, question_body


def main(url: str = ORDER_PAPER_URL) -> None:
    """Fetch the Order Paper page at *url* and print each written question.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    # Fail loudly on 4xx/5xx instead of silently parsing an error page.
    response.raise_for_status()

    soup = BeautifulSoup(response.content, 'html.parser')
    for question_number, date_string, mp_name, question_body in extract_questions(soup):
        print('Question number:', question_number)
        print('Date of question:', date_string)
        print('MP name:', mp_name)
        print('Question body:', question_body)
        print()


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment