Skip to content

Instantly share code, notes, and snippets.

@ridgewell
Last active February 28, 2023 04:56
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ridgewell/7aa0b9f7c2960a019cc78a4f1f29e671 to your computer and use it in GitHub Desktop.
Save ridgewell/7aa0b9f7c2960a019cc78a4f1f29e671 to your computer and use it in GitHub Desktop.
Scrape written parliamentary question data from House of Commons (Canada) Order Paper
import requests
from bs4 import BeautifulSoup
# Page of the House of Commons Order Paper to scrape.
url = 'https://www.ourcommons.ca/DocumentViewer/en/44-1/house/sitting-160/order-notice/page-9'


def split_item_text(item_text):
    """Split the text of an item cell into its date and question body.

    The text is cut at em dashes ('—'):

    * the date is the segment between the first and second em dash,
      with surrounding whitespace removed;
    * the body is everything after the first em dash, with leading and
      trailing spaces and hyphens removed.

    NOTE(review): as in the original script, the body therefore still
    begins with the date segment -- confirm whether that is intended.

    Args:
        item_text: Full text of the item's paragraph cell.

    Returns:
        A ``(date_string, question_body)`` tuple, or ``None`` when the text
        contains no em dash.
    """
    _, dash, remainder = item_text.partition('—')
    if not dash:
        return None
    date_string = remainder.split('—', maxsplit=1)[0].strip()
    question_body = remainder.strip(' -')
    return date_string, question_body


def extract_question(table):
    """Pull the question fields out of one ``table.Item`` element.

    Args:
        table: A BeautifulSoup tag for a table with class ``Item``.

    Returns:
        A dict with the keys ``question_number``, ``date_string``,
        ``mp_name`` and ``question_body``, or ``None`` when any of the
        expected elements is missing from the table.
    """
    number_elem = table.find('b')
    if number_elem is None:
        return None

    # The date and the question body both come from this one cell, so it
    # is looked up a single time.
    para_elem = table.find('td', {'class': 'JustifiedTop ItemPara'})
    if para_elem is None:
        return None
    split_text = split_item_text(para_elem.text)
    if split_text is None:
        return None
    date_string, question_body = split_text

    mp_elem = table.find('a', {'class': 'parldata-widget-popup'})
    if mp_elem is None:
        return None

    return {
        'question_number': number_elem.text,
        'date_string': date_string,
        'mp_name': mp_elem.text,
        'question_body': question_body,
    }


def main():
    """Download the page at ``url`` and print every question found on it.

    Raises:
        requests.RequestException: If the request times out, fails, or the
            server answers with an HTTP error status.
    """
    # Without a timeout requests waits indefinitely on a stalled server.
    response = requests.get(url, timeout=30)
    # Fail loudly instead of silently parsing an error page.
    response.raise_for_status()

    soup = BeautifulSoup(response.content, 'html.parser')
    for table in soup.find_all('table', {'class': 'Item'}):
        question = extract_question(table)
        if question is None:
            # Tables lacking any expected element are skipped.
            continue
        print('Question number:', question['question_number'])
        print('Date of question:', question['date_string'])
        print('MP name:', question['mp_name'])
        print('Question body:', question['question_body'])
        print()


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment