@reinvented
Created December 7, 2018 19:44
Python script to harvest Prince Edward Island Legislative Assembly Hansard PDFs and convert them to plain text files
#!/usr/bin/env python3
# You may need to do these:
#
# pip install lxml
# pip install requests
#
# The PDF-to-text conversion shells out to the pdftotext utility,
# which is part of the poppler-utils package.
from lxml import html
import requests
import urllib.request
import os
# Retrieve the main "Daily Debates" page from the Legislative Assembly website
page = requests.get('http://www.assembly.pe.ca/hansard/index.php')
tree = html.fromstring(page.content)
# The 'sittings' -- i.e. "Spring 2005", etc. -- are found in a <select name="selectsitting"> element.
sittings = tree.xpath("//select[@name='selectsitting']/option")
# Make a list of the sitting codes -- the values we can pass as the
# 'selectsitting' parameter to select dates for a given sitting
sittings_codes = [option.attrib['value'] for option in sittings]
# Make a list of the sitting names, like "Spring 2005"
sittings_names = [option.text for option in sittings]
# Iterate through each sitting
for sitting_code in sittings_codes:
    # Retrieve the calendar for the sitting
    page = requests.get('http://www.assembly.pe.ca/hansard/index.php?selectsitting=' + sitting_code + '&action=Go')
    tree = html.fromstring(page.content)
    # Get all the sitting days in this sitting by looking for links to the
    # Hansard PDFs hosted under http://www.assembly.pe.ca/sittings/
    for elt in tree.xpath("//a[contains(@href,'assembly.pe.ca/sittings/')]"):
        # The last path component of the link is the PDF filename; the
        # third-from-last component identifies the sitting it belongs to
        filename = elt.attrib['href'].split('/')[-1]
        sitting = elt.attrib['href'].split('/')[-3]
        sitting_directory = 'documents/' + sitting
        if not os.path.exists(sitting_directory):
            os.makedirs(sitting_directory)
        # Download the PDF, convert it to text with pdftotext, then
        # delete the PDF itself
        pdf_path = sitting_directory + '/' + filename
        urllib.request.urlretrieve(elt.attrib['href'], pdf_path)
        os.system('pdftotext -raw -enc UTF-8 %s' % pdf_path)
        os.remove(pdf_path)
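
If shelling out to pdftotext is undesirable, the conversion step could be done in pure Python instead. The sketch below is a hypothetical alternative (not part of the script above) using the extract_text() helper from pdfminer.six; the function name pdf_to_text and the example paths are illustrative only.

# Hypothetical pure-Python replacement for the pdftotext call above,
# using pdfminer.six's high-level API (pip install pdfminer.six)
from pdfminer.high_level import extract_text

def pdf_to_text(pdf_path, txt_path):
    # extract_text() returns the PDF's entire text content as one string
    text = extract_text(pdf_path)
    with open(txt_path, 'w', encoding='utf-8') as out:
        out.write(text)

# Example (illustrative paths):
# pdf_to_text('documents/fall2016/20161115hansard.pdf',
#             'documents/fall2016/20161115hansard.txt')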