Skip to content

Instantly share code, notes, and snippets.

What would you like to do?
Python script to harvest Prince Edward Island Legislative Assembly Hansard PDFs and convert them to ASCII text files
#!/usr/bin/env python
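# You may need to install these dependencies first:
# sudo pip install lxml
# sudo pip install requests
# sudo pip install pdfminer
# The script also runs the external `pdftotext` command, which must be on the PATH.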
import os
import subprocess
import urllib
import urlparse

import requests
from lxml import html
# Harvest script body: walk every sitting listed on the "Daily Debates" page,
# download each linked PDF into documents/<sitting>/, convert it to text with
# pdftotext, and delete the PDF once the conversion has succeeded.

# List to hold all the PDF files we're going to harvest
# (nothing below appends to it; kept so the module-level name still exists).
pdfs = []

# Retrieve the main "Daily Debates" page from the Legislative Assembly website.
# NOTE(review): the URL literal is empty in this copy of the script, and
# requests will reject an empty URL -- restore the real address before running.
page = requests.get('')
tree = html.fromstring(page.content)

# The 'sittings' -- i.e. "Spring 2005", etc. -- are found in a
# <select name="selectsitting"> element.
sittings = tree.xpath("//select[@name='selectsitting']/option")

# Make a list of the sitting codes -- the values we can pass as the
# 'selectsitting' parameter to select dates for a given sitting.
sittings_codes = [option.attrib['value'] for option in sittings]

# Make a list of the sitting names, like "Spring 2005".
sittings_names = [option.text for option in sittings]

# Iterate through each sitting.
for sitting_code in sittings_codes:
    # Retrieve the calendar for the sitting.
    # NOTE(review): the base URL literal is empty here as well -- confirm
    # the real query URL that 'sitting_code' is meant to be appended to.
    page = requests.get('' + sitting_code + '&action=Go')
    tree = html.fromstring(page.content)

    # Get all the sitting days in this sitting by looking for links whose
    # href contains the expected URL prefix.
    # NOTE(review): the prefix literal is empty in this copy, so the filter
    # presumably matches every <a> element -- restore the real prefix.
    for elt in tree.xpath("//a[contains(@href,'')]"):
        href = elt.attrib.get('href', '')
        parts = href.split('/')

        # The sitting name is taken from the third-from-last path component
        # and the file name from the last one; skip links that are too short
        # to carry both (or that end in '/') instead of raising IndexError
        # or trying to download onto a directory path.
        if len(parts) < 3 or not parts[-1]:
            continue

        filename = parts[-1]
        sitting = parts[-3]
        sitting_directory = 'documents/' + sitting
        pdf_path = sitting_directory + '/' + filename

        # Create the per-sitting directory on first use; without this the
        # download below has nowhere to write.
        if not os.path.exists(sitting_directory):
            os.makedirs(sitting_directory)

        urllib.urlretrieve(href, pdf_path)

        # Pass the arguments as a list so the scraped file name is never
        # interpreted by a shell (spaces or metacharacters are safe).
        # With no output argument, pdftotext writes the .txt beside the PDF.
        status = subprocess.call(
            ['pdftotext', '-raw', '-enc', 'UTF-8', pdf_path])

        if status == 0:
            # Conversion succeeded; only the text file is kept.
            os.remove(pdf_path)
        else:
            # Keep the PDF so a failed conversion does not lose the download.
            print('pdftotext failed (exit status %s) for %s; keeping the PDF'
                  % (status, pdf_path))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment