Created December 7, 2018 19:44
Python script to harvest Prince Edward Island Legislative Assembly Hansard PDFs and convert them to UTF-8 plain-text files
#!/usr/bin/env python
# You may need to do these:
# sudo pip install lxml
# sudo pip install requests
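# sudo pip install pdfminer
# The PDF-to-text step runs the external `pdftotext` command-line tool
# (shipped with Poppler / Xpdf), so it must also be installed and on PATH.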
import os
import subprocess
import urllib
import urlparse

import requests
from lxml import html
# Harvest flow: fetch the "Daily Debates" index page, read the list of
# sittings from its <select>, fetch the calendar page for each sitting, then
# download every linked PDF into documents/<sitting>/, convert it to text with
# pdftotext, and delete the PDF.

# List to hold all the PDF files we're going to harvest.
# NOTE(review): nothing visible in this script appends to or reads this list.
pdfs = []

# Retrieve the main "Daily Debates" page from the Legislative Assembly website.
# NOTE(review): the URL literal is empty in this copy of the script
# (presumably stripped); fill in the real page address before running.
page = requests.get('')
tree = html.fromstring(page.content)

# The 'sittings' -- i.e. "Spring 2005", etc. -- are found in a
# <select name="selectsitting"> element.
sittings = tree.xpath("//select[@name='selectsitting']/option")

# Make a list of the sitting codes -- the values we can pass as the
# 'selectsitting' parameter to select dates for a given sitting.
sittings_codes = [option.attrib['value'] for option in sittings]

# Make a list of the sitting names, like "Spring 2005".
sittings_names = [option.text for option in sittings]

# Iterate through each sitting.
for sitting_code in sittings_codes:
    # Retrieve the calendar for the sitting.
    # NOTE(review): the URL prefix is empty here as well; it presumably ended
    # with a "selectsitting=" query parameter -- confirm and restore.
    page = requests.get('' + sitting_code + '&action=Go')
    tree = html.fromstring(page.content)

    # Get all the sitting days in this sitting by looking for links whose
    # href contains the Hansard document URL prefix.
    # NOTE(review): the prefix literal is empty in this copy, and XPath
    # contains(x, '') is true for any x, so every <a href> on the page matches.
    for elt in tree.xpath("//a[contains(@href,'')]"):
        href = elt.attrib['href']
        href_parts = href.split('/')

        # The file name is the last path segment and the sitting is the
        # third-from-last; skip links too short to carry both, or with no
        # file name, instead of failing with IndexError / a bad path.
        if len(href_parts) < 3 or not href_parts[-1]:
            continue

        filename = href_parts[-1]
        sitting = href_parts[-3]
        sitting_directory = 'documents/' + sitting
        pdf_path = sitting_directory + '/' + filename

        # Create the per-sitting directory the first time it is needed;
        # urlretrieve will not create missing parent directories.
        if not os.path.exists(sitting_directory):
            os.makedirs(sitting_directory)

        urllib.urlretrieve(href, pdf_path)

        # Pass an argument list (no shell) so that spaces or shell
        # metacharacters in the URL-derived path cannot alter the command.
        subprocess.call(['pdftotext', '-raw', '-enc', 'UTF-8', pdf_path])

        # The PDF is removed once the conversion command has run, whatever
        # its exit status.
        os.remove(pdf_path)
