@reinvented
Created December 7, 2018 19:44
Python script to harvest Prince Edward Island Legislative Assembly Hansard PDFs and convert them to plain text files
#!/usr/bin/env python3
# You may need to do these:
#
# pip install lxml
# pip install requests
#
# The PDF-to-text conversion shells out to the pdftotext utility,
# which is part of the poppler-utils package.
from lxml import html
import requests
import urllib.request
import os
# Retrieve the main "Daily Debates" page from the Legislative Assembly website
page = requests.get('http://www.assembly.pe.ca/hansard/index.php')
tree = html.fromstring(page.content)
# The 'sittings' -- i.e. "Spring 2005", etc. -- are found in a <select name="selectsitting"> element.
sittings = tree.xpath("//select[@name='selectsitting']/option")
# Make a list of the sitting codes -- the values we can pass as the
# 'selectsitting' parameter to select dates for a given sitting
sittings_codes = [option.attrib['value'] for option in sittings]
# Make a list of the sitting names, like "Spring 2005"
sittings_names = [option.text for option in sittings]
# Iterate through each sitting
for sitting_code in sittings_codes:
    # Retrieve the calendar for the sitting
    page = requests.get('http://www.assembly.pe.ca/hansard/index.php?selectsitting=' + sitting_code + '&action=Go')
    tree = html.fromstring(page.content)
    # Get all the sitting days in this sitting by looking for links to the
    # Hansard PDFs hosted under http://www.assembly.pe.ca/sittings/
    for elt in tree.xpath("//a[contains(@href,'assembly.pe.ca/sittings/')]"):
        # The last path component of the link is the PDF filename; the
        # third-from-last component identifies the sitting it belongs to
        filename = elt.attrib['href'].split('/')[-1]
        sitting = elt.attrib['href'].split('/')[-3]
        sitting_directory = 'documents/' + sitting
        if not os.path.exists(sitting_directory):
            os.makedirs(sitting_directory)
        # Download the PDF, convert it to text with pdftotext, then
        # delete the PDF itself
        pdf_path = sitting_directory + '/' + filename
        urllib.request.urlretrieve(elt.attrib['href'], pdf_path)
        os.system('pdftotext -raw -enc UTF-8 %s' % pdf_path)
        os.remove(pdf_path)
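
If shelling out to pdftotext is undesirable, the conversion step could be done in pure Python instead. The sketch below is a hypothetical alternative (not part of the script above) using the extract_text() helper from pdfminer.six; the function name pdf_to_text and the example paths are illustrative only.

# Hypothetical pure-Python replacement for the pdftotext call above,
# using pdfminer.six's high-level API (pip install pdfminer.six)
from pdfminer.high_level import extract_text

def pdf_to_text(pdf_path, txt_path):
    # extract_text() returns the PDF's entire text content as one string
    text = extract_text(pdf_path)
    with open(txt_path, 'w', encoding='utf-8') as out:
        out.write(text)

# Example (illustrative paths):
# pdf_to_text('documents/fall2016/20161115hansard.pdf',
#             'documents/fall2016/20161115hansard.txt')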