Skip to content

Instantly share code, notes, and snippets.

Created May 17, 2011 17:26
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save anonymous/976909 to your computer and use it in GitHub Desktop.
Save anonymous/976909 to your computer and use it in GitHub Desktop.
Simple Python script to scrape the government of Canada's Orders-in-Council database
import re
import time
import urllib
import urllib2
# Defines the function that processes the text using regular expressions and pulls out what we need and saves to a file
def resultScraper(the_page):
    """Parse one page of OIC search-result HTML and append each record to oics.txt.

    Each record becomes one line of the form
    "<number> <date> <department> <authority> DETAIL: <text>".

    the_page -- raw HTML of one search-results page (a string).
    Returns None; side effects are printing each record and appending it
    to oics.txt in the current directory.
    """
    # Flatten the markup onto a single line so the regexes below can match
    # across what were originally line breaks.
    the_results_page = re.sub(r"\r", "", the_page)
    the_results_page = re.sub(r"\n", "", the_results_page)
    the_results_page = re.sub(r" +", " ", the_results_page)
    # Open the output file once in append mode and close it reliably --
    # the original code opened a new handle per record and never closed it.
    with open("oics.txt", "a") as f:
        for match in re.finditer(r'(\d\d\d\d-\d\d\d\d)</td>(.+?)<td valign="top">(\d\d\d\d-\d\d-\d\d)</td>.+?<table border="0" cellspacing="0" cellpadding="0">.+?<tr> <td> <strong> (.+?)</strong>(.+?)Attachments:.+?<a href=\'(.+?)\'>', the_results_page):
            oic_auth = ""  # default when no enabling-authority block is found
            oic_no = match.group(1)
            oic_date = match.group(3)
            oic_dept = match.group(4)
            oic_text = match.group(5)
            # Absolute link to the attachment (captured but not part of the
            # listing line written below).
            oic_href = "http://www.pco-bcp.gc.ca/" + match.group(6)
            oic_href = re.sub("&amp;", "&", oic_href)
            # The enabling authority, when present, sits inside a nested table
            # within the captured detail text.
            for every in re.finditer(r'(</td> </tr> </table> </td> </tr> <tr> <td colspan="6"> <table border="0" cellspacing="0" cellpadding="0"> <tr> <td> <strong> )(.+?)</strong>', oic_text):
                oic_auth = every.group(2)
            # Strip <strong> headings and any remaining tags, then collapse
            # runs of spaces left behind by the tag removal.
            oic_detail = re.sub("<strong>.+?</strong>", "", oic_text)
            oic_detail = re.sub("<.+?>", "", oic_detail)
            oic_detail = re.sub(" +", " ", oic_detail)
            listing = oic_no + " " + oic_date + " " + oic_dept + " " + oic_auth + " DETAIL: " + oic_detail + "\n"
            print(listing)
            f.write(listing)
# Defines the function that pulls down the first five OIC listings, then passes the result to the resultScraper function
def oicGetter(counter):
    """Download one page of the OIC search results and pass it to resultScraper.

    counter -- 1-based page offset sent as the 'pg' query parameter.

    Sleeps one second after each request so the scraper stays polite to the
    server. Bug fixes versus the original: 'time' was never imported (the
    sleep raised NameError), and the HTTP response was never closed.
    """
    url = ('http://www.pco-bcp.gc.ca/oic-ddc.asp?lang=eng&Page=secretariats'
           '&txtOICID=&txtFromDate=&txtToDate=&txtPrecis=&txtDepartment='
           '&txtAct=&txtChapterNo=&txtChapterYear=&txtBillNo='
           '&rdoComingIntoForce=&DoSearch=Search+%2F+List&pg=' + str(counter))
    req = urllib2.Request(url)
    # Identify the scraper and its operator so server admins can get in touch.
    req.add_header('User-Agent', 'Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 6.0) If webscrape causes problems, call Glen McGregor 613.235.6685')
    response = urllib2.urlopen(req)
    try:
        the_page = response.read()
    finally:
        response.close()  # always release the connection, even if read() fails
    time.sleep(1)  # throttle: one request per second
    resultScraper(the_page)
# Runs the main function and incrementally increases the page count by 5 each time
# Walk the paginated results: pages 1, 6, 11, ..., 46 (step of 5 per fetch).
for counter in range(1, 50, 5):
    oicGetter(counter)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment