# /orders.py

## orders.py
import re
import time
import urllib
import urllib2


# Defines the function that processes the text using regular expressions and pulls out what we need and saves to a file
# Matches one OIC row in the whitespace-normalised results page.
# Groups: 1 = OIC number, 2 = filler between cells, 3 = date,
# 4 = first <strong> heading (used as the department), 5 = raw HTML that
# holds the optional authority heading and the detail text,
# 6 = attachment href.
_OIC_ROW_RE = re.compile(
    r'(\d\d\d\d-\d\d\d\d)</td>(.+?)<td valign="top">(\d\d\d\d-\d\d-\d\d)</td>'
    r'.+?<table border="0" cellspacing="0" cellpadding="0">'
    r'.+?<tr> <td> <strong> (.+?)</strong>(.+?)Attachments:'
    r".+?<a href='(.+?)'>"
)

# Matches the second <strong> heading inside group 5 of a row; group 2 is
# the heading text (used as the authority).
_OIC_AUTHORITY_RE = re.compile(
    r'(</td> </tr> </table> </td> </tr> <tr> <td colspan="6"> '
    r'<table border="0" cellspacing="0" cellpadding="0"> <tr> <td> <strong> )'
    r'(.+?)</strong>'
)


def resultScraper(the_page, output_path="oics.txt"):
    """Extract OIC listings from a results page, print them and save them.

    Carriage returns and newlines are removed and runs of two or more
    spaces are collapsed to one before matching, so the patterns can rely
    on single-space separators between tags.

    For every matched row one line of the form
    ``<number> <date> <department> <authority> DETAIL: <detail>`` is printed
    and appended to ``output_path``. The authority is empty when the row has
    no authority heading; if several match, the last one wins. The
    attachment link must be present for a row to match, but it is not part
    of the output line.

    Args:
        the_page: HTML text of one results page.
        output_path: File the listings are appended to. Nothing is created
            or written when the page contains no matching rows.
    """
    page = the_page.replace("\r", "").replace("\n", "")
    page = re.sub("  +", " ", page)

    listings = []
    for match in _OIC_ROW_RE.finditer(page):
        oic_no = match.group(1)
        oic_date = match.group(3)
        oic_dept = match.group(4)
        oic_text = match.group(5)

        oic_auth = ""
        for auth_match in _OIC_AUTHORITY_RE.finditer(oic_text):
            oic_auth = auth_match.group(2)

        # Drop the headings first, then every remaining tag, then tidy the
        # whitespace the removed tags leave behind.
        oic_detail = re.sub("<strong>.+?</strong>", "", oic_text)
        oic_detail = re.sub("<.+?>", "", oic_detail)
        oic_detail = re.sub("  +", " ", oic_detail)

        listing = (oic_no + " " + oic_date + " " + oic_dept + " " + oic_auth
                   + " DETAIL: " + oic_detail + "\n")

        # Parenthesised form prints the same on Python 2 and Python 3.
        print(listing)
        listings.append(listing)

    # Open the file once and close it deterministically instead of leaking
    # one unclosed handle per matched row.
    if listings:
        with open(output_path, "a") as out_file:
            out_file.writelines(listings)


# Defines the function that pulls down the first five OIC listings, then passes the result to the resultScraper function

# Search URL for the OIC listing; the value for the trailing `pg` query
# parameter is appended per request.
_OIC_SEARCH_URL = 'http://www.pco-bcp.gc.ca/oic-ddc.asp?lang=eng&Page=secretariats&txtOICID=&txtFromDate=&txtToDate=&txtPrecis=&txtDepartment=&txtAct=&txtChapterNo=&txtChapterYear=&txtBillNo=&rdoComingIntoForce=&DoSearch=Search+%2F+List&pg='

# User-Agent sent with every request; it includes contact details.
_OIC_USER_AGENT = 'Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 6.0) If webscrape causes problems, call Glen McGregor 613.235.6685'


def oicGetter(counter):
    """Download one page of OIC search results and hand it to resultScraper.

    Args:
        counter: Value sent as the ``pg`` query parameter of the search URL.

    Raises:
        urllib2.URLError: If the request fails (propagated from
            ``urllib2.urlopen``).
    """
    req = urllib2.Request(_OIC_SEARCH_URL + str(counter))
    req.add_header('User-Agent', _OIC_USER_AGENT)

    response = urllib2.urlopen(req)
    try:
        the_page = response.read()
    finally:
        # Release the connection even if reading fails part-way.
        response.close()

    # Pause for one second after each request.
    time.sleep(1)

    resultScraper(the_page)


# Runs the main function and incrementally increases the page count by 5 each time
# Request the listing with pg = 1, 6, 11, ... 46.
#
# An indentation-stripped second copy of this whole module had been pasted
# into the loop body. It could not be parsed (function bodies were not
# indented) and would have redefined both functions and reset `counter`
# on every pass, so only the driver loop is kept.
for counter in range(1, 50, 5):
    oicGetter(counter)