Skip to content

Instantly share code, notes, and snippets.

@RAbraham
Created December 2, 2012 21:35
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save RAbraham/4191170 to your computer and use it in GitHub Desktop.
Save RAbraham/4191170 to your computer and use it in GitHub Desktop.
Scraping the Toronto municipal government website for upcoming meetings based on input search text
import json
def search_toronto_site(text,committee,from_date,to_date,item_status):
    """Submit the TMMIS agenda-item search form and return the resulting page.

    Opens the search page with twill, fills in the search form with the
    supplied criteria and submits it.  The committee field is only filled in
    when ``committee`` is truthy; every other field is always set, even when
    the value is an empty string.

    Returns whatever ``twill.commands.show()`` returns after the submit
    (presumably the HTML of the results page -- confirm against twill docs).
    """
    import twill.commands as browser_cmds

    form_number = 1  # identifier twill uses to address the search form
    start_url = "http://app.toronto.ca/tmmis/findAgendaItem.do?function=doPrepare"

    # Load the search page so the form is available to fill in.
    browser_cmds.go(start_url)

    # Collect (field name, value) pairs in the order they are applied.
    field_values = [("word_or_phrase", text)]
    if committee:
        field_values.append(("decision_body", committee))
    field_values.append(("fromDate", from_date))
    field_values.append(("toDate", to_date))
    field_values.append(("item_status", item_status))

    for field_name, field_value in field_values:
        browser_cmds.fv(form_number, field_name, field_value)

    browser_cmds.submit()
    return browser_cmds.show()
def extract_agenda_items(html_page):
    """Parse a search-results page into a list of agenda-item dicts.

    Looks for the table with id ``searchResultsTable``, skips its first row
    (treated as the header) and turns each remaining row into a dict with
    the keys ``meeting_date``, ``item_num``, ``item_url``, ``title`` and
    ``committee``.

    Args:
        html_page: Markup of the results page, as accepted by BeautifulSoup.

    Returns:
        A list of dicts, one per result row.  An empty list is returned when
        the page contains no ``searchResultsTable`` at all.

    Raises:
        AttributeError: if a row lacks one of the expected cells/anchor.
        IndexError: if a row's markup contains no ``(...)`` section to take
            the item URL from.
    """
    from bs4 import BeautifulSoup

    FIRST_ROW_AFTER_HEADER = 1

    # Name the parser explicitly: without it bs4 picks whichever parser is
    # installed (and warns), so the same page could parse differently from
    # one machine to the next.  "html.parser" ships with Python.
    soup = BeautifulSoup(html_page, "html.parser")

    results_table = soup.find("table", {"id": "searchResultsTable"})
    if results_table is None:
        # find() returns None when the table is absent; report "no items"
        # instead of failing on None.find_all(...).
        return []

    rows = results_table.find_all("tr")[FIRST_ROW_AFTER_HEADER:]
    agenda_items = []
    for result in rows:
        meeting_date = result.find("td", {"class": "meetingDate"}).get_text()
        item_num = result.find("td", {"class": "reference"}).find("a").get_text()
        # The relative URL is taken from the row's raw markup: the text
        # between the first "(" and the next ")", minus its first and last
        # character (presumably the quotes around a JavaScript call
        # argument -- confirm against the live page).
        item_url = "http://app.toronto.ca" + str(result).split('(')[1].split(')')[0][1:-1]
        title = result.find("td", {"class": "agendaItemTitle"}).get_text()
        committee = result.find("td", {"class": "decisionBodyName"}).get_text()
        agenda_items.append({'meeting_date': meeting_date,
                             'item_num': item_num,
                             'item_url': item_url,
                             'title': title,
                             'committee': committee})
    return agenda_items
#############################################################################################
#CONSTANTS
##################################################################################
# Sample search criteria.  Only referenced by the commented-out example call
# below; the active call searches with every field left empty.
TEXT = 'Announcements'
WORD_GRAFFITI = 'Request'
COMMITTEE = 'Aboriginal Affairs Committee (2010-2014)'
COM_GRAFFITI = "Graffiti Panel"
FROM_DATE = "2012-11-30"
TO_DATE = "2012-11-30"
ITEM_STATUS = "Adopted"

# Example of a filtered search:
#results_html = search_toronto_site(WORD_GRAFFITI,COM_GRAFFITI,FROM_DATE,TO_DATE,ITEM_STATUS)
results_html = search_toronto_site("","","","","")
agenda_items = extract_agenda_items(results_html)
print(results_html)
print("Total Results: %s" % (len(agenda_items)))
# print is called as a function here, like the two lines above: the bare
# print statement is a SyntaxError on Python 3, while the single-argument
# call form prints the same text on Python 2.
print(json.dumps(agenda_items, sort_keys=True, indent=4, separators=(',', ': ')))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment