RAbraham/tabs_scraper

## tabs_scraper
import json


def search_toronto_site(text,committee,from_date,to_date,item_status):
    """Submit the Toronto agenda-item search form and return the result page.

    Opens the TMMIS "find agenda item" page with twill, fills in the search
    form and submits it.

    Args:
        text: Value for the ``word_or_phrase`` field.
        committee: Value for the ``decision_body`` field. The field is left
            untouched when this is empty/falsy.
        from_date: Value for the ``fromDate`` field.
        to_date: Value for the ``toDate`` field.
        item_status: Value for the ``item_status`` field.

    Returns:
        Whatever ``twill.commands.show()`` returns after the submit
        (presumably the HTML of the results page -- confirm against the
        installed twill version).
    """
    import twill.commands as crawler

    # Passed as the first argument of every ``fv`` call to select the form.
    form_index = 1
    search_url = "http://app.toronto.ca/tmmis/findAgendaItem.do?function=doPrepare"

    # Load the search page before touching any of its form fields.
    crawler.go(search_url)

    # (field name, value) pairs, in the order they are filled in.
    field_values = [("word_or_phrase", text)]
    if committee:
        field_values.append(("decision_body", committee))
    field_values.extend([
        ("fromDate", from_date),
        ("toDate", to_date),
        ("item_status", item_status),
    ])

    for field_name, value in field_values:
        crawler.fv(form_index, field_name, value)

    crawler.submit()
    return crawler.show()

def extract_agenda_items(html_page):
    """Parse a search-results page into a list of agenda-item dicts.

    Looks up the table whose id is ``searchResultsTable``, skips its first
    (header) row and turns every remaining row into a dict.

    Args:
        html_page: Markup of the results page, as accepted by BeautifulSoup.

    Returns:
        A list of dicts with the keys ``meeting_date``, ``item_num``,
        ``item_url``, ``title`` and ``committee``. The list is empty when
        the page contains no ``searchResultsTable``.

    Raises:
        AttributeError: If a row lacks one of the expected cells.
        IndexError: If a row's markup contains no ``(`` to cut the URL from.
    """
    from bs4 import BeautifulSoup
    FIRST_ROW_AFTER_HEADER = 1
    SITE_ROOT = "http://app.toronto.ca"

    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed (and warns), so results could differ
    # between machines.
    soup = BeautifulSoup(html_page, "html.parser")

    results_table = soup.find("table", {"id": "searchResultsTable"})
    if results_table is None:
        # The page has no results table, so there is nothing to extract.
        return []

    agenda_items = []

    for result in results_table.find_all("tr")[FIRST_ROW_AFTER_HEADER:]:
        meeting_date = result.find("td", {"class": "meetingDate"}).get_text()
        item_num = result.find("td", {"class": "reference"}).find("a").get_text()
        # Take the text between the first "(" and the next ")" in the row's
        # markup and drop its first and last character (presumably the
        # quotes around a path handed to a JavaScript call -- confirm
        # against the live page).
        item_path = str(result).split('(')[1].split(')')[0][1:-1]
        title = result.find("td", {"class": "agendaItemTitle"}).get_text()
        committee = result.find("td", {"class": "decisionBodyName"}).get_text()
        agenda_items.append({
            'meeting_date': meeting_date,
            'item_num': item_num,
            'item_url': SITE_ROOT + item_path,
            'title': title,
            'committee': committee,
        })

    return agenda_items

#############################################################################################
#CONSTANTS


##################################################################################


# Sample search parameters. They are only referenced by the disabled query
# below; the active query submits every field empty.
TEXT = 'Announcements'
WORD_GRAFFITI = 'Request'
COMMITTEE = 'Aboriginal Affairs Committee (2010-2014)'
COM_GRAFFITI  = "Graffiti Panel"
FROM_DATE = "2012-11-30"
TO_DATE = "2012-11-30"
ITEM_STATUS = "Adopted"


# Filtered variant of the query, kept for reference:
#results_html = search_toronto_site(WORD_GRAFFITI,COM_GRAFFITI,FROM_DATE,TO_DATE,ITEM_STATUS)
results_html = search_toronto_site("","","","","")
agenda_items = extract_agenda_items(results_html)

print(results_html)
print("Total Results: %s" % (len(agenda_items)))
# Parenthesised so the line parses on Python 3; with a single argument it
# prints the same text on Python 2.
print(json.dumps(agenda_items, sort_keys=True, indent=4, separators=(',', ': ')))
	import json



	def search_toronto_site(text,committee,from_date,to_date,item_status):
	    """Fill in and submit the Toronto agenda-item search form.

	    Args:
	        text: Value for the ``word_or_phrase`` field.
	        committee: Value for the ``decision_body`` field; skipped when
	            empty/falsy.
	        from_date: Value for the ``fromDate`` field.
	        to_date: Value for the ``toDate`` field.
	        item_status: Value for the ``item_status`` field.

	    Returns:
	        The value returned by ``twill.commands.show()`` once the form has
	        been submitted (presumably the result page's HTML -- confirm
	        against the installed twill version).
	    """
	    from functools import partial
	    import twill.commands as crawler
	    SEARCH_FORM = 1
	    search_page = "http://app.toronto.ca/tmmis/findAgendaItem.do?function=doPrepare"
	    # Pre-bind the form selector so each call only names field and value.
	    fv = partial(crawler.fv,SEARCH_FORM)

	    # Open the page that holds the search form.
	    crawler.go(search_page)

	    # Fill in the search criteria; the committee is optional.
	    fv("word_or_phrase",text )
	    if committee:
	        fv("decision_body",committee)
	    fv("fromDate",from_date)
	    fv("toDate",to_date)
	    fv("item_status",item_status)

	    crawler.submit()
	    return crawler.show()

	def extract_agenda_items(html_page):
	    """Turn the rows of the search-results table into agenda-item dicts.

	    Finds the table with id ``searchResultsTable``, skips its first
	    (header) row and builds one dict per remaining row.

	    Args:
	        html_page: Markup of the results page, as accepted by
	            BeautifulSoup.

	    Returns:
	        A list of dicts keyed by ``meeting_date``, ``item_num``,
	        ``item_url``, ``title`` and ``committee``; empty when the page
	        has no ``searchResultsTable``.

	    Raises:
	        AttributeError: If a row lacks one of the expected cells.
	        IndexError: If a row's markup contains no ``(``.
	    """
	    from bs4 import BeautifulSoup
	    FIRST_ROW_AFTER_HEADER = 1

	    # An explicit parser avoids BeautifulSoup's "no parser specified"
	    # warning and keeps the parse independent of what is installed.
	    soup = BeautifulSoup(html_page, "html.parser")

	    table = soup.find("table", {"id": "searchResultsTable"})
	    if table is None:
	        # No results table on the page: report zero items.
	        return []

	    rows = table.find_all("tr")[FIRST_ROW_AFTER_HEADER:]

	    agenda_items = []

	    for result in rows:
	        meeting_date = result.find("td", {"class": "meetingDate"}).get_text()
	        item_num = result.find("td", {"class": "reference"}).find("a").get_text()
	        # Text between the first "(" and the next ")" of the row markup,
	        # minus its first and last character (presumably quotes around a
	        # path -- confirm against the live page).
	        item_url = "http://app.toronto.ca" + str(result).split('(')[1].split(')')[0][1:-1]
	        title = result.find("td", {"class": "agendaItemTitle"}).get_text()
	        committee = result.find("td", {"class": "decisionBodyName"}).get_text()
	        agenda_items.append({'meeting_date': meeting_date,
	                             'item_num': item_num,
	                             'item_url': item_url,
	                             'title': title,
	                             'committee': committee})

	    return agenda_items

	#############################################################################################
	#CONSTANTS


	##################################################################################


	# Sample search parameters; only the disabled query below refers to them.
	TEXT = 'Announcements'
	WORD_GRAFFITI = 'Request'
	COMMITTEE = 'Aboriginal Affairs Committee (2010-2014)'
	COM_GRAFFITI = "Graffiti Panel"
	FROM_DATE = "2012-11-30"
	TO_DATE = "2012-11-30"
	ITEM_STATUS = "Adopted"


	# Filtered variant of the query, kept for reference:
	#results_html = search_toronto_site(WORD_GRAFFITI,COM_GRAFFITI,FROM_DATE,TO_DATE,ITEM_STATUS)
	results_html = search_toronto_site("","","","","")
	agenda_items = extract_agenda_items(results_html)

	print(results_html)
	print("Total Results: %s" % (len(agenda_items)))
	# Parenthesised so the line parses on Python 3; with a single argument it
	# prints the same text on Python 2.
	print(json.dumps(agenda_items, sort_keys=True, indent=4, separators=(',', ': ')))