quandyfactory/hamont_election_results.py

## hamont_election_results.py
#!/usr/bin/env python

"""
Grabs election data from the City of Hamilton's GEMS election data pages, converts it into a json object and an sql insert statement.
Yeah, I know: the code is seriously ugly-ass and inelegant. What the hell, it works.

Update: The City changed their election results pages and this script no longer works. I'm working on updating it.

Very Important Note: this code breaks on ward 14, where Rob Pasuta is acclaimed and there are no ward results on the web page.
Instead of fixing it in the code, for now I just fixed the SQL statement manually by removing the trustees from the results.
TODO: go back and update the code to account for wards where a candidate is acclaimed.
"""

import urllib2
import re

try:
    import json
except:
    import simplejson as json

proxy_handler = urllib2.ProxyHandler({})
opener = urllib2.build_opener(proxy_handler)
urllib2.install_opener(opener)

url_template = 'http://old.hamilton.ca/clerk/election/2010-election-results/PollByPoll/results-[[X]].htm'

def make_output():
    output = {} # create output dict that will be converted into a JSON object
    output["ok"] = "true"
    output["results"] = []

    # 207 polls
    pages = xrange(1,208)
    pages = xrange(1,11) # for testing

    for page in pages:
        url = url_template.replace('[[X]]', '%s' % (page))
        request = urllib2.Request(url)
        response = urllib2.urlopen(request)
        content = response.read()

        # remove white space
        content = '\n'.join([line.strip() for line in content.split('\n')])

        # print content

        # get name
        poll_name = ''
        pattern = '<br>Summary For [0-9]{3,} - [a-zA-Z0-9.\',\- ()/@]+<br>'
        match = re.search(pattern, content)
        if match:
            match_text = content[match.span()[0]:match.span()[1]]
            #print match_text
            match_text = match_text.replace('<br>Summary For ', '')
            match_text = match_text.replace(', All Counters, All Races<br>', '')
            poll_name = match_text
        else:
            print "Poll Name not found for poll %s" % (page)

        # get registered voter count
        registered_voter_count = 0
        pattern = '<td>Registered Voters [0-9]+'
        match = re.search(pattern, content)
        if match:
            match_text = content[match.span()[0]:match.span()[1]]
            #print match_text
            match_text = match_text.replace('<td>Registered Voters ', '').replace('</td>', '')
            try:
                registered_voter_count = int(match_text)
            except:
                pass
        else:
            print "Registered Voters not found for poll %s" % (page)

        # get ward
        ward = 0
        pattern = '<th align=left>COUNCILLOR WARD [0-9]+</th>'
        match = re.search(pattern, content)
        if match:
            match_text = content[match.span()[0]:match.span()[1]]
            match_text = match_text.replace('<th align=left>COUNCILLOR WARD ', '').replace('</th>', '')
            try:
                ward = int(match_text)
            except:
                pass

        # get candidate numbers on same line with tab separation
        pattern = '</td>\n<td align=(right|left)>'
        content2 = re.sub(pattern, '\t', content)

        # remove HTML tags
        pattern = '(\<(/?[^\>]+)\>)'
        content3 = re.sub(pattern, '', content2)

        # flag start of mayoral candidates
        pattern = """MAYOR

Total

Number of Polls
1

Polls Reporting
1	100.0  %"""
        content3 = content3.replace(pattern, '[[MAYOR STARTS]]')

        # flag end of mayoral candidates and start of council candidates
        pattern = """

COUNCILLOR WARD [0-9]+

Total

Number of Polls
1

Polls Reporting
1	100.0  %"""
        content3 = re.sub(pattern, '[[COUNCIL STARTS]]', content3)

        # get rid of multiple tabs
        content3 = content3.replace('\t\t', '\t')

        print content3 # testing

        #start walking the rows to append the values to output['results']
        which = ''
        for row in content3.split('\n'):
            # exit if finished appending mayoral and council candidates
            if row.strip() == '' and which == 'Ward %s' % (ward):
                break
            if row.strip() == '[[MAYOR STARTS]]':
                which = 'Mayor'
            elif row.strip() == '[[COUNCIL STARTS]]':
                which = 'Ward %s' % (ward)
            if which != '' and '\t' in row.strip():
                fields = row.split('\t')
                this_dict = {
                    "poll": page,
                    "poll_name": poll_name,
                    "registered_voters": registered_voter_count,
                    "ward": ward,
                    "election": which,
                    "web_page": url,
                    "name": "",
                    "votes": "",
                }
                try: this_dict["name"] = fields[0]
                except: pass
                try: this_dict["votes"] = int(fields[1])
                except: pass

                output['results'].append(this_dict)

        print 'Poll %s Complete' % (page)

    # save json object
    with open('hamont_election_results.json', 'w') as file:
        file.write(json.dumps(output))

    print "json saved"

def make_tab():
    """
    Takes the JSON output from make_output() and converts it into an
    SQL insert statement.
    """
    with open('hamont_election_results.json') as file:
        output = json.loads(file.read())
    rows = []

    results = output['results']
    for r in results:
        this_result = "(%s\t%s\t%s\t%s\t%s\t%s\%s\t%s)" % (
            r['poll'], r['poll_name'], r['ward'], r['election'], r['name'], r['votes'], r['registered_voters'], r['web_page']
        )
        rows.append(this_result)

    with open('hamont_election_results.tsv', 'w') as file:
        file.write('\n'.join(rows))

    print "tsv saved"


def make_sql():
    """
    Takes the JSON output from make_output() and converts it into an
    SQL insert statement.
    """
    with open('hamont_election_results.json') as file:
        output = json.loads(file.read())
    # create SQL insert statement
    statement = []
    statement.append('insert into election_results')
    statement.append('(poll, poll_name, ward, election, name, votes, registered_voters, web_page)')
    statement.append('values')
    rows = []

    results = output['results']
    for r in results:
        this_result = "(%s, '%s', %s, '%s', '%s', %s, %s, '%s')" % (
            r['poll'], r['poll_name'].replace("'", "\'"), r['ward'], r['election'], r['name'].replace("'", "\'"), r['votes'], r['registered_voters'], r['web_page']
        )
        rows.append(this_result)

    all_rows = ', '.join(rows)
    statement.append(all_rows)

    with open('hamont_election_results.sql', 'w') as file:
        file.write(' '.join(statement))

    print "sql saved"
	#!/usr/bin/env python

	"""
	Grabs election data from the City of Hamilton's GEMS election data pages, converts it into a json object and an sql insert statement.
	Yeah, I know: the code is seriously ugly-ass and inelegant. What the hell, it works.

	Update: The City changed their election results pages and this script no longer works. I'm working on updating it.

	Very Important Note: this code breaks on ward 14, where Rob Pasuta is acclaimed and there are no ward results on the web page.
	Instead of fixing it in the code, for now I just fixed the SQL statement manually by removing the trustees from the results.
	TODO: go back and update the code to account for wards where a candidate is acclaimed.
	"""

	import urllib2
	import re

	try:
	import json
	except:
	import simplejson as json

	proxy_handler = urllib2.ProxyHandler({})
	opener = urllib2.build_opener(proxy_handler)
	urllib2.install_opener(opener)

	url_template = 'http://old.hamilton.ca/clerk/election/2010-election-results/PollByPoll/results-[[X]].htm'

	def make_output():
	output = {} # create output dict that will be converted into a JSON object
	output["ok"] = "true"
	output["results"] = []

	# 207 polls
	pages = xrange(1,208)
	pages = xrange(1,11) # for testing

	for page in pages:
	url = url_template.replace('[[X]]', '%s' % (page))
	request = urllib2.Request(url)
	response = urllib2.urlopen(request)
	content = response.read()

	# remove white space
	content = '\n'.join([line.strip() for line in content.split('\n')])

	# print content

	# get name
	poll_name = ''
	pattern = '<br>Summary For [0-9]{3,} - [a-zA-Z0-9.\',\- ()/@]+<br>'
	match = re.search(pattern, content)
	if match:
	match_text = content[match.span()[0]:match.span()[1]]
	#print match_text
	match_text = match_text.replace('<br>Summary For ', '')
	match_text = match_text.replace(', All Counters, All Races<br>', '')
	poll_name = match_text
	else:
	print "Poll Name not found for poll %s" % (page)

	# get registered voter count
	registered_voter_count = 0
	pattern = '<td>Registered Voters [0-9]+'
	match = re.search(pattern, content)
	if match:
	match_text = content[match.span()[0]:match.span()[1]]
	#print match_text
	match_text = match_text.replace('<td>Registered Voters ', '').replace('</td>', '')
	try:
	registered_voter_count = int(match_text)
	except:
	pass
	else:
	print "Registered Voters not found for poll %s" % (page)

	# get ward
	ward = 0
	pattern = '<th align=left>COUNCILLOR WARD [0-9]+</th>'
	match = re.search(pattern, content)
	if match:
	match_text = content[match.span()[0]:match.span()[1]]
	match_text = match_text.replace('<th align=left>COUNCILLOR WARD ', '').replace('</th>', '')
	try:
	ward = int(match_text)
	except:
	pass

	# get candidate numbers on same line with tab separation
	pattern = '</td>\n<td align=(right\|left)>'
	content2 = re.sub(pattern, '\t', content)

	# remove HTML tags
	pattern = '(\<(/?[^\>]+)\>)'
	content3 = re.sub(pattern, '', content2)

	# flag start of mayoral candidates
	pattern = """MAYOR

	Total

	Number of Polls
	1

	Polls Reporting
	1 100.0 %"""
	content3 = content3.replace(pattern, '[[MAYOR STARTS]]')

	# flag end of mayoral candidates and start of council candidates
	pattern = """

	COUNCILLOR WARD [0-9]+

	Total

	Number of Polls
	1

	Polls Reporting
	1 100.0 %"""
	content3 = re.sub(pattern, '[[COUNCIL STARTS]]', content3)

	# get rid of multiple tabs
	content3 = content3.replace('\t\t', '\t')

	print content3 # testing

	#start walking the rows to append the values to output['results']
	which = ''
	for row in content3.split('\n'):
	# exit if finished appending mayoral and council candidates
	if row.strip() == '' and which == 'Ward %s' % (ward):
	break
	if row.strip() == '[[MAYOR STARTS]]':
	which = 'Mayor'
	elif row.strip() == '[[COUNCIL STARTS]]':
	which = 'Ward %s' % (ward)
	if which != '' and '\t' in row.strip():
	fields = row.split('\t')
	this_dict = {
	"poll": page,
	"poll_name": poll_name,
	"registered_voters": registered_voter_count,
	"ward": ward,
	"election": which,
	"web_page": url,
	"name": "",
	"votes": "",
	}
	try: this_dict["name"] = fields[0]
	except: pass
	try: this_dict["votes"] = int(fields[1])
	except: pass

	output['results'].append(this_dict)

	print 'Poll %s Complete' % (page)

	# save json object
	with open('hamont_election_results.json', 'w') as file:
	file.write(json.dumps(output))

	print "json saved"

	def make_tab():
	"""
	Takes the JSON output from make_output() and converts it into an
	SQL insert statement.
	"""
	with open('hamont_election_results.json') as file:
	output = json.loads(file.read())
	rows = []

	results = output['results']
	for r in results:
	this_result = "(%s\t%s\t%s\t%s\t%s\t%s\%s\t%s)" % (
	r['poll'], r['poll_name'], r['ward'], r['election'], r['name'], r['votes'], r['registered_voters'], r['web_page']
	)
	rows.append(this_result)

	with open('hamont_election_results.tsv', 'w') as file:
	file.write('\n'.join(rows))

	print "tsv saved"


	def make_sql():
	"""
	Takes the JSON output from make_output() and converts it into an
	SQL insert statement.
	"""
	with open('hamont_election_results.json') as file:
	output = json.loads(file.read())
	# create SQL insert statement
	statement = []
	statement.append('insert into election_results')
	statement.append('(poll, poll_name, ward, election, name, votes, registered_voters, web_page)')
	statement.append('values')
	rows = []

	results = output['results']
	for r in results:
	this_result = "(%s, '%s', %s, '%s', '%s', %s, %s, '%s')" % (
	r['poll'], r['poll_name'].replace("'", "\'"), r['ward'], r['election'], r['name'].replace("'", "\'"), r['votes'], r['registered_voters'], r['web_page']
	)
	rows.append(this_result)

	all_rows = ', '.join(rows)
	statement.append(all_rows)

	with open('hamont_election_results.sql', 'w') as file:
	file.write(' '.join(statement))

	print "sql saved"