Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Nasty hack to get city of #HamOnt election data into a usable format.
#!/usr/bin/env python
"""
Grabs election data from the City of Hamilton's GEMS election data pages, converts it into a json object and an sql insert statement.
Yeah, I know: the code is seriously ugly-ass and inelegant. What the hell, it works.
Update: The City changed their election results pages and this script no longer works. I'm working on updating it.
Very Important Note: this code breaks on ward 14, where Rob Pasuta is acclaimed and there are no ward results on the web page.
Instead of fixing it in the code, for now I just fixed the SQL statement manually by removing the trustees from the results.
TODO: go back and update the code to account for wards where a candidate is acclaimed.
"""
import urllib2
import re
try:
import json
except:
import simplejson as json
proxy_handler = urllib2.ProxyHandler({})
opener = urllib2.build_opener(proxy_handler)
urllib2.install_opener(opener)
url_template = 'http://old.hamilton.ca/clerk/election/2010-election-results/PollByPoll/results-[[X]].htm'
def make_output():
output = {} # create output dict that will be converted into a JSON object
output["ok"] = "true"
output["results"] = []
# 207 polls
pages = xrange(1,208)
pages = xrange(1,11) # for testing
for page in pages:
url = url_template.replace('[[X]]', '%s' % (page))
request = urllib2.Request(url)
response = urllib2.urlopen(request)
content = response.read()
# remove white space
content = '\n'.join([line.strip() for line in content.split('\n')])
# print content
# get name
poll_name = ''
pattern = '<br>Summary For [0-9]{3,} - [a-zA-Z0-9.\',\- ()/@]+<br>'
match = re.search(pattern, content)
if match:
match_text = content[match.span()[0]:match.span()[1]]
#print match_text
match_text = match_text.replace('<br>Summary For ', '')
match_text = match_text.replace(', All Counters, All Races<br>', '')
poll_name = match_text
else:
print "Poll Name not found for poll %s" % (page)
# get registered voter count
registered_voter_count = 0
pattern = '<td>Registered Voters [0-9]+'
match = re.search(pattern, content)
if match:
match_text = content[match.span()[0]:match.span()[1]]
#print match_text
match_text = match_text.replace('<td>Registered Voters ', '').replace('</td>', '')
try:
registered_voter_count = int(match_text)
except:
pass
else:
print "Registered Voters not found for poll %s" % (page)
# get ward
ward = 0
pattern = '<th align=left>COUNCILLOR WARD [0-9]+</th>'
match = re.search(pattern, content)
if match:
match_text = content[match.span()[0]:match.span()[1]]
match_text = match_text.replace('<th align=left>COUNCILLOR WARD ', '').replace('</th>', '')
try:
ward = int(match_text)
except:
pass
# get candidate numbers on same line with tab separation
pattern = '</td>\n<td align=(right|left)>'
content2 = re.sub(pattern, '\t', content)
# remove HTML tags
pattern = '(\<(/?[^\>]+)\>)'
content3 = re.sub(pattern, '', content2)
# flag start of mayoral candidates
pattern = """MAYOR
Total
Number of Polls
1
Polls Reporting
1 100.0 %"""
content3 = content3.replace(pattern, '[[MAYOR STARTS]]')
# flag end of mayoral candidates and start of council candidates
pattern = """
COUNCILLOR WARD [0-9]+
Total
Number of Polls
1
Polls Reporting
1 100.0 %"""
content3 = re.sub(pattern, '[[COUNCIL STARTS]]', content3)
# get rid of multiple tabs
content3 = content3.replace('\t\t', '\t')
print content3 # testing
#start walking the rows to append the values to output['results']
which = ''
for row in content3.split('\n'):
# exit if finished appending mayoral and council candidates
if row.strip() == '' and which == 'Ward %s' % (ward):
break
if row.strip() == '[[MAYOR STARTS]]':
which = 'Mayor'
elif row.strip() == '[[COUNCIL STARTS]]':
which = 'Ward %s' % (ward)
if which != '' and '\t' in row.strip():
fields = row.split('\t')
this_dict = {
"poll": page,
"poll_name": poll_name,
"registered_voters": registered_voter_count,
"ward": ward,
"election": which,
"web_page": url,
"name": "",
"votes": "",
}
try: this_dict["name"] = fields[0]
except: pass
try: this_dict["votes"] = int(fields[1])
except: pass
output['results'].append(this_dict)
print 'Poll %s Complete' % (page)
# save json object
with open('hamont_election_results.json', 'w') as file:
file.write(json.dumps(output))
print "json saved"
def make_tab():
"""
Takes the JSON output from make_output() and converts it into an
SQL insert statement.
"""
with open('hamont_election_results.json') as file:
output = json.loads(file.read())
rows = []
results = output['results']
for r in results:
this_result = "(%s\t%s\t%s\t%s\t%s\t%s\%s\t%s)" % (
r['poll'], r['poll_name'], r['ward'], r['election'], r['name'], r['votes'], r['registered_voters'], r['web_page']
)
rows.append(this_result)
with open('hamont_election_results.tsv', 'w') as file:
file.write('\n'.join(rows))
print "tsv saved"
def make_sql():
"""
Takes the JSON output from make_output() and converts it into an
SQL insert statement.
"""
with open('hamont_election_results.json') as file:
output = json.loads(file.read())
# create SQL insert statement
statement = []
statement.append('insert into election_results')
statement.append('(poll, poll_name, ward, election, name, votes, registered_voters, web_page)')
statement.append('values')
rows = []
results = output['results']
for r in results:
this_result = "(%s, '%s', %s, '%s', '%s', %s, %s, '%s')" % (
r['poll'], r['poll_name'].replace("'", "\'"), r['ward'], r['election'], r['name'].replace("'", "\'"), r['votes'], r['registered_voters'], r['web_page']
)
rows.append(this_result)
all_rows = ', '.join(rows)
statement.append(all_rows)
with open('hamont_election_results.sql', 'w') as file:
file.write(' '.join(statement))
print "sql saved"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.