Nasty hack to get city of #HamOnt election data into a usable format.
#!/usr/bin/env python | |
""" | |
Grabs election data from the City of Hamilton's GEMS election data pages, converts it into a json object and an sql insert statement. | |
Yeah, I know: the code is seriously ugly-ass and inelegant. What the hell, it works. | |
Update: The City changed their election results pages and this script no longer works. I'm working on updating it. | |
Very Important Note: this code breaks on ward 14, where Rob Pasuta is acclaimed and there are no ward results on the web page. | |
Instead of fixing it in the code, for now I just fixed the SQL statement manually by removing the trustees from the results. | |
TODO: go back and update the code to account for wards where a candidate is acclaimed. | |
""" | |
import urllib2 | |
import re | |
try: | |
import json | |
except: | |
import simplejson as json | |
proxy_handler = urllib2.ProxyHandler({}) | |
opener = urllib2.build_opener(proxy_handler) | |
urllib2.install_opener(opener) | |
url_template = 'http://old.hamilton.ca/clerk/election/2010-election-results/PollByPoll/results-[[X]].htm' | |
def make_output(): | |
output = {} # create output dict that will be converted into a JSON object | |
output["ok"] = "true" | |
output["results"] = [] | |
# 207 polls | |
pages = xrange(1,208) | |
pages = xrange(1,11) # for testing | |
for page in pages: | |
url = url_template.replace('[[X]]', '%s' % (page)) | |
request = urllib2.Request(url) | |
response = urllib2.urlopen(request) | |
content = response.read() | |
# remove white space | |
content = '\n'.join([line.strip() for line in content.split('\n')]) | |
# print content | |
# get name | |
poll_name = '' | |
pattern = '<br>Summary For [0-9]{3,} - [a-zA-Z0-9.\',\- ()/@]+<br>' | |
match = re.search(pattern, content) | |
if match: | |
match_text = content[match.span()[0]:match.span()[1]] | |
#print match_text | |
match_text = match_text.replace('<br>Summary For ', '') | |
match_text = match_text.replace(', All Counters, All Races<br>', '') | |
poll_name = match_text | |
else: | |
print "Poll Name not found for poll %s" % (page) | |
# get registered voter count | |
registered_voter_count = 0 | |
pattern = '<td>Registered Voters [0-9]+' | |
match = re.search(pattern, content) | |
if match: | |
match_text = content[match.span()[0]:match.span()[1]] | |
#print match_text | |
match_text = match_text.replace('<td>Registered Voters ', '').replace('</td>', '') | |
try: | |
registered_voter_count = int(match_text) | |
except: | |
pass | |
else: | |
print "Registered Voters not found for poll %s" % (page) | |
# get ward | |
ward = 0 | |
pattern = '<th align=left>COUNCILLOR WARD [0-9]+</th>' | |
match = re.search(pattern, content) | |
if match: | |
match_text = content[match.span()[0]:match.span()[1]] | |
match_text = match_text.replace('<th align=left>COUNCILLOR WARD ', '').replace('</th>', '') | |
try: | |
ward = int(match_text) | |
except: | |
pass | |
# get candidate numbers on same line with tab separation | |
pattern = '</td>\n<td align=(right|left)>' | |
content2 = re.sub(pattern, '\t', content) | |
# remove HTML tags | |
pattern = '(\<(/?[^\>]+)\>)' | |
content3 = re.sub(pattern, '', content2) | |
# flag start of mayoral candidates | |
pattern = """MAYOR | |
Total | |
Number of Polls | |
1 | |
Polls Reporting | |
1 100.0 %""" | |
content3 = content3.replace(pattern, '[[MAYOR STARTS]]') | |
# flag end of mayoral candidates and start of council candidates | |
pattern = """ | |
COUNCILLOR WARD [0-9]+ | |
Total | |
Number of Polls | |
1 | |
Polls Reporting | |
1 100.0 %""" | |
content3 = re.sub(pattern, '[[COUNCIL STARTS]]', content3) | |
# get rid of multiple tabs | |
content3 = content3.replace('\t\t', '\t') | |
print content3 # testing | |
#start walking the rows to append the values to output['results'] | |
which = '' | |
for row in content3.split('\n'): | |
# exit if finished appending mayoral and council candidates | |
if row.strip() == '' and which == 'Ward %s' % (ward): | |
break | |
if row.strip() == '[[MAYOR STARTS]]': | |
which = 'Mayor' | |
elif row.strip() == '[[COUNCIL STARTS]]': | |
which = 'Ward %s' % (ward) | |
if which != '' and '\t' in row.strip(): | |
fields = row.split('\t') | |
this_dict = { | |
"poll": page, | |
"poll_name": poll_name, | |
"registered_voters": registered_voter_count, | |
"ward": ward, | |
"election": which, | |
"web_page": url, | |
"name": "", | |
"votes": "", | |
} | |
try: this_dict["name"] = fields[0] | |
except: pass | |
try: this_dict["votes"] = int(fields[1]) | |
except: pass | |
output['results'].append(this_dict) | |
print 'Poll %s Complete' % (page) | |
# save json object | |
with open('hamont_election_results.json', 'w') as file: | |
file.write(json.dumps(output)) | |
print "json saved" | |
def make_tab(): | |
""" | |
Takes the JSON output from make_output() and converts it into an | |
SQL insert statement. | |
""" | |
with open('hamont_election_results.json') as file: | |
output = json.loads(file.read()) | |
rows = [] | |
results = output['results'] | |
for r in results: | |
this_result = "(%s\t%s\t%s\t%s\t%s\t%s\%s\t%s)" % ( | |
r['poll'], r['poll_name'], r['ward'], r['election'], r['name'], r['votes'], r['registered_voters'], r['web_page'] | |
) | |
rows.append(this_result) | |
with open('hamont_election_results.tsv', 'w') as file: | |
file.write('\n'.join(rows)) | |
print "tsv saved" | |
def make_sql(): | |
""" | |
Takes the JSON output from make_output() and converts it into an | |
SQL insert statement. | |
""" | |
with open('hamont_election_results.json') as file: | |
output = json.loads(file.read()) | |
# create SQL insert statement | |
statement = [] | |
statement.append('insert into election_results') | |
statement.append('(poll, poll_name, ward, election, name, votes, registered_voters, web_page)') | |
statement.append('values') | |
rows = [] | |
results = output['results'] | |
for r in results: | |
this_result = "(%s, '%s', %s, '%s', '%s', %s, %s, '%s')" % ( | |
r['poll'], r['poll_name'].replace("'", "\'"), r['ward'], r['election'], r['name'].replace("'", "\'"), r['votes'], r['registered_voters'], r['web_page'] | |
) | |
rows.append(this_result) | |
all_rows = ', '.join(rows) | |
statement.append(all_rows) | |
with open('hamont_election_results.sql', 'w') as file: | |
file.write(' '.join(statement)) | |
print "sql saved" |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment