Created
October 27, 2010 16:20
-
-
Save quandyfactory/649368 to your computer and use it in GitHub Desktop.
Nasty hack to get city of #HamOnt election data into a usable format.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
""" | |
Grabs election data from the City of Hamilton's GEMS election data pages, converts it into a json object and an sql insert statement. | |
Yeah, I know: the code is seriously ugly-ass and inelegant. What the hell, it works. | |
Update: The City changed their election results pages and this script no longer works. I'm working on updating it. | |
Very Important Note: this code breaks on ward 14, where Rob Pasuta is acclaimed and there are no ward results on the web page. | |
Instead of fixing it in the code, for now I just fixed the SQL statement manually by removing the trustees from the results. | |
TODO: go back and update the code to account for wards where a candidate is acclaimed. | |
""" | |
import urllib2 | |
import re | |
try: | |
import json | |
except: | |
import simplejson as json | |
proxy_handler = urllib2.ProxyHandler({}) | |
opener = urllib2.build_opener(proxy_handler) | |
urllib2.install_opener(opener) | |
url_template = 'http://old.hamilton.ca/clerk/election/2010-election-results/PollByPoll/results-[[X]].htm' | |
def make_output(): | |
output = {} # create output dict that will be converted into a JSON object | |
output["ok"] = "true" | |
output["results"] = [] | |
# 207 polls | |
pages = xrange(1,208) | |
pages = xrange(1,11) # for testing | |
for page in pages: | |
url = url_template.replace('[[X]]', '%s' % (page)) | |
request = urllib2.Request(url) | |
response = urllib2.urlopen(request) | |
content = response.read() | |
# remove white space | |
content = '\n'.join([line.strip() for line in content.split('\n')]) | |
# print content | |
# get name | |
poll_name = '' | |
pattern = '<br>Summary For [0-9]{3,} - [a-zA-Z0-9.\',\- ()/@]+<br>' | |
match = re.search(pattern, content) | |
if match: | |
match_text = content[match.span()[0]:match.span()[1]] | |
#print match_text | |
match_text = match_text.replace('<br>Summary For ', '') | |
match_text = match_text.replace(', All Counters, All Races<br>', '') | |
poll_name = match_text | |
else: | |
print "Poll Name not found for poll %s" % (page) | |
# get registered voter count | |
registered_voter_count = 0 | |
pattern = '<td>Registered Voters [0-9]+' | |
match = re.search(pattern, content) | |
if match: | |
match_text = content[match.span()[0]:match.span()[1]] | |
#print match_text | |
match_text = match_text.replace('<td>Registered Voters ', '').replace('</td>', '') | |
try: | |
registered_voter_count = int(match_text) | |
except: | |
pass | |
else: | |
print "Registered Voters not found for poll %s" % (page) | |
# get ward | |
ward = 0 | |
pattern = '<th align=left>COUNCILLOR WARD [0-9]+</th>' | |
match = re.search(pattern, content) | |
if match: | |
match_text = content[match.span()[0]:match.span()[1]] | |
match_text = match_text.replace('<th align=left>COUNCILLOR WARD ', '').replace('</th>', '') | |
try: | |
ward = int(match_text) | |
except: | |
pass | |
# get candidate numbers on same line with tab separation | |
pattern = '</td>\n<td align=(right|left)>' | |
content2 = re.sub(pattern, '\t', content) | |
# remove HTML tags | |
pattern = '(\<(/?[^\>]+)\>)' | |
content3 = re.sub(pattern, '', content2) | |
# flag start of mayoral candidates | |
pattern = """MAYOR | |
Total | |
Number of Polls | |
1 | |
Polls Reporting | |
1 100.0 %""" | |
content3 = content3.replace(pattern, '[[MAYOR STARTS]]') | |
# flag end of mayoral candidates and start of council candidates | |
pattern = """ | |
COUNCILLOR WARD [0-9]+ | |
Total | |
Number of Polls | |
1 | |
Polls Reporting | |
1 100.0 %""" | |
content3 = re.sub(pattern, '[[COUNCIL STARTS]]', content3) | |
# get rid of multiple tabs | |
content3 = content3.replace('\t\t', '\t') | |
print content3 # testing | |
#start walking the rows to append the values to output['results'] | |
which = '' | |
for row in content3.split('\n'): | |
# exit if finished appending mayoral and council candidates | |
if row.strip() == '' and which == 'Ward %s' % (ward): | |
break | |
if row.strip() == '[[MAYOR STARTS]]': | |
which = 'Mayor' | |
elif row.strip() == '[[COUNCIL STARTS]]': | |
which = 'Ward %s' % (ward) | |
if which != '' and '\t' in row.strip(): | |
fields = row.split('\t') | |
this_dict = { | |
"poll": page, | |
"poll_name": poll_name, | |
"registered_voters": registered_voter_count, | |
"ward": ward, | |
"election": which, | |
"web_page": url, | |
"name": "", | |
"votes": "", | |
} | |
try: this_dict["name"] = fields[0] | |
except: pass | |
try: this_dict["votes"] = int(fields[1]) | |
except: pass | |
output['results'].append(this_dict) | |
print 'Poll %s Complete' % (page) | |
# save json object | |
with open('hamont_election_results.json', 'w') as file: | |
file.write(json.dumps(output)) | |
print "json saved" | |
def make_tab(): | |
""" | |
Takes the JSON output from make_output() and converts it into an | |
SQL insert statement. | |
""" | |
with open('hamont_election_results.json') as file: | |
output = json.loads(file.read()) | |
rows = [] | |
results = output['results'] | |
for r in results: | |
this_result = "(%s\t%s\t%s\t%s\t%s\t%s\%s\t%s)" % ( | |
r['poll'], r['poll_name'], r['ward'], r['election'], r['name'], r['votes'], r['registered_voters'], r['web_page'] | |
) | |
rows.append(this_result) | |
with open('hamont_election_results.tsv', 'w') as file: | |
file.write('\n'.join(rows)) | |
print "tsv saved" | |
def make_sql(): | |
""" | |
Takes the JSON output from make_output() and converts it into an | |
SQL insert statement. | |
""" | |
with open('hamont_election_results.json') as file: | |
output = json.loads(file.read()) | |
# create SQL insert statement | |
statement = [] | |
statement.append('insert into election_results') | |
statement.append('(poll, poll_name, ward, election, name, votes, registered_voters, web_page)') | |
statement.append('values') | |
rows = [] | |
results = output['results'] | |
for r in results: | |
this_result = "(%s, '%s', %s, '%s', '%s', %s, %s, '%s')" % ( | |
r['poll'], r['poll_name'].replace("'", "\'"), r['ward'], r['election'], r['name'].replace("'", "\'"), r['votes'], r['registered_voters'], r['web_page'] | |
) | |
rows.append(this_result) | |
all_rows = ', '.join(rows) | |
statement.append(all_rows) | |
with open('hamont_election_results.sql', 'w') as file: | |
file.write(' '.join(statement)) | |
print "sql saved" |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment