Created
February 16, 2021 10:58
-
-
Save carbonphyber/290a9573b1e9357f348863fe1bd837f0 to your computer and use it in GitHub Desktop.
OpenElection XML->CSV converter for Santa Clara County, California
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# File for reading a county XML and exporting to OpenElections CSV format | |
# The expected input file was | |
# - downloaded from: https://results.enr.clarityelections.com//CA/Santa_Clara/106043/272625/reports/detailxml.zip | |
# - linked from: https://results.enr.clarityelections.com/CA/Santa_Clara/106043/web.264614/#/summary | |
# Some Python code borrowed from: | |
# - https://www.geeksforgeeks.org/xml-parsing-python/ | |
import csv | |
import json | |
import sys | |
import xml.etree.ElementTree as ET | |
# Flip to True to print each supported contest, choice, vote-type element and
# assembled per-precinct row while the XML is being parsed.
DEBUG = False
# detect if the contest is the Presidential race from testing the name string | |
def is_contest_president(contest_name):
    """Return True only when the contest name is exactly the presidential contest label."""
    return contest_name == 'President and Vice President'
# detect if the contest is a Congressional race from testing the name string | |
def is_contest_congress_house(contest_name):
    """Return True when the contest name ends with ' Congressional'.

    Only the suffix (including its leading space) is inspected; whatever
    precedes it, such as an ordinal like '17th', is ignored here.
    """
    return contest_name.endswith(' Congressional')
# returns the district number of a congressional district race | |
def get_congress_house_district(contest_name):
    """Return the district number of a Congressional contest as an int.

    The name is expected to look like '<number><2-char ordinal suffix> Congressional'
    (e.g. '17th Congressional'); everything before the ordinal suffix is
    handed to int().

    Raises:
        Exception: if the name does not pass is_contest_congress_house().
        ValueError: if the text before the ordinal suffix is not an integer.
    """
    if is_contest_congress_house(contest_name):
        # drop the ' Congressional' suffix, then the two-letter 'th' / 'st' / ...
        ordinal = contest_name[:-len(' Congressional')]
        return int(ordinal[:-2])
    raise Exception('Not a Congressional race')
# detect if the contest is a state assembly race from testing the name string | |
def is_contest_state_assembly(contest_name):
    """Return True when the contest name ends with ' Assembly'.

    Only the suffix (including its leading space) is inspected; the ordinal
    that precedes it is ignored here.
    """
    return contest_name.endswith(' Assembly')
# detect if the contest is a state senate race from testing the name string | |
def is_contest_state_senate(contest_name):
    """Return True when the contest name ends with ' State Senate'.

    Only the suffix (including its leading space) is inspected; the ordinal
    that precedes it is ignored here.
    """
    return contest_name.endswith(' State Senate')
# map a raw contest name to the standardized office label ('' when the contest is not supported)
def get_office(contest_name):
    """Translate a raw contest name into the standardized office label.

    Returns 'President', 'U.S. House', 'State Assembly' or 'State Senate'
    for the first matching detector, and '' for every other contest
    (unsupported contests are reported with an empty string, not an error).
    """
    # Checked in this order; the first detector that matches wins.
    detectors = (
        (is_contest_president, 'President'),
        (is_contest_congress_house, 'U.S. House'),
        (is_contest_state_assembly, 'State Assembly'),
        (is_contest_state_senate, 'State Senate'),
    )
    for matches, label in detectors:
        if matches(contest_name):
            return label
    return ''
# | |
def get_district(contest_name, office):
    """Return the district number for a district-based contest, else ''.

    Args:
        contest_name: raw contest name from the XML, e.g. '17th Congressional'.
        office: standardized office label as produced by get_office().

    Returns:
        The district number as an int when `office` is 'U.S. House',
        'State Assembly' or 'State Senate' and the contest name contains a
        word that starts with digits; otherwise '' (which is written to the
        CSV as an empty cell).
    """
    # The district test has to look at the standardized office label: the
    # contest-name suffix checks never match labels such as 'U.S. House'.
    if office not in ('U.S. House', 'State Assembly', 'State Senate'):
        return ''
    # Take the leading digits of the first word that starts with a digit,
    # so '17th' -> 17 and '1st' -> 1 regardless of the ordinal suffix.
    for word in contest_name.split():
        digits = ''
        for char in word:
            if char not in '0123456789':
                break
            digits += char
        if digits:
            return int(digits)
    return ''
# Normalize the ticket name of a presidential line to the same string used in other county CSV files | |
# Presidential ticket text as it appears in the XML -> candidate name used in the CSV.
_PRESIDENT_TICKET_NAMES = {
    'JOSEPH R. BIDEN / KAMALA D. HARRIS': 'Joe Biden',
    'DONALD J. TRUMP / MICHAEL R. PENCE': 'Donald Trump',
    'GLORIA LA RIVA / SUNIL FREEMAN': 'Gloria LaRiva',
    'ROQUE "ROCKY" DE LA FUENTE GUERRA / KANYE OMARI WEST': 'Rocky de la Fuente Guerra',
    'HOWIE HAWKINS / ANGELA NICOLE WALKER': 'Howie Hawkins',
    'JO JORGENSEN / JEREMY "SPIKE" COHEN': 'Jo Jorgensen',
    'BRIAN CARROLL / AMAR PATEL': 'Brian Carroll',
    'JESSE VENTURA / CYNTHIA MCKINNEY': 'Jesse Ventura',
    'MARK CHARLES / ADRIAN WALLACE': 'Mark Charles',
    'JOSEPH KISHORE / NORISSA SANTA CRUZ': 'Joseph Kishore',
    'BROCK PIERCE / KARLA BALLARD': 'Brock Pierce',
}


def get_normalized_president_name(ticket_text):
    """Map a presidential ticket string to the normalized candidate name.

    The match is exact (case- and whitespace-sensitive).

    Raises:
        Exception: 'Unknown ticket: <ticket_text>' when the ticket is not
            one of the known tickets.
    """
    normalized = _PRESIDENT_TICKET_NAMES.get(ticket_text)
    if normalized is None:
        raise Exception('Unknown ticket: ' + ticket_text)
    return normalized
# Normalize the candidate name of a U.S. House line to the same string used in other county CSV files
# U.S. House candidate text as it appears in the XML -> candidate name used in the CSV.
_CONGRESS_HOUSE_CANDIDATE_NAMES = {
    'RO KHANNA': 'Ro Khanna',
    'RITESH TANDON': 'Ritesh Tandon',
    'ANNA G. ESHOO': 'Anna G. Eshoo',
    'RISHI KUMAR': 'Rishi Kumar',
    'ZOE LOFGREN': 'Zoe Lofgren',
    'JUSTIN JAMES AGUILERA': 'Justin James Aguilera',
    'JIMMY PANETTA': 'Jimmy Panetta',
    'JEFF GORMAN': 'Jeff Gorman',
}


def get_normalized_congress_house_name(ticket_text):
    """Map a U.S. House candidate string to the normalized candidate name.

    The match is exact (case- and whitespace-sensitive).

    Raises:
        Exception: 'Unknown ticket: <ticket_text>' when the candidate is not
            one of the known candidates.
    """
    normalized = _CONGRESS_HOUSE_CANDIDATE_NAMES.get(ticket_text)
    if normalized is None:
        raise Exception('Unknown ticket: ' + ticket_text)
    return normalized
def _turnout_rows(county, precincts, office, votes_attrib):
    """Build one summary row per precinct, taking 'votes' from `votes_attrib`."""
    return [
        {
            'county': county,
            'precinct': precinct.attrib['name'],
            'office': office,
            'district': '',
            'party': '',
            'candidate': '',
            # kept as the raw attribute string; turnout figures are not
            # broken down by election_day / mail in this XML input file
            'votes': precinct.attrib[votes_attrib],
            'election_day': '',
            'mail': '',
        }
        for precinct in precincts
    ]


def _add_vote_type(rows_by_precinct, vote_type, column, row_template):
    """Fold one vote-type element's per-precinct counts into `rows_by_precinct`.

    Each child's 'votes' attribute (0 when absent) is stored in `column` and
    added to the row's running 'votes' total; rows are created from
    `row_template` the first time a precinct name is seen.
    """
    if DEBUG:
        print(vote_type.tag, vote_type.attrib)
    for precinct in vote_type:
        if DEBUG:
            print(precinct.tag, precinct.attrib)
        precinct_name = precinct.attrib['name']
        if precinct_name not in rows_by_precinct:
            rows_by_precinct[precinct_name] = dict(row_template, precinct=precinct_name)
        votes = int(precinct.attrib.get('votes', 0))
        row = rows_by_precinct[precinct_name]
        row[column] = votes
        row['votes'] += votes


def parseXML(xmlfile):
    """Parse the county detail XML into a list of OpenElections row dicts.

    Args:
        xmlfile: path or file object accepted by ElementTree.parse().

    Returns:
        A list of dicts with the keys county, precinct, office, district,
        party, candidate, votes, election_day and mail, in this order:
        'Registered Voters' rows, 'Ballots Cast' rows, then one row per
        precinct for every choice of every supported contest.

    Raises:
        KeyError / IndexError when an expected attribute or child element is
        missing, and whatever the candidate-name normalizers raise for
        unknown tickets.

    The traversal is positional and therefore schema-specific: root[4] is
    read as the turnout element, root[5:] as contests, contest[2:] as the
    choices, and each choice's first two children as the Election Day and
    Vote By Mail vote types respectively. These indexes are brittle if
    another county's file is laid out differently.
    """
    root = ET.parse(xmlfile).getroot()
    # hard-coded.
    county = 'Santa Clara'
    precinct_vote_items = []

    # Turnout summary: all 'Registered Voters' rows first, then all 'Ballots Cast' rows.
    for voter_turnout in root[4:5]:
        precincts = voter_turnout[0]
        precinct_vote_items.extend(
            _turnout_rows(county, precincts, 'Registered Voters', 'totalVoters'))
        precinct_vote_items.extend(
            _turnout_rows(county, precincts, 'Ballots Cast', 'ballotsCast'))

    for contest in root[5:]:
        contest_name = contest.attrib['text']
        office = get_office(contest_name)
        if not office:
            # only contests that map to a supported office are exported
            continue
        if DEBUG:
            print('contest name: ' + contest_name)
        # the district depends only on the contest, so look it up once
        district = get_district(contest_name, office)

        for choice in contest[2:]:
            # robust against missing party / text attributes on the choice tag
            party = choice.attrib.get('party', '')
            ticket = choice.attrib.get('text', '')
            candidate = ticket
            if office == 'President':
                candidate = get_normalized_president_name(ticket)
            elif office == 'U.S. House':
                candidate = get_normalized_congress_house_name(ticket)

            row_template = {
                'county': county,
                'precinct': '',
                'office': office,
                'district': district,
                'party': party,
                'candidate': candidate,
                'votes': 0,
                'election_day': 0,
                'mail': 0,
            }
            # keyed on precinct name; each value becomes one CSV row for this candidate
            rows_by_precinct = {}
            if DEBUG:
                print(choice.tag, choice.attrib)
            _add_vote_type(rows_by_precinct, choice[0], 'election_day', row_template)
            _add_vote_type(rows_by_precinct, choice[1], 'mail', row_template)

            if DEBUG:
                for row in rows_by_precinct.values():
                    print(json.dumps(row, indent=4, sort_keys=True))
            precinct_vote_items.extend(rows_by_precinct.values())

    return precinct_vote_items
def savetoCSV(newsitems, filename):
    """Write the result rows to `filename` as a CSV with a header line.

    Args:
        newsitems: iterable of row dicts keyed by the column names below.
        filename: path of the CSV file to create or overwrite.
    """
    # define csv headers/fields
    fields = [
        'county',
        'precinct',
        'office',
        'district',
        'party',
        'candidate',
        'votes',
        'election_day',
        'mail',
    ]
    # Python module docs: https://docs.python.org/3/library/csv.html
    # newline='' lets the csv module control line endings itself; without it
    # every row ends in '\r\r\n' on platforms that translate '\n' on write.
    # The encoding is pinned so the output does not depend on the locale.
    with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fields, quoting=csv.QUOTE_MINIMAL)
        writer.writeheader()
        writer.writerows(newsitems)
def main(filename_in, filename_out):
    """Convert the county XML at `filename_in` into a CSV at `filename_out`."""
    # parse the XML into row dicts, then write them straight out as CSV
    savetoCSV(parseXML(filename_in), filename_out)
# Run with CLI: | |
# $ python3 ./read_county_xml.py input.xml ./2020/20201103__ca__general__santa_clara__precinct.csv | |
if __name__ == "__main__":
    # Expect exactly the input XML path and the output CSV path; any extra
    # arguments are ignored.
    if len(sys.argv) < 3:
        # sys.exit with a string prints it to stderr and exits with status 1,
        # so callers and shell scripts can detect the failure.
        sys.exit('bad arguments\nusage: ' + sys.argv[0] + ' INPUT_XML OUTPUT_CSV')
    main(sys.argv[1], sys.argv[2])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment