Skip to content

Instantly share code, notes, and snippets.

@carbonphyber
Created February 16, 2021 10:58
Show Gist options
  • Save carbonphyber/290a9573b1e9357f348863fe1bd837f0 to your computer and use it in GitHub Desktop.
Save carbonphyber/290a9573b1e9357f348863fe1bd837f0 to your computer and use it in GitHub Desktop.
OpenElections XML->CSV converter for Santa Clara County, California
# File for reading a county XML and exporting to OpenElections CSV format
# The expected input file was
# - downloaded from: https://results.enr.clarityelections.com//CA/Santa_Clara/106043/272625/reports/detailxml.zip
# - linked from: https://results.enr.clarityelections.com/CA/Santa_Clara/106043/web.264614/#/summary
# Some Python code borrowed from:
# - https://www.geeksforgeeks.org/xml-parsing-python/
import csv
import json
import sys
import xml.etree.ElementTree as ET
# Set to True to print diagnostics to stdout while parsing: contest names,
# the tag/attributes of each XML element visited, and every assembled row.
DEBUG = False
# detect if the contest is the Presidential race from testing the name string
def is_contest_president(contest_name):
    """Return True when *contest_name* is the Presidential contest.

    The match is an exact, case-sensitive comparison against the contest
    title 'President and Vice President'.
    """
    # The comparison already yields a bool; no conditional expression needed.
    return contest_name == 'President and Vice President'
# detect if the contest is a Congressional race from testing the name string
def is_contest_congress_house(contest_name):
    """Return True when *contest_name* names a Congressional (U.S. House) contest.

    A contest is treated as Congressional when its name ends with
    ' Congressional' (leading space included), e.g. '17th Congressional'.
    Whatever precedes that suffix (the ordinal district number) is not
    inspected here.
    """
    # endswith() avoids keeping a hand-counted slice length in sync with the suffix.
    return contest_name.endswith(' Congressional')
# returns the district number of a congressional district race
def get_congress_house_district(contest_name):
    """Return the district number of a Congressional contest as an int.

    *contest_name* is expected to look like '17th Congressional': an ordinal
    district number followed by ' Congressional'.

    Raises:
        Exception: if *contest_name* is not a Congressional contest.
        ValueError: if no integer precedes the ' Congressional' suffix.
    """
    if not is_contest_congress_house(contest_name):
        raise Exception('Not a Congressional race')
    # Derive the cut point from the suffix itself rather than a hand-counted
    # offset, then drop the ordinal letters ("st", "nd", "rd", "th") if any.
    ordinal = contest_name[:-len(' Congressional')].strip()
    return int(ordinal.rstrip('dhnrstDHNRST'))
# detect if the contest is a state assembly race from testing the name string
def is_contest_state_assembly(contest_name):
    """Return True when *contest_name* names a State Assembly contest.

    A contest is treated as a State Assembly race when its name ends with
    ' Assembly' (leading space included), e.g. '28th Assembly'. The text
    before that suffix is not inspected.
    """
    # endswith() avoids keeping a hand-counted slice length in sync with the suffix.
    return contest_name.endswith(' Assembly')
# detect if the contest is a state senate race from testing the name string
def is_contest_state_senate(contest_name):
    """Return True when *contest_name* names a State Senate contest.

    A contest is treated as a State Senate race when its name ends with
    ' State Senate' (leading space included), e.g. '15th State Senate'. The
    text before that suffix is not inspected.
    """
    # endswith() avoids keeping a hand-counted slice length in sync with the suffix.
    return contest_name.endswith(' State Senate')
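# Map a contest name to the standardized office string used in the CSV ('' when the contest is not supported)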
def get_office(contest_name):
    """Translate a contest name into the standardized office label.

    Returns one of 'President', 'U.S. House', 'State Assembly' or
    'State Senate'. Any other contest yields the empty string.
    """
    if is_contest_president(contest_name):
        return 'President'
    if is_contest_congress_house(contest_name):
        return 'U.S. House'
    if is_contest_state_assembly(contest_name):
        return 'State Assembly'
    if is_contest_state_senate(contest_name):
        return 'State Senate'
    # No other races are supported: report that with '' rather than raising.
    return ''
#
def get_district(contest_name, office):
    """Return the district number for a contest, or '' when there is none.

    Args:
        contest_name: Contest title from the XML, e.g. '17th Congressional',
            '28th Assembly' or '15th State Senate'.
        office: Standardized office label for the contest ('President',
            'U.S. House', 'State Assembly' or 'State Senate').

    Returns:
        The leading number of *contest_name* as an int when *office* is a
        district-based office, otherwise ''. '' is also returned when a
        district-based contest name does not start with a number.
    """
    # Decide from the standardized office label. The contest-name predicates
    # test name suffixes such as ' Congressional', which a label like
    # 'U.S. House' never matches.
    district_offices = ('U.S. House', 'State Assembly', 'State Senate')
    if office not in district_offices:
        return ''
    # Collect the leading run of ASCII digits; this ignores the ordinal
    # suffix ("st", "nd", "rd", "th") whatever its length.
    digits = ''
    for char in contest_name.strip():
        if char not in '0123456789':
            break
        digits += char
    return int(digits) if digits else ''
# Normalize the ticket name of a presidential line to the same string used in other county CSV files
# Presidential ticket text as it appears in the county XML -> candidate name
# as written to the CSV.
_PRESIDENT_TICKET_NAMES = {
    'JOSEPH R. BIDEN / KAMALA D. HARRIS': 'Joe Biden',
    'DONALD J. TRUMP / MICHAEL R. PENCE': 'Donald Trump',
    'GLORIA LA RIVA / SUNIL FREEMAN': 'Gloria LaRiva',
    'ROQUE "ROCKY" DE LA FUENTE GUERRA / KANYE OMARI WEST': 'Rocky de la Fuente Guerra',
    'HOWIE HAWKINS / ANGELA NICOLE WALKER': 'Howie Hawkins',
    'JO JORGENSEN / JEREMY "SPIKE" COHEN': 'Jo Jorgensen',
    'BRIAN CARROLL / AMAR PATEL': 'Brian Carroll',
    'JESSE VENTURA / CYNTHIA MCKINNEY': 'Jesse Ventura',
    'MARK CHARLES / ADRIAN WALLACE': 'Mark Charles',
    'JOSEPH KISHORE / NORISSA SANTA CRUZ': 'Joseph Kishore',
    'BROCK PIERCE / KARLA BALLARD': 'Brock Pierce',
}


def get_normalized_president_name(ticket_text):
    """Return the normalized candidate name for a presidential ticket.

    *ticket_text* must match one of the known ticket strings exactly
    (case-sensitive, including punctuation and spacing).

    Raises:
        Exception: with message 'Unknown ticket: <ticket_text>' when the
            ticket is not in the table.
    """
    normalized = _PRESIDENT_TICKET_NAMES.get(ticket_text)
    if normalized is None:
        raise Exception('Unknown ticket: ' + ticket_text)
    return normalized
# Normalize the candidate name of a congressional line to the same string used in other county CSV files
# Congressional candidate text as it appears in the county XML -> candidate
# name as written to the CSV.
_CONGRESS_HOUSE_TICKET_NAMES = {
    'RO KHANNA': 'Ro Khanna',
    'RITESH TANDON': 'Ritesh Tandon',
    'ANNA G. ESHOO': 'Anna G. Eshoo',
    'RISHI KUMAR': 'Rishi Kumar',
    'ZOE LOFGREN': 'Zoe Lofgren',
    'JUSTIN JAMES AGUILERA': 'Justin James Aguilera',
    'JIMMY PANETTA': 'Jimmy Panetta',
    'JEFF GORMAN': 'Jeff Gorman',
}


def get_normalized_congress_house_name(ticket_text):
    """Return the normalized candidate name for a U.S. House candidate.

    *ticket_text* must match one of the known candidate strings exactly
    (case-sensitive, including punctuation and spacing).

    Raises:
        Exception: with message 'Unknown ticket: <ticket_text>' when the
            candidate is not in the table.
    """
    normalized = _CONGRESS_HOUSE_TICKET_NAMES.get(ticket_text)
    if normalized is None:
        raise Exception('Unknown ticket: ' + ticket_text)
    return normalized
def parseXML(xmlfile):
    """Parse a county results XML file into a list of CSV row dicts.

    The traversal is positional and tied to the layout of the Santa Clara
    County export this script was written for; other counties may order or
    count their elements differently:

    * ``root[4]`` holds voter turnout. Its first child contains one element
      per precinct carrying ``name``, ``totalVoters`` and ``ballotsCast``
      attributes.
    * Every later child of the root is a contest whose ``text`` attribute is
      the contest name. Contests that ``get_office`` does not recognise are
      skipped.
    * The first two children of a contest are skipped; the remaining children
      are the choices (candidates).
    * A choice's first child lists election-day votes per precinct and its
      second child lists vote-by-mail votes per precinct.

    Args:
        xmlfile: Path or file object accepted by ``ET.parse``.

    Returns:
        A list of dicts keyed by 'county', 'precinct', 'office', 'district',
        'party', 'candidate', 'votes', 'election_day' and 'mail'. Turnout rows
        come first (all 'Registered Voters' rows, then all 'Ballots Cast'
        rows) and carry the raw attribute string in 'votes'. Contest rows
        follow, one per candidate and precinct, with integer counts.

    Raises:
        KeyError: if an expected XML attribute is missing.
        IndexError: if the turnout element or a choice has fewer children
            than the layout above requires.
        Exception: from the name normalizers when a presidential or
            congressional ticket is unknown.
    """
    root = ET.parse(xmlfile).getroot()
    # hard-coded.
    county = 'Santa Clara'
    precinct_vote_items = []

    # Turnout summary. These counts are not broken down by election day /
    # mail in the input, so those two columns stay blank.
    turnout_columns = (
        ('Registered Voters', 'totalVoters'),
        ('Ballots Cast', 'ballotsCast'),
    )
    for voter_turnout in root[4:5]:
        precincts = voter_turnout[0]
        for turnout_office, attribute in turnout_columns:
            for precinct in precincts:
                precinct_vote_items.append(_make_row(
                    county,
                    precinct.attrib['name'],
                    turnout_office,
                    votes=precinct.attrib[attribute],
                ))

    # (child index within a choice, CSV column that receives its votes)
    vote_type_columns = ((0, 'election_day'), (1, 'mail'))

    for contest in root[5:]:
        contest_name = contest.attrib['text']
        office = get_office(contest_name)
        if not office:
            # Not one of the supported offices.
            continue
        if DEBUG:
            print('contest name: ' + contest_name)
        # The district depends only on the contest, so compute it once here
        # instead of once per precinct row.
        district = get_district(contest_name, office)

        for choice in contest[2:]:
            # Tolerate choices that lack a party or text attribute.
            party = choice.attrib.get('party', '')
            ticket = choice.attrib.get('text', '')
            if office == 'President':
                candidate = get_normalized_president_name(ticket)
            elif office == 'U.S. House':
                candidate = get_normalized_congress_house_name(ticket)
            else:
                candidate = ticket

            # precinct name -> the CSV row for this candidate in that precinct
            rows_by_precinct = {}
            if DEBUG:
                print(choice.tag, choice.attrib)
            for child_index, column in vote_type_columns:
                vote_type = choice[child_index]
                if DEBUG:
                    print(vote_type.tag, vote_type.attrib)
                for precinct in vote_type:
                    if DEBUG:
                        print(precinct.tag, precinct.attrib)
                    precinct_name = precinct.attrib['name']
                    row = rows_by_precinct.get(precinct_name)
                    if row is None:
                        row = _make_row(
                            county, precinct_name, office,
                            district=district, party=party,
                            candidate=candidate,
                            votes=0, election_day=0, mail=0,
                        )
                        rows_by_precinct[precinct_name] = row
                    # A missing votes attribute counts as zero.
                    count = int(precinct.attrib.get('votes', 0))
                    row[column] = count
                    row['votes'] += count

            for row in rows_by_precinct.values():
                if DEBUG:
                    print(json.dumps(row, indent=4, sort_keys=True))
                precinct_vote_items.append(row)
    return precinct_vote_items


def _make_row(county, precinct, office, district='', party='', candidate='',
              votes=0, election_day='', mail=''):
    """Build one CSV row dict; the defaults suit the turnout summary rows."""
    return {
        'county': county,
        'precinct': precinct,
        'office': office,
        'district': district,
        'party': party,
        'candidate': candidate,
        'votes': votes,
        'election_day': election_day,
        'mail': mail,
    }
def savetoCSV(newsitems, filename):
    """Write row dicts to *filename* as a CSV file with a header line.

    Args:
        newsitems: Iterable of dicts keyed by the column names below. A
            missing key is written as an empty field; an unexpected key makes
            ``csv.DictWriter`` raise ValueError.
        filename: Path of the CSV file to create or overwrite.
    """
    # define csv headers/fields
    fields = [
        'county',
        'precinct',
        'office',
        'district',
        'party',
        'candidate',
        'votes',
        'election_day',
        'mail',
    ]
    # Python module docs: https://docs.python.org/3/library/csv.html
    # newline='' lets the csv module control line endings itself; without it
    # platforms that translate '\n' write '\r\r\n' and rows appear
    # double-spaced. The explicit encoding keeps the output independent of
    # the locale.
    with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fields, quoting=csv.QUOTE_MINIMAL)
        writer.writeheader()
        writer.writerows(newsitems)
def main(filename_in, filename_out):
    """Convert the county XML at *filename_in* into a CSV at *filename_out*."""
    # Parse the XML into row dicts, then hand them straight to the CSV writer.
    savetoCSV(parseXML(filename_in), filename_out)
# Run with CLI:
# $ python3 ./read_county_xml.py input.xml ./2020/20201103__ca__general__santa_clara__precinct.csv
if __name__ == "__main__":
    if len(sys.argv) < 3:
        # Report on stderr and exit non-zero so a calling shell or script can
        # tell that nothing was converted.
        print('bad arguments', file=sys.stderr)
        print('usage: ' + sys.argv[0] + ' input.xml output.csv', file=sys.stderr)
        sys.exit(2)
    # First argument: XML file to read. Second argument: CSV file to write.
    main(sys.argv[1], sys.argv[2])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment