# carbonphyber/read_county_xml.py

## read_county_xml.py
# File for reading a county XML and exporting to OpenElections CSV format
# The expected input file was
# - downloaded from: https://results.enr.clarityelections.com//CA/Santa_Clara/106043/272625/reports/detailxml.zip
# - linked from: https://results.enr.clarityelections.com/CA/Santa_Clara/106043/web.264614/#/summary
# Some Python code borrowed from:
# - https://www.geeksforgeeks.org/xml-parsing-python/

import csv
import json
import sys
import xml.etree.ElementTree as ET


# Set to True to make parseXML print the XML elements and result rows it handles.
DEBUG = False

def is_contest_president(contest_name):
    """Return True when *contest_name* is exactly the presidential contest title."""
    return contest_name == 'President and Vice President'

def is_contest_congress_house(contest_name):
    """Return True when *contest_name* ends with ' Congressional'.

    Only the suffix is tested; the text in front of it (expected to be an
    ordinal such as '17th') is not inspected here.
    """
    return contest_name.endswith(' Congressional')

def get_congress_house_district(contest_name):
    """Return the district number of a Congressional contest as an int.

    The number is read from the word directly in front of the
    ' Congressional' suffix, e.g. '17th Congressional' -> 17.  A two-letter
    ordinal ending ('st', 'nd', 'rd', 'th') is stripped when present, and
    any words ahead of the ordinal are ignored.

    Raises:
        ValueError: if the name does not end in ' Congressional', or if no
            number can be read in front of the suffix.
    """
    suffix = ' Congressional'
    if not contest_name.endswith(suffix):
        raise ValueError('Not a Congressional race')
    words = contest_name[:-len(suffix)].split()
    if not words:
        raise ValueError('No district number in contest: ' + contest_name)
    ordinal = words[-1]
    if ordinal[-2:].lower() in ('st', 'nd', 'rd', 'th'):
        ordinal = ordinal[:-2]
    # int() raises ValueError by itself when what is left is not a number.
    return int(ordinal)

def is_contest_state_assembly(contest_name):
    """Return True when *contest_name* ends with ' Assembly'.

    Only the suffix is tested; the text in front of it (expected to be an
    ordinal such as '25th') is not inspected here.
    """
    return contest_name.endswith(' Assembly')

def is_contest_state_senate(contest_name):
    """Return True when *contest_name* ends with ' State Senate'.

    Only the suffix is tested; the text in front of it (expected to be an
    ordinal such as '15th') is not inspected here.
    """
    return contest_name.endswith(' State Senate')

def get_office(contest_name):
    """Map a contest title from the XML to its standardized office name.

    Returns 'President', 'U.S. House', 'State Assembly' or 'State Senate'.
    Any other contest yields '' (no exception is raised for it).
    """
    detectors = (
        (is_contest_president, 'President'),
        (is_contest_congress_house, 'U.S. House'),
        (is_contest_state_assembly, 'State Assembly'),
        (is_contest_state_senate, 'State Senate'),
    )
    for matches, office_name in detectors:
        if matches(contest_name):
            return office_name
    # No other races are supported.
    return ''

def get_district(contest_name, office):
    """Return the district number for a district-based contest, else ''.

    Args:
        contest_name: contest title from the XML, e.g. '17th Congressional'.
        office: standardized office name ('U.S. House', 'State Assembly',
            'State Senate', or anything else for a non-district office).

    Returns:
        The district as an int when *office* is district-based and a number
        can be read from the word in front of the matching title suffix;
        '' in every other case.
    """
    suffix_by_office = {
        'U.S. House': ' Congressional',
        'State Assembly': ' Assembly',
        'State Senate': ' State Senate',
    }
    suffix = suffix_by_office.get(office)
    if suffix is None or not contest_name.endswith(suffix):
        return ''
    words = contest_name[:-len(suffix)].split()
    if not words:
        return ''
    ordinal = words[-1]
    if ordinal[-2:].lower() in ('st', 'nd', 'rd', 'th'):
        ordinal = ordinal[:-2]
    # Best effort: a title without a readable number leaves the column blank
    # instead of aborting the whole export.
    return int(ordinal) if ordinal.isdigit() else ''

def get_normalized_president_name(ticket_text):
    """Return the candidate name used in other county CSV files for a presidential ticket.

    *ticket_text* is the full 'PRESIDENT / VICE PRESIDENT' ticket string.
    An Exception is raised for any ticket that is not in the table below.
    """
    names_by_ticket = {
        'JOSEPH R. BIDEN / KAMALA D. HARRIS': 'Joe Biden',
        'DONALD J. TRUMP / MICHAEL R. PENCE': 'Donald Trump',
        'GLORIA LA RIVA / SUNIL FREEMAN': 'Gloria LaRiva',
        'ROQUE "ROCKY" DE LA FUENTE GUERRA / KANYE OMARI WEST': 'Rocky de la Fuente Guerra',
        'HOWIE HAWKINS / ANGELA NICOLE WALKER': 'Howie Hawkins',
        'JO JORGENSEN / JEREMY "SPIKE" COHEN': 'Jo Jorgensen',
        'BRIAN CARROLL / AMAR PATEL': 'Brian Carroll',
        'JESSE VENTURA / CYNTHIA MCKINNEY': 'Jesse Ventura',
        'MARK CHARLES / ADRIAN WALLACE': 'Mark Charles',
        'JOSEPH KISHORE / NORISSA SANTA CRUZ': 'Joseph Kishore',
        'BROCK PIERCE / KARLA BALLARD': 'Brock Pierce',
    }
    normalized = names_by_ticket.get(ticket_text)
    if normalized is None:
        raise Exception('Unknown ticket: ' + ticket_text)
    return normalized

def get_normalized_congress_house_name(ticket_text):
    """Return the candidate name used in other county CSV files for a U.S. House candidate.

    *ticket_text* is the upper-case candidate name from the XML.  An
    Exception is raised for any name that is not in the table below.
    """
    names_by_ticket = {
        'RO KHANNA': 'Ro Khanna',
        'RITESH TANDON': 'Ritesh Tandon',
        'ANNA G. ESHOO': 'Anna G. Eshoo',
        'RISHI KUMAR': 'Rishi Kumar',
        'ZOE LOFGREN': 'Zoe Lofgren',
        'JUSTIN JAMES AGUILERA': 'Justin James Aguilera',
        'JIMMY PANETTA': 'Jimmy Panetta',
        'JEFF GORMAN': 'Jeff Gorman',
    }
    normalized = names_by_ticket.get(ticket_text)
    if normalized is None:
        raise Exception('Unknown ticket: ' + ticket_text)
    return normalized

def _turnout_rows(county, precincts, office, attrib_name):
    """Build one candidate-less summary row per precinct from *attrib_name*."""
    return [
        {
            'county': county,
            'precinct': precinct.attrib['name'],
            'office': office,
            'district': '',
            'party': '',
            'candidate': '',
            # Kept as the raw attribute text; these totals are not broken
            # down by election_day / mail in the XML input file.
            'votes': precinct.attrib[attrib_name],
            'election_day': '',
            'mail': '',
        }
        for precinct in precincts
    ]


def _add_vote_type(rows_by_precinct, vote_type, column, row_defaults):
    """Add the per-precinct counts of one vote type to *rows_by_precinct*.

    Rows are created on demand as copies of *row_defaults*.  Each count is
    added to *column* ('election_day' or 'mail') and to the 'votes' total.
    """
    for precinct in vote_type:
        if DEBUG:
            print(precinct.tag, precinct.attrib)
        precinct_name = precinct.attrib['name']
        row = rows_by_precinct.get(precinct_name)
        if row is None:
            row = dict(row_defaults, precinct=precinct_name)
            rows_by_precinct[precinct_name] = row
        # A missing votes attribute counts as zero.
        count = int(precinct.attrib.get('votes', 0))
        # Accumulate (rather than overwrite) so this column always agrees
        # with 'votes', even if a precinct is listed more than once.
        row[column] += count
        row['votes'] += count


def parseXML(xmlfile):
    """Parse the county detail XML into a list of per-precinct result rows.

    The traversal is positional and therefore tied to this input file's
    layout: child 4 of the root is the turnout element, whose first child
    holds the precinct elements ('name', 'totalVoters', 'ballotsCast');
    children 5 onward are contests; within a contest, children 2 onward are
    the choices; and the first two children of a choice are read as the
    Election Day and Vote By Mail vote types respectively.

    Args:
        xmlfile: path or file object accepted by ElementTree.parse.

    Returns:
        A list of dicts with the keys county, precinct, office, district,
        party, candidate, votes, election_day and mail: first the
        'Registered Voters' rows, then the 'Ballots Cast' rows, then one
        row per precinct for every choice of every supported contest.
    """
    root = ET.parse(xmlfile).getroot()

    # hard-coded.
    county = 'Santa Clara'

    precinct_vote_items = []

    # The slice (instead of root[4]) simply yields nothing when the
    # document has fewer than five children.
    for voter_turnout in root[4:5]:
        precincts = voter_turnout[0]
        precinct_vote_items.extend(
            _turnout_rows(county, precincts, 'Registered Voters', 'totalVoters'))
        precinct_vote_items.extend(
            _turnout_rows(county, precincts, 'Ballots Cast', 'ballotsCast'))

    for contest in root[5:]:
        contest_name = contest.attrib['text']
        office = get_office(contest_name)
        if not office:
            # Not one of the supported offices.
            continue
        if DEBUG:
            print('contest name: ' + contest_name)
        # The district depends only on the contest, so resolve it once here
        # instead of once per precinct row.
        district = get_district(contest_name, office)

        for choice in contest[2:]:
            # Tolerate choices without a party or text attribute.
            party = choice.attrib.get('party', '')
            ticket = choice.attrib.get('text', '')
            if office == 'President':
                candidate = get_normalized_president_name(ticket)
            elif office == 'U.S. House':
                candidate = get_normalized_congress_house_name(ticket)
            else:
                candidate = ticket

            row_defaults = {
                'county': county,
                'precinct': '',
                'office': office,
                'district': district,
                'party': party,
                'candidate': candidate,
                'votes': 0,
                'election_day': 0,
                'mail': 0,
            }
            # Keyed on precinct name; each value becomes one CSV row for
            # this candidate.
            rows_by_precinct = {}

            if DEBUG:
                print(choice.tag, choice.attrib)
            for index, column in enumerate(('election_day', 'mail')):
                vote_type = choice[index]
                if DEBUG:
                    print(vote_type.tag, vote_type.attrib)
                _add_vote_type(rows_by_precinct, vote_type, column, row_defaults)

            for row in rows_by_precinct.values():
                if DEBUG:
                    print(json.dumps(row, indent=4, sort_keys=True))
                precinct_vote_items.append(row)

    return precinct_vote_items

def savetoCSV(newsitems, filename):
    """Write the result rows to *filename* as CSV, preceded by a header line.

    Args:
        newsitems: iterable of row dicts keyed by the column names listed
            in ``fields`` below.
        filename: path of the CSV file to create or overwrite.
    """
    fields = [
        'county',
        'precinct',
        'office',
        'district',
        'party',
        'candidate',
        'votes',
        'election_day',
        'mail',
    ]

    # Python module docs: https://docs.python.org/3/library/csv.html
    # The csv module asks for newline='' so that its own line terminators
    # are written untranslated (otherwise blank lines appear on Windows).
    # The explicit encoding keeps the output independent of the locale.
    with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fields, quoting=csv.QUOTE_MINIMAL)
        writer.writeheader()
        writer.writerows(newsitems)


def main(filename_in, filename_out):
    """Convert the county XML at *filename_in* into a CSV file at *filename_out*."""
    savetoCSV(parseXML(filename_in), filename_out)

# Run with CLI:
# $ python3 ./read_county_xml.py input.xml ./2020/20201103__ca__general__santa_clara__precinct.csv
if __name__ == "__main__":
    if len(sys.argv) < 3:
        # sys.exit with a string writes it to stderr and exits with status 1,
        # so shells and scripts can tell that no conversion was run.
        sys.exit('bad arguments\nusage: python3 read_county_xml.py INPUT_XML OUTPUT_CSV')
    main(sys.argv[1], sys.argv[2])
	# File for reading a county XML and exporting to OpenElections CSV format
	# The expected input file was
	# - downloaded from: https://results.enr.clarityelections.com//CA/Santa_Clara/106043/272625/reports/detailxml.zip
	# - linked from: https://results.enr.clarityelections.com/CA/Santa_Clara/106043/web.264614/#/summary
	# Some Python code borrowed from:
	# - https://www.geeksforgeeks.org/xml-parsing-python/

	import csv
	import json
	import sys
	import xml.etree.ElementTree as ET


	# Set to True to make parseXML print the XML elements and result rows it handles.
	DEBUG = False

	def is_contest_president(contest_name):
	    """Return True when *contest_name* is exactly the presidential contest title."""
	    return contest_name == 'President and Vice President'

	def is_contest_congress_house(contest_name):
	    """Return True when *contest_name* ends with ' Congressional'.

	    Only the suffix is tested; the text in front of it (expected to be an
	    ordinal such as '17th') is not inspected here.
	    """
	    return contest_name.endswith(' Congressional')

	def get_congress_house_district(contest_name):
	    """Return the district number of a Congressional contest as an int.

	    The number is read from the word directly in front of the
	    ' Congressional' suffix, e.g. '17th Congressional' -> 17.  A two-letter
	    ordinal ending ('st', 'nd', 'rd', 'th') is stripped when present, and
	    any words ahead of the ordinal are ignored.

	    Raises:
	        ValueError: if the name does not end in ' Congressional', or if no
	            number can be read in front of the suffix.
	    """
	    suffix = ' Congressional'
	    if not contest_name.endswith(suffix):
	        raise ValueError('Not a Congressional race')
	    words = contest_name[:-len(suffix)].split()
	    if not words:
	        raise ValueError('No district number in contest: ' + contest_name)
	    ordinal = words[-1]
	    if ordinal[-2:].lower() in ('st', 'nd', 'rd', 'th'):
	        ordinal = ordinal[:-2]
	    # int() raises ValueError by itself when what is left is not a number.
	    return int(ordinal)

	def is_contest_state_assembly(contest_name):
	    """Return True when *contest_name* ends with ' Assembly'.

	    Only the suffix is tested; the text in front of it (expected to be an
	    ordinal such as '25th') is not inspected here.
	    """
	    return contest_name.endswith(' Assembly')

	def is_contest_state_senate(contest_name):
	    """Return True when *contest_name* ends with ' State Senate'.

	    Only the suffix is tested; the text in front of it (expected to be an
	    ordinal such as '15th') is not inspected here.
	    """
	    return contest_name.endswith(' State Senate')

	def get_office(contest_name):
	    """Map a contest title from the XML to its standardized office name.

	    Returns 'President', 'U.S. House', 'State Assembly' or 'State Senate'.
	    Any other contest yields '' (no exception is raised for it).
	    """
	    detectors = (
	        (is_contest_president, 'President'),
	        (is_contest_congress_house, 'U.S. House'),
	        (is_contest_state_assembly, 'State Assembly'),
	        (is_contest_state_senate, 'State Senate'),
	    )
	    for matches, office_name in detectors:
	        if matches(contest_name):
	            return office_name
	    # No other races are supported.
	    return ''

	def get_district(contest_name, office):
	    """Return the district number for a district-based contest, else ''.

	    Args:
	        contest_name: contest title from the XML, e.g. '17th Congressional'.
	        office: standardized office name ('U.S. House', 'State Assembly',
	            'State Senate', or anything else for a non-district office).

	    Returns:
	        The district as an int when *office* is district-based and a number
	        can be read from the word in front of the matching title suffix;
	        '' in every other case.
	    """
	    suffix_by_office = {
	        'U.S. House': ' Congressional',
	        'State Assembly': ' Assembly',
	        'State Senate': ' State Senate',
	    }
	    suffix = suffix_by_office.get(office)
	    if suffix is None or not contest_name.endswith(suffix):
	        return ''
	    words = contest_name[:-len(suffix)].split()
	    if not words:
	        return ''
	    ordinal = words[-1]
	    if ordinal[-2:].lower() in ('st', 'nd', 'rd', 'th'):
	        ordinal = ordinal[:-2]
	    # Best effort: a title without a readable number leaves the column blank
	    # instead of aborting the whole export.
	    return int(ordinal) if ordinal.isdigit() else ''

	def get_normalized_president_name(ticket_text):
	    """Return the candidate name used in other county CSV files for a presidential ticket.

	    *ticket_text* is the full 'PRESIDENT / VICE PRESIDENT' ticket string.
	    An Exception is raised for any ticket that is not in the table below.
	    """
	    names_by_ticket = {
	        'JOSEPH R. BIDEN / KAMALA D. HARRIS': 'Joe Biden',
	        'DONALD J. TRUMP / MICHAEL R. PENCE': 'Donald Trump',
	        'GLORIA LA RIVA / SUNIL FREEMAN': 'Gloria LaRiva',
	        'ROQUE "ROCKY" DE LA FUENTE GUERRA / KANYE OMARI WEST': 'Rocky de la Fuente Guerra',
	        'HOWIE HAWKINS / ANGELA NICOLE WALKER': 'Howie Hawkins',
	        'JO JORGENSEN / JEREMY "SPIKE" COHEN': 'Jo Jorgensen',
	        'BRIAN CARROLL / AMAR PATEL': 'Brian Carroll',
	        'JESSE VENTURA / CYNTHIA MCKINNEY': 'Jesse Ventura',
	        'MARK CHARLES / ADRIAN WALLACE': 'Mark Charles',
	        'JOSEPH KISHORE / NORISSA SANTA CRUZ': 'Joseph Kishore',
	        'BROCK PIERCE / KARLA BALLARD': 'Brock Pierce',
	    }
	    normalized = names_by_ticket.get(ticket_text)
	    if normalized is None:
	        raise Exception('Unknown ticket: ' + ticket_text)
	    return normalized

	def get_normalized_congress_house_name(ticket_text):
	    """Return the candidate name used in other county CSV files for a U.S. House candidate.

	    *ticket_text* is the upper-case candidate name from the XML.  An
	    Exception is raised for any name that is not in the table below.
	    """
	    names_by_ticket = {
	        'RO KHANNA': 'Ro Khanna',
	        'RITESH TANDON': 'Ritesh Tandon',
	        'ANNA G. ESHOO': 'Anna G. Eshoo',
	        'RISHI KUMAR': 'Rishi Kumar',
	        'ZOE LOFGREN': 'Zoe Lofgren',
	        'JUSTIN JAMES AGUILERA': 'Justin James Aguilera',
	        'JIMMY PANETTA': 'Jimmy Panetta',
	        'JEFF GORMAN': 'Jeff Gorman',
	    }
	    normalized = names_by_ticket.get(ticket_text)
	    if normalized is None:
	        raise Exception('Unknown ticket: ' + ticket_text)
	    return normalized

	def _turnout_rows(county, precincts, office, attrib_name):
	    """Build one candidate-less summary row per precinct from *attrib_name*."""
	    return [
	        {
	            'county': county,
	            'precinct': precinct.attrib['name'],
	            'office': office,
	            'district': '',
	            'party': '',
	            'candidate': '',
	            # Kept as the raw attribute text; these totals are not broken
	            # down by election_day / mail in the XML input file.
	            'votes': precinct.attrib[attrib_name],
	            'election_day': '',
	            'mail': '',
	        }
	        for precinct in precincts
	    ]


	def _add_vote_type(rows_by_precinct, vote_type, column, row_defaults):
	    """Add the per-precinct counts of one vote type to *rows_by_precinct*.

	    Rows are created on demand as copies of *row_defaults*.  Each count is
	    added to *column* ('election_day' or 'mail') and to the 'votes' total.
	    """
	    for precinct in vote_type:
	        if DEBUG:
	            print(precinct.tag, precinct.attrib)
	        precinct_name = precinct.attrib['name']
	        row = rows_by_precinct.get(precinct_name)
	        if row is None:
	            row = dict(row_defaults, precinct=precinct_name)
	            rows_by_precinct[precinct_name] = row
	        # A missing votes attribute counts as zero.
	        count = int(precinct.attrib.get('votes', 0))
	        # Accumulate (rather than overwrite) so this column always agrees
	        # with 'votes', even if a precinct is listed more than once.
	        row[column] += count
	        row['votes'] += count


	def parseXML(xmlfile):
	    """Parse the county detail XML into a list of per-precinct result rows.

	    The traversal is positional and therefore tied to this input file's
	    layout: child 4 of the root is the turnout element, whose first child
	    holds the precinct elements ('name', 'totalVoters', 'ballotsCast');
	    children 5 onward are contests; within a contest, children 2 onward are
	    the choices; and the first two children of a choice are read as the
	    Election Day and Vote By Mail vote types respectively.

	    Args:
	        xmlfile: path or file object accepted by ElementTree.parse.

	    Returns:
	        A list of dicts with the keys county, precinct, office, district,
	        party, candidate, votes, election_day and mail: first the
	        'Registered Voters' rows, then the 'Ballots Cast' rows, then one
	        row per precinct for every choice of every supported contest.
	    """
	    root = ET.parse(xmlfile).getroot()

	    # hard-coded.
	    county = 'Santa Clara'

	    precinct_vote_items = []

	    # The slice (instead of root[4]) simply yields nothing when the
	    # document has fewer than five children.
	    for voter_turnout in root[4:5]:
	        precincts = voter_turnout[0]
	        precinct_vote_items.extend(
	            _turnout_rows(county, precincts, 'Registered Voters', 'totalVoters'))
	        precinct_vote_items.extend(
	            _turnout_rows(county, precincts, 'Ballots Cast', 'ballotsCast'))

	    for contest in root[5:]:
	        contest_name = contest.attrib['text']
	        office = get_office(contest_name)
	        if not office:
	            # Not one of the supported offices.
	            continue
	        if DEBUG:
	            print('contest name: ' + contest_name)
	        # The district depends only on the contest, so resolve it once here
	        # instead of once per precinct row.
	        district = get_district(contest_name, office)

	        for choice in contest[2:]:
	            # Tolerate choices without a party or text attribute.
	            party = choice.attrib.get('party', '')
	            ticket = choice.attrib.get('text', '')
	            if office == 'President':
	                candidate = get_normalized_president_name(ticket)
	            elif office == 'U.S. House':
	                candidate = get_normalized_congress_house_name(ticket)
	            else:
	                candidate = ticket

	            row_defaults = {
	                'county': county,
	                'precinct': '',
	                'office': office,
	                'district': district,
	                'party': party,
	                'candidate': candidate,
	                'votes': 0,
	                'election_day': 0,
	                'mail': 0,
	            }
	            # Keyed on precinct name; each value becomes one CSV row for
	            # this candidate.
	            rows_by_precinct = {}

	            if DEBUG:
	                print(choice.tag, choice.attrib)
	            for index, column in enumerate(('election_day', 'mail')):
	                vote_type = choice[index]
	                if DEBUG:
	                    print(vote_type.tag, vote_type.attrib)
	                _add_vote_type(rows_by_precinct, vote_type, column, row_defaults)

	            for row in rows_by_precinct.values():
	                if DEBUG:
	                    print(json.dumps(row, indent=4, sort_keys=True))
	                precinct_vote_items.append(row)

	    return precinct_vote_items

	def savetoCSV(newsitems, filename):
	    """Write the result rows to *filename* as CSV, preceded by a header line.

	    Args:
	        newsitems: iterable of row dicts keyed by the column names listed
	            in ``fields`` below.
	        filename: path of the CSV file to create or overwrite.
	    """
	    fields = [
	        'county',
	        'precinct',
	        'office',
	        'district',
	        'party',
	        'candidate',
	        'votes',
	        'election_day',
	        'mail',
	    ]

	    # Python module docs: https://docs.python.org/3/library/csv.html
	    # The csv module asks for newline='' so that its own line terminators
	    # are written untranslated (otherwise blank lines appear on Windows).
	    # The explicit encoding keeps the output independent of the locale.
	    with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
	        writer = csv.DictWriter(csvfile, fieldnames=fields, quoting=csv.QUOTE_MINIMAL)
	        writer.writeheader()
	        writer.writerows(newsitems)


	def main(filename_in, filename_out):
	    """Convert the county XML at *filename_in* into a CSV file at *filename_out*."""
	    savetoCSV(parseXML(filename_in), filename_out)

	# Run with CLI:
	# $ python3 ./read_county_xml.py input.xml ./2020/20201103__ca__general__santa_clara__precinct.csv
	if __name__ == "__main__":
	    if len(sys.argv) < 3:
	        # sys.exit with a string writes it to stderr and exits with status 1,
	        # so shells and scripts can tell that no conversion was run.
	        sys.exit('bad arguments\nusage: python3 read_county_xml.py INPUT_XML OUTPUT_CSV')
	    main(sys.argv[1], sys.argv[2])