@myersjustinc
Created November 12, 2012 19:30
AP results data retrieval and processing

We used these two scripts at the PBS NewsHour to update the live election-night results in our Vote 2012 Map Center with data from the Associated Press.

I've removed the NewsHour-specific credentials, of course, as well as the part that uploaded JSON files to a PBS FTP server, so the scripts haven't been tested in exactly the form provided here. In short: retrieve_data.py periodically downloaded the AP data files from AP's FTP server to an internal HTTP server for our use, and update_json.py processed the files from that HTTP server into JSONP and pushed them to S3 for consumption by our users.
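Both scripts log over TCP to logging.handlers.DEFAULT_TCP_LOGGING_PORT on localhost rather than to local files, so something needs to be listening there if you run them. Here's a minimal sketch of a compatible receiver, adapted from the socket-based receiver example in the Python logging documentation; the file name and console format below are illustrative choices, not taken from our actual setup:

# log_receiver.py: minimal TCP log receiver sketch, adapted from the
# socket-based receiver in the Python 2 logging documentation.
import cPickle
import logging
import logging.handlers
import SocketServer
import struct


class LogRecordStreamHandler(SocketServer.StreamRequestHandler):
    def handle(self):
        # Each record arrives as a 4-byte big-endian length prefix followed by
        # a pickled dict of LogRecord attributes.
        while True:
            chunk = self.connection.recv(4)
            if len(chunk) < 4:
                break
            slen = struct.unpack('>L', chunk)[0]
            data = self.connection.recv(slen)
            while len(data) < slen:
                data += self.connection.recv(slen - len(data))
            record = logging.makeLogRecord(cPickle.loads(data))
            logging.getLogger(record.name).handle(record)


if __name__ == '__main__':
    logging.basicConfig(
        format="%(asctime)s %(levelname)-8s %(message)s", level=logging.DEBUG)
    receiver = SocketServer.ThreadingTCPServer(
        ('localhost', logging.handlers.DEFAULT_TCP_LOGGING_PORT),
        LogRecordStreamHandler)
    receiver.serve_forever()

Cron kicked off retrieve_data.py and update_json.py on their own schedules; the lock files in each script keep overlapping runs from stepping on each other.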

retrieve_data.py:

#!/usr/bin/env python
import errno
import ftplib
import logging
from logging.handlers import DEFAULT_TCP_LOGGING_PORT, SocketHandler
import os
import signal
import sys
FILES_TO_GET = [
"Pres_Reports/flat/pres_electoral.txt",
"AL/flat/AL.txt",
"AK/flat/AK.txt",
"AZ/flat/AZ.txt",
"AR/flat/AR.txt",
"CA/flat/CA.txt",
"CO/flat/CO.txt",
"CT/flat/CT.txt",
"DE/flat/DE.txt",
"DC/flat/DC.txt",
"FL/flat/FL.txt",
"GA/flat/GA.txt",
"HI/flat/HI.txt",
"ID/flat/ID.txt",
"IL/flat/IL.txt",
"IN/flat/IN.txt",
"IA/flat/IA.txt",
"KS/flat/KS.txt",
"KY/flat/KY.txt",
"LA/flat/LA.txt",
"ME/flat/ME.txt",
"MD/flat/MD.txt",
"MA/flat/MA.txt",
"MI/flat/MI.txt",
"MN/flat/MN.txt",
"MS/flat/MS.txt",
"MO/flat/MO.txt",
"MT/flat/MT.txt",
"NE/flat/NE.txt",
"NV/flat/NV.txt",
"NH/flat/NH.txt",
"NJ/flat/NJ.txt",
"NM/flat/NM.txt",
"NY/flat/NY.txt",
"NC/flat/NC.txt",
"ND/flat/ND.txt",
"OH/flat/OH.txt",
"OK/flat/OK.txt",
"OR/flat/OR.txt",
"PA/flat/PA.txt",
"RI/flat/RI.txt",
"SC/flat/SC.txt",
"SD/flat/SD.txt",
"TN/flat/TN.txt",
"TX/flat/TX.txt",
"UT/flat/UT.txt",
"VT/flat/VT.txt",
"VA/flat/VA.txt",
"WA/flat/WA.txt",
"WV/flat/WV.txt",
"WI/flat/WI.txt",
"WY/flat/WY.txt",
]
FTP_HOSTNAME = "electionsonline.ap.org"
# Backup hostname follows:
# FTP_HOSTNAME = "electionsonline2.ap.org"
FTP_USERNAME = "" # Get your own from AP!
FTP_PASSWORD = ""
REMOTE_BASE = "/"
# Set LOCAL_BASE to the directory where the retrieved AP data files should go.
LOCAL_BASE = os.path.join(os.path.dirname(__file__), 'raw')
# Set LOG_BASE to the directory where the lock files (which keep track of other
# copies of this script also running) should go. (Log files also used to go
# here, hence the name.)
LOG_BASE = os.path.join(os.path.dirname(__file__), 'log')
LOCK_FILE_PATH = os.path.join(LOG_BASE, ".ftp.lock")
UNLOCK_WAIT_PATH = os.path.join(LOG_BASE, ".ftp.lock.count")
UNLOCK_WAIT_COUNT = 5
# Set up logging system. Everything's getting logged to an external server.
rootLogger = logging.getLogger('')
rootLogger.setLevel(logging.INFO)
socketHandler = SocketHandler('localhost', DEFAULT_TCP_LOGGING_PORT)
rootLogger.addHandler(socketHandler)
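# NOTE: something has to be listening on that port (see the receiver sketch in
# the description above); if nothing is, the SocketHandler quietly drops each
# record and the script carries on.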
def get_ap_files():
logging.info("Starting retrieval job.")
# Open one connection to the AP FTP server. We'll reuse this connection for
# all files in this run a) to follow AP's instructions on the matter and b)
# to avoid the extra time and connection overhead of disconnecting and
# reconnecting for every file.
_ftp = ftplib.FTP(FTP_HOSTNAME, FTP_USERNAME, FTP_PASSWORD)
# Retrieve all of the files listed in FILES_TO_GET.
for filename in FILES_TO_GET:
local_path = os.path.join(LOCAL_BASE, filename)
try:
os.makedirs(os.path.dirname(local_path))
except OSError:
pass
local_file = open(local_path, 'wb')
try:
_ftp.retrbinary(
'RETR %s%s' % (REMOTE_BASE, filename),
local_file.write)
logging.debug("Successfully retrieved %s" % filename)
except ftplib.all_errors, e:
logging.error(
"Error when retrieving %s: %s" % (
filename, e.message))
local_file.close()
# Once we're all done, close the connection to the FTP server.
_ftp.close()
logging.info("Complete.")
# Thanks: http://code.activestate.com/recipes/
# 578022-wait-for-pid-and-check-for-pid-existance-posix/
def pid_exists(pid):
"""Check whether pid exists in the current process table."""
if pid < 0:
return False
try:
os.kill(pid, 0)
except OSError, e:
return e.errno == errno.EPERM
else:
return True
if __name__ == '__main__':
# Run the retrieval function, of course, but let's use a lock file to make
# sure we aren't already running. (If the FTP server's being a bit slow,
# the solution probably isn't to inadvertently have several instances of
# our script hitting it simultaneously just because cron called it again.)
#
# Lock file implementation modified from here:
# http://shoaibmir.wordpress.com/2009/12/14/pid-lock-file-in-python/
if os.access(LOCK_FILE_PATH, os.F_OK):
# The lock file exists. It should contain the process ID number, so
# let's read that...
lock_file = open(LOCK_FILE_PATH, 'r')
lock_file.seek(0)
old_pid = lock_file.readline()
lock_file.close()
# ...and see if a process with that ID exists. If so, exit with an
# error message. If not, remove the lock file and continue on our
# merry way.
if pid_exists(int(old_pid)):
# This situation causes some weird errors of its own, since we'd end up
# with two processes writing to the same file. We should do something
# about this.
logging.error(
"This script is already running with process ID " +
old_pid + ".")
# Before we exit, let's see how many times this has happened.
unlock_count = 0
try:
unlock_file = open(UNLOCK_WAIT_PATH, 'r')
unlock_file.seek(0)
unlock_count = int(unlock_file.readline())
unlock_file.close()
except (IOError, ValueError):
pass
# If we've had this problem fewer than UNLOCK_WAIT_COUNT times,
# increase the counter and get out of here. If not, kill the
# offending process, reset the counter, remove the lock file and
# get out of here anyway to avoid getting off schedule.
if unlock_count < UNLOCK_WAIT_COUNT:
logging.error("On wait %s of %s before killing. Exiting." % (
unlock_count, UNLOCK_WAIT_COUNT))
unlock_file = open(UNLOCK_WAIT_PATH, 'w')
unlock_file.write(str(unlock_count + 1))
unlock_file.close()
sys.exit(1)
else:
logging.error("Done waiting. Killing old process.")
os.kill(int(old_pid), signal.SIGKILL)
unlock_file = open(UNLOCK_WAIT_PATH, 'w')
unlock_file.write(str(0))
unlock_file.close()
sys.exit(2)
else:
logging.debug(
"Lock file exists, but process " + old_pid +
" doesn't exist. Removing.")
os.remove(LOCK_FILE_PATH)
# Now that we've checked whether the lock file exists (and removed it if
# it's safe to do so), let's create a new lock file...
lock_file = open(LOCK_FILE_PATH, 'w')
lock_file.write(str(os.getpid()))
lock_file.close()
# ...reset the unlock counter...
unlock_file = open(UNLOCK_WAIT_PATH, 'w')
unlock_file.write(str(0))
unlock_file.close()
# ...and run the retrieval function.
get_ap_files()
# When we're done, remove the lock file.
os.remove(LOCK_FILE_PATH)
update_json.py:

#!/usr/bin/env python
from boto.s3.connection import S3Connection
from boto.s3.key import Key
from datetime import date, datetime
import errno
from io import StringIO
from itertools import izip
import json
import logging
from logging.handlers import DEFAULT_TCP_LOGGING_PORT, SocketHandler
from operator import attrgetter
import os
from pytz import timezone
import requests
import signal
import sys
from time import sleep
import zipfile
# Configuration options
RACE_NAMES_TO_INCLUDE = (
"President",
"U.S. Senate",
"U.S. House - District 1",
"U.S. House - District 2",
"U.S. House - District 3",
"U.S. House - District 4",
"U.S. House - District 5",
"U.S. House - District 6",
"U.S. House - District 7",
"U.S. House - District 8",
"U.S. House - District 9",
"U.S. House - District 10",
"U.S. House - District 11",
"U.S. House - District 12",
"U.S. House - District 13",
"U.S. House - District 14",
"U.S. House - District 15",
"U.S. House - District 16",
"U.S. House - District 17",
"U.S. House - District 18",
"U.S. House - District 19",
"U.S. House - District 20",
"U.S. House - District 21",
"U.S. House - District 22",
"U.S. House - District 23",
"U.S. House - District 24",
"U.S. House - District 25",
"U.S. House - District 26",
"U.S. House - District 27",
"U.S. House - District 28",
"U.S. House - District 29",
"U.S. House - District 30",
"U.S. House - District 31",
"U.S. House - District 32",
"U.S. House - District 33",
"U.S. House - District 34",
"U.S. House - District 35",
"U.S. House - District 36",
"U.S. House - District 37",
"U.S. House - District 38",
"U.S. House - District 39",
"U.S. House - District 40",
"U.S. House - District 41",
"U.S. House - District 42",
"U.S. House - District 43",
"U.S. House - District 44",
"U.S. House - District 45",
"U.S. House - District 46",
"U.S. House - District 47",
"U.S. House - District 48",
"U.S. House - District 49",
"U.S. House - District 50",
"U.S. House - District 51",
"U.S. House - District 52",
"U.S. House - District 53",
"Governor",
"Lieutenant Governor",
"Attorney General",
"Sheriff - Maricopa"
"U.S. Delegate",
"Treasurer",
"Auditor",
"Auditor General",
"Secretary of State",
"U.S. Senate - (2006)",
"U.S. Senate - 2012",
"Referendum - 74 - Same-Sex Marriage", # Washington
"Question - 6 - Allow Same Sex Marriage", # Maryland
"Question - 1 - Yes Same Sex Mrg", # Maine
"Amendment - 1 - No Same Sex Marriage", # Minnesota
"Issue - 5 - Medical Marijuana", # Arkansas
"Amendment - 64 - Legalize Marijuana", # Colorado
"Question - 3 - Yes Medical Marijuana", # Massachusetts
"Referendum - 124 - Reform Medical Marijuana", # Montana
"Measure - 80 - Legalize Marijuana", # Oregon
"Initiative - 502 - Legalize Marijuana", # Washington
"Amendment - 6 - No Mandatory Health Care", # Alabama
"Amendment - 1 - No Mandatory Health Coverage", # Florida
"Proposition - E - Prohibit Health Exchange", # Missouri
"Referendum - 122 - No Mandatory Health Ins", # Montana
"Amendment - A - Health Services Rights", # Wyoming
)
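# NOTE: These names are matched against Race.race_name, which is the office
# name, seat name and race type/party joined with " - ", so they have to match
# the AP strings exactly, including district and measure numbers.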
TIME_ZONE_TO_USE = "America/New_York"
TIMESTAMP_FORMAT = "%Y%m%d_%H%M%S"
# If you're running parts of this script in such a way as to pull AP data from
# the local filesystem instead of a remote HTTP server, set
# FLAT_FILE_LOCAL_ROOT to the directory where those files are located.
FLAT_FILE_LOCAL_ROOT = ""
# If you're running this script in the normal way (pulling AP data from an HTTP
# server), though, set FLAT_FILE_REMOTE_ROOT to the HTTP path where those files
# are located.
FLAT_FILE_REMOTE_ROOT = ""
FLAT_FILE_LOCAL = FLAT_FILE_LOCAL_ROOT + "%(state)s/flat/%(state)s.txt"
FLAT_FILE_REMOTE = FLAT_FILE_REMOTE_ROOT + "%(state)s/flat/%(state)s.txt"
# If there's a problem processing the AP files, try again every RETRY_DELAY
# seconds up to RETRY_ATTEMPTS times.
RETRY_ATTEMPTS = 3
RETRY_DELAY = 1
# Copies of all of the raw AP files and the generated JSONP files are stored in
# zip files in case they come in handy later. These are stored at RAW_ZIP_PATH
# and JSONP_ZIP_PATH.
ZIP_ROOT = ""
RAW_ZIP_PATH = ZIP_ROOT + "raw.zip"
JSONP_ZIP_PATH = ZIP_ROOT + "jsonp.zip"
# Once the JSONP files are generated, they are uploaded to the S3 bucket
# specified in S3_UPLOAD_BUCKET (at the path in S3_UPLOAD_PATH) using the given
# Amazon access credentials.
AMAZON_ACCESS_KEY_ID = ""
AMAZON_SECRET_ACCESS_KEY = ""
S3_UPLOAD_BUCKET = ""
S3_UPLOAD_PATH = ""
LOG_BASE = ZIP_ROOT
LOCK_FILE_PATH = LOG_BASE + ".update.lock"
UNLOCK_WAIT_PATH = LOG_BASE + ".update.lock.count"
UNLOCK_WAIT_COUNT = 3
# Set up logging system. Everything's getting logged to an external server.
rootLogger = logging.getLogger('')
rootLogger.setLevel(logging.INFO)
socketHandler = SocketHandler('localhost', DEFAULT_TCP_LOGGING_PORT)
rootLogger.addHandler(socketHandler)
# Hide INFO-level warnings from python-requests
requests_log = logging.getLogger("requests")
requests_log.setLevel(logging.WARNING)
# Useful constants
STATE_TO_FIPS = {
'AL': '01', 'AK': '02', 'AZ': '04', 'AR': '05', 'CA': '06', 'CO': '08',
'CT': '09', 'DE': '10', 'DC': '11', 'FL': '12', 'GA': '13', 'HI': '15',
'ID': '16', 'IL': '17', 'IN': '18', 'IA': '19', 'KS': '20', 'KY': '21',
'LA': '22', 'ME': '23', 'MD': '24', 'MA': '25', 'MI': '26', 'MN': '27',
'MS': '28', 'MO': '29', 'MT': '30', 'NE': '31', 'NV': '32', 'NH': '33',
'NJ': '34', 'NM': '35', 'NY': '36', 'NC': '37', 'ND': '38', 'OH': '39',
'OK': '40', 'OR': '41', 'PA': '42', 'RI': '44', 'SC': '45', 'SD': '46',
'TN': '47', 'TX': '48', 'UT': '49', 'VT': '50', 'VA': '51', 'WA': '53',
'WV': '54', 'WI': '55', 'WY': '56'
}
USPS_TO_STATE = {
'AL': 'Alabama', 'AK': 'Alaska', 'AZ': 'Arizona', 'AR': 'Arkansas',
'CA': 'California', 'CO': 'Colorado', 'CT': 'Connecticut',
'DE': 'Delaware', 'DC': 'District of Columbia', 'FL': 'Florida',
'GA': 'Georgia', 'HI': 'Hawaii', 'ID': 'Idaho', 'IL': 'Illinois',
'IN': 'Indiana', 'IA': 'Iowa', 'KS': 'Kansas', 'KY': 'Kentucky',
'LA': 'Louisiana', 'ME': 'Maine', 'MD': 'Maryland', 'MA': 'Massachusetts',
'MI': 'Michigan', 'MN': 'Minnesota', 'MS': 'Mississippi', 'MO': 'Missouri',
'MT': 'Montana', 'NE': 'Nebraska', 'NV': 'Nevada', 'NH': 'New Hampshire',
'NJ': 'New Jersey', 'NM': 'New Mexico', 'NY': 'New York',
'NC': 'North Carolina', 'ND': 'North Dakota', 'OH': 'Ohio',
'OK': 'Oklahoma', 'OR': 'Oregon', 'PA': 'Pennsylvania',
'RI': 'Rhode Island', 'SC': 'South Carolina', 'SD': 'South Dakota',
'TN': 'Tennessee', 'TX': 'Texas', 'UT': 'Utah', 'VT': 'Vermont',
'VA': 'Virginia', 'WA': 'Washington', 'WV': 'West Virginia',
'WI': 'Wisconsin', 'WY': 'Wyoming'
}
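# NOTE: PARTY_CHOICES, OFFICE_CHOICES and RACE_TYPE_CHOICES below aren't
# referenced anywhere else in this script; they appear to be lookup tables
# transcribed from the AP documentation and kept for reference.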
PARTY_CHOICES = {
'ACP': "A Connecticut Party",
'AFC': "American First Coalition",
'AHP': "American Heritage Party",
'AIP': "American Independent Pty",
'AKI': "Alaskan Independence",
'AmC': "American Constitution",
'AmD': "American Dream",
'AmF': "America First",
'Amr': "American",
'BLD': "Builders Party",
'BSc': "Better Schools",
'Bst': "Best",
'Buc': "Buchanan Reform",
'CC': "Concerned Citizens",
'CEN': "Centrist Party",
'CF': "Citizens First",
'CM': "Cool Moose",
'CNT': "Camp. for a New Tomorrow",
'CST': "Constitution",
'CTL': "CT for Lieberman",
'CnP': "Concerns of People",
'Cnl': "Constitutional",
'Con': "Conservative",
'Crk': "Politicians are Crooks",
'DCG': "DC Statehood Green Party",
'Dem': "Democrat",
'ERP': "Economic Recovery Party",
'ESL': "End Suffolk Legislature",
'FAA': "Fair",
'FEP': "Free Energy Party",
'FIn': "Fusion Independent",
'FNP': "Future Now Party",
'FSB': "Farmers & Small Business",
'FSO': "Freedom Socialist",
'FVP': "Family Values Party",
'FrU': "Friends United",
'Fre': "Freedom",
'GCP': "Green Coalition Party",
'GNW': "Greens No To War",
'GOP': "Republican",
'GRP': "Grass Roots Party",
'Grn': "Green",
'HCP': "Healthcare Party",
'HP': "Home Protection",
'HWP': "Harold Washington Party",
'IAP': "Independent American",
'ICP': "Ind. Christian Profile",
'IER': "Ind. for Econ. Recovery",
'IF': "Independent Fusion",
'IG': "Independent Grassroots",
'INT': "Integrity Party",
'IP': "Independent Party",
'IPC': "Independent Peoples Coal",
'IPU': "Independent Party of UT",
'IPr': "Independent-Progressive",
'ISC': "Ind. Save Our Children",
'IVP': "Independent Voters",
'Ind': "Independent",
'Inp': "Independence",
'JP': "Justice Party",
'JPR': "Jobs Property Rights",
'LBP': "Looking Back Party",
'LFM': "Labor and Farm",
'LIF': "Long Island First",
'LMj': "Legalize Marijuana",
'LUP': "Liberty Union/Progressiv",
'LUn': "Liberty Union",
'Lbl': "Liberal",
'Lib': "Libertarian",
'MCA': "Moderate Citizens Accnt.",
'MJP': "Marijuana Party",
'MML': "Make Marijuana Legal",
'MRP': "Marijuana Reform Party",
'MTp': "Mississippi Taxpayers",
'Mnt': "Mountain Party",
'NAL': "New Alliance",
'NHT': "No Home Heat Tax",
'NJC': "New Jersey Conservative",
'NJI': "New Jersey Independents",
'NLP': "Natural Law Party",
'NNT': "No New Taxes",
'NP': "Non-Partisan",
'NPD': "No Party Designation",
'NPP': "New Progressive Party",
'NRA': "Nutritional Rights Allnc",
'Neb': "Nebraska",
'NtL': "National Labor Party",
'NwP': "New Perspective",
'OTE': "128 District",
'OTx': "Orange Taxpayers",
'One': "One Earth",
'Opn': "Open",
'Oth': "Other",
'PAG': "Pacific Green",
'PBl': "Pro-Bethel",
'PCH': "Personal Choice",
'PEC': "Petitioning Candidate",
'PET': "Party of Ethics & Tradit",
'PFP': "Peace and Freedom",
'PJP': "Peace and Justice",
'PLC': "Pro Life Conservative",
'PRT': "Preserve Our Town",
'PTC': "Property Tax Cut",
'PV': "People of Vermont",
'PWF': "Protect Working Families",
'Pac': "Pacific",
'Pat': "Patriot Party",
'Pop': "Populist",
'PrF': "Protecting Freedom",
'PrL': "Pro Life",
'Prg': "Progressive",
'Pro': "Prohibition",
'RFC': "Randolph for Congress",
'RJF': "Restore Justice-Freedom",
'RM': "Reform Minnesota",
'ROP': "Running on Principles",
'RP': "Reform Party",
'RTL': "Right to Life",
'Res': "Resource Party",
'RpM': "Republican Moderate",
'SEP': "Socialist Equality",
'SM': "Save Medicare",
'SPU': "Socialist USA",
'SSS': "Save Social Security",
'STC': "Star Tax Cut",
'SWP': "Socialist Workers Party",
'ScC': "School Choice",
'Sen': "Save Seniors",
'Soc': "Socialist",
'StF': "Student First",
'Sta': "Statehood Party",
'TAP': "The American Party",
'TBL': "The Better Life",
'TC': "Tax Cut",
'TCN': "Tax Cut Now",
'TGo': "The Go",
'TLL': "Truth Life Liberty",
'TLm': "Term Limits",
'TS': "Timesizing",
'TTP': "The 3rd Party",
'UAd': "United Advocacy",
'UCz': "United Citizen",
'UST': "U.S. Taxpayers Party",
'Una': "Unaffiliated",
'Und': "Undauntable Stalwart All",
'Unr': "Unenrolled",
'Uty': "Unity",
'VET': "Veterans Party",
'VG': "Vermont Grassroots",
'VOP': "Voice of the People",
'VRP': "Voters Rights Party",
'WCP': "Working Class Party",
'WF': "Working Families",
'WSN': "West Side Neighbors",
'WTP': "We the People",
'WV': "Workers for Vermont",
'WW': "Workers World",
'WiT': "Wisconsin Taxpayers Pty"
}
OFFICE_CHOICES = {
'P': "President",
'S': "U.S. Senate",
'H': "U.S. House",
'G': "Governor",
'A': "Attorney General",
'C': "Controller/Comptroller",
'D': "District Attorney",
'E': "Education Commissioner",
'F': "Superintendent of Public Instruction",
'I': (
"Amendment/Initiative/Proposal/Proposition/Question/Referendum/"
"Ballot Measure"),
'J': "Insurance Commissioner",
'L': "Lieutenant Governor",
'M': "Mayor",
'N': "City Council",
'R': "Secretary of State",
'T': "Treasurer",
'Y': "State House/State Assembly/General Assembly/House of Delegates",
'Z': "State Senate",
# NOTE: Page 121 of the APEO docs says, "The lower-on-the-ticket office IDs
# vary too greatly from state to state to list."
}
RACE_TYPE_CHOICES = {
'D': "Democratic primary",
'R': "Republican primary",
'G': "General election",
'E': "Democratic caucus",
'S': "Republican caucus",
# NOTE: Page 18 of the APEO docs cites "other characters used, depending
# on the state and local races."
}
# Classes of object to store raw AP data
class Race(object):
def __init__(
self, test, election_date, state_postal, county_number, fips_code,
county_name, race_number, office_id, race_type_id, seat_number,
office_name, seat_name, race_type_party, race_type,
office_description, number_of_winners, number_in_runoff,
precincts_reporting, total_precincts):
if test == 'l' or test is False:
self.test = False
else:
self.test = True
if isinstance(election_date, date):
self.election_date = election_date
else:
self.election_date = date(
*[int(x) for x in election_date.split('-')])
self.state_postal = state_postal
self.county_number = county_number
# While we're here, let's also create the area ID we'll use more
# frequently. This is just the county FIPS code if applicable or the
# state's common English name if we're looking at statewide results.
if fips_code == '0':
self.fips_code = STATE_TO_FIPS[state_postal.upper()] + '000'
self.area_id = USPS_TO_STATE[state_postal]
else:
# This is the county's FIPS code, even for New England states where
# the results are by city/town.
self.fips_code = fips_code.rjust(5, '0')
self.area_id = self.fips_code
self.county_name = county_name
self.race_number = race_number
self.office_id = office_id
self.race_type_id = race_type_id
self.seat_number = seat_number
self.office_name = office_name
self.seat_name = seat_name
self.race_type_party = race_type_party
self.race_type = race_type
self.office_description = office_description
self.number_of_winners = int(number_of_winners)
self.number_in_runoff = int(number_in_runoff)
self.precincts_reporting = int(precincts_reporting)
self.total_precincts = int(total_precincts)
# Create stub relation.
self._candidates = {}
# Create standardized race name.
race_name_components = [self.office_name]
if self.seat_name:
race_name_components.append(self.seat_name)
if self.race_type_party:
race_name_components.append(self.race_type_party)
self.race_name = ' - '.join(race_name_components)
def __repr__(self):
return "Race(%s)" % ", ".join([repr(x) for x in [
self.test, self.election_date, self.state_postal,
self.county_number, self.fips_code, self.county_name,
self.race_number, self.office_id, self.race_type_id,
self.seat_number, self.office_name, self.seat_name,
self.race_type_party, self.race_type, self.office_description,
self.number_of_winners, self.number_in_runoff,
self.precincts_reporting, self.total_precincts]])
def __str__(self):
return "<Race (%s): %s in %s on %s--%s/%s reporting>" % (
("test" if self.test else "live"), self.seat_name,
self.county_name, self.election_date.strftime("%m/%d/%Y"),
self.precincts_reporting, self.total_precincts)
def get_winner(self):
for candidate in self._candidates.values():
if candidate.winner:
return candidate
def get_total_votes(self):
total_votes = 0
for candidate in self._candidates.values():
total_votes += candidate.vote_count
return total_votes
def add_candidate(self, candidate):
self._candidates[candidate.candidate_number] = candidate
return candidate
def get_candidate(self, candidate_number):
return self._candidates[candidate_number]
def get_candidates(self, sort_result=False):
candidates = self._candidates.values()
if sort_result:
sorted_results = sorted(
candidates,
key=attrgetter('vote_count', 'last_name'),
reverse=True)
if sorted_results and sorted_results[0].vote_count == 0:
sorted_results = []
for candidate in candidates:
if candidate.party == 'Dem' or candidate.party == 'GOP':
sorted_results.append(candidate)
for candidate in candidates:
if candidate.party != 'Dem' and candidate.party != 'GOP':
sorted_results.append(candidate)
return sorted_results
else:
return candidates
class Candidate(object):
def __init__(
self, candidate_number, order, party, first_name, middle_name,
last_name, junior, use_junior, incumbent, vote_count, winner,
npid):
self.candidate_number = candidate_number
self.order = int(order)
self.party = party
self.first_name = first_name
self.middle_name = middle_name
self.last_name = last_name
self.junior = junior
if use_junior == '1' or use_junior is True:
self.use_junior = True
else:
self.use_junior = False
if incumbent == '1' or incumbent is True:
self.incumbent = True
else:
self.incumbent = False
self.vote_count = int(vote_count)
if winner == 'X' or winner is True:
self.winner = True
else:
self.winner = False
self.npid = npid
# Create stub relation.
self.race = None
def __repr__(self):
return "Candidate(%s)" % ", ".join([repr(x) for x in [
self.candidate_number, self.order, self.party, self.first_name,
self.middle_name, self.last_name, self.junior, self.use_junior,
self.incumbent, self.vote_count, self.winner, self.npid]])
def __str__(self):
return "<Candidate: %s (%s%s)>" % (
self.get_name(), self.party, (" (won)" if self.winner else ""))
def get_name(self):
name_parts = [self.first_name]
if self.middle_name:
name_parts.append(self.middle_name)
name_parts.append(self.last_name)
if self.use_junior:
name_parts.append(self.junior)
return ' '.join(name_parts).strip()
def get_vote_percent(self):
if self.race:
return 100 * float(self.vote_count) / self.race.get_total_votes()
else:
raise ValueError("Candidate has no race")
_electoral_states = {}
class ElectoralState(object):
def __init__(self, state_postal):
self.state_postal = state_postal
if state_postal in USPS_TO_STATE:
self.state_name = USPS_TO_STATE[state_postal]
else:
self.state_name = "United States"
self._candidates = {}
_electoral_states[state_postal] = self
def __repr__(self):
return "ElectoralState(%s)" % repr(self.state_postal)
def __str__(self):
return "<ElectoralState: %s>" % self.state_name
def get_winner(self):
for candidate in self._candidates.values():
if candidate.winner:
return candidate
def get_total_votes(self):
total_votes = 0
for candidate in self._candidates.values():
total_votes += candidate.vote_count
return total_votes
def add_candidate(self, candidate):
self._candidates[candidate.candidate_number] = candidate
return candidate
def get_candidate(self, candidate_number):
return self._candidates[candidate_number]
def get_candidates(self, sort_result=False):
candidates = self._candidates.values()
if sort_result:
sorted_results = sorted(
candidates,
key=attrgetter('electoral_votes', 'vote_count', 'last_name'),
reverse=True)
if sorted_results and sorted_results[0].vote_count == 0:
sorted_results = []
for candidate in candidates:
if (
candidate.last_name == 'Obama' or
candidate.last_name == 'Romney'):
sorted_results.append(candidate)
for candidate in candidates:
if (
candidate.last_name != 'Obama' and
candidate.last_name != 'Romney'):
sorted_results.append(candidate)
return sorted_results
else:
return candidates
class ElectoralCandidate(object):
def __init__(
self, test, election_date, state_postal, candidate_number, npid,
electoral_votes, vote_count, winner, precincts_reporting,
total_precincts, last_name, state_electoral_votes):
if test == 'l' or test is False:
self.test = False
else:
self.test = True
if isinstance(election_date, date):
self.election_date = election_date
else:
self.election_date = date(
*[int(x) for x in election_date.split('-')])
self.state_postal = state_postal
self.candidate_number = candidate_number
self.npid = npid
self.electoral_votes = int(electoral_votes)
self.vote_count = int(vote_count)
if winner == 'X' or winner is True:
self.winner = True
else:
self.winner = False
self.precincts_reporting = int(precincts_reporting)
self.total_precincts = int(total_precincts)
self.last_name = last_name
self.state_electoral_votes = int(state_electoral_votes)
if state_postal not in _electoral_states:
_electoral_states[state_postal] = ElectoralState(state_postal)
self.state = _electoral_states[state_postal]
self.state.add_candidate(self)
if (self.state_postal != 'US' and self.electoral_votes > 0 and
self.electoral_votes != self.state_electoral_votes):
logging.warning((
"%s has electoral vote mismatch for %s: %s allocated,"
"%s possible") % (
self.state_postal, self.last_name, self.electoral_votes,
self.state_electoral_votes))
def __repr__(self):
return "ElectoralCandidate(%s)" % ", ".join([repr(x) for x in [
self.test, self.election_date, self.state_postal,
self.candidate_number, self.npid, self.electoral_votes,
self.vote_count, self.winner, self.precincts_reporting,
self.total_precincts, self.last_name, self.state_electoral_votes]])
def __str__(self):
return "<ElectoralCandidate: %s%s>" % (
self.last_name, (" (won)" if self.winner else ""))
def get_vote_percent(self):
if self.state:
return 100 * float(self.vote_count) / self.state.get_total_votes()
else:
raise ValueError("Candidate has no state")
# AP data processing
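# Each line of an AP flat file is one semicolon-delimited record with a
# trailing semicolon: the first 19 fields describe the race (in the order of
# Race.__init__'s arguments above), followed by one 12-field group per
# candidate (in the order of Candidate.__init__'s arguments).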
def parse_row(row_text):
row_parts = row_text.split(';')[:-1]
race = Race(*row_parts[:19])
candidates = [
Candidate(*candidate_parts)
for candidate_parts in izip(*[iter(row_parts[19:])]*12)]
for candidate in candidates:
candidate.race = race
race.add_candidate(candidate)
return race
def parse_file(file_path):
# Get ready to read the file. If there's a newline in file_path, then we've
# been passed the actual contents instead of just the file's path, so we
# should create a StringIO object that we can handle as if it were a file.
# Otherwise, of course, just open the specified file for reading.
if '\n' in file_path:
input_file = StringIO()
input_file.write(file_path)
input_file.seek(0)
else:
input_file = open(file_path, 'r')
races = {}
race_names = {}
# Parse each of the records in the file.
for line in input_file:
race = parse_row(line)
if (RACE_NAMES_TO_INCLUDE and
race.race_name not in RACE_NAMES_TO_INCLUDE):
continue
if race.race_number not in races:
races[race.race_number] = {}
race_names[race.race_number] = race.race_name
if race.area_id not in races[race.race_number]:
races[race.race_number][race.area_id] = race
else:
# There are New England states where AP doesn't report county-level
# results because it instead reports by cities and towns. If we
# find that this race already has results stored for this county,
# we need to incorporate these new results into the existing ones.
# In practice, this just means adding the precincts numbers and
# the candidates' respective vote counts; we don't use the other
# information, for the most part.
old_race = races[race.race_number][race.area_id]
# Add this race's information to that of the existing race for
# this area.
old_race.precincts_reporting += race.precincts_reporting
old_race.total_precincts += race.total_precincts
# Add each of the candidates' vote totals to those already stored
# for this area.
for candidate in race.get_candidates():
try:
old_candidate = old_race.get_candidate(
candidate.candidate_number)
old_candidate.vote_count += candidate.vote_count
except KeyError:
race.add_candidate(candidate)
# Close the input file, whatever type it might be.
input_file.close()
return (races, race_names)
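# The Electoral College flat file uses the same semicolon-delimited format,
# but each line is a single 12-field record for one candidate in one state
# (in the order of ElectoralCandidate.__init__'s arguments above), with no
# separate race header.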
def parse_electoral_file(file_path):
# Get ready to read the file. If there's a newline in file_path, then we've
# been passed the actual contents instead of just the file's path, so we
# should create a StringIO object that we can handle as if it were a file.
# Otherwise, of course, just open the specified file for reading.
if '\n' in file_path:
input_file = StringIO()
input_file.write(unicode(file_path))
input_file.seek(0)
else:
input_file = open(file_path, 'r')
states = {}
# Parse each of the records in the file.
for line in input_file:
candidate = ElectoralCandidate(*line.split(';')[:-1])
if candidate.state.state_name not in states:
states[candidate.state.state_name] = candidate.state
# Close the input file, whatever type it might be.
input_file.close()
return states
# Turning AP files into Map Center objects
def create_results_dict(races, race_names, state, state_name, last_updated):
output_dict = {
"candidates": {},
"lastUpdated": [
last_updated.year, last_updated.month, last_updated.day,
last_updated.hour, last_updated.minute],
"parties": {},
"raceNames": race_names,
"races": {},
"test": False,
}
# Process the races themselves.
for race_number, race_areas in races.iteritems():
race_data = {
"areas": {},
"breakdown": [],
"precincts": [],
"winners": {},
}
for area_id, race in race_areas.iteritems():
area_data = {
"data": [],
"precincts": [race.precincts_reporting, race.total_precincts],
}
candidates = race.get_candidates(sort_result=True)
for candidate in candidates:
output_dict["candidates"][
candidate.candidate_number] = candidate.get_name()
output_dict["parties"][
candidate.candidate_number] = candidate.party
area_data["data"].append([
candidate.candidate_number, candidate.vote_count])
race_data["areas"][area_id] = area_data
race_winner = race.get_winner()
if race_winner is None:
race_data["winners"][area_id] = None
else:
race_data["winners"][area_id] = race_winner.candidate_number
if race.test:
output_dict["test"] = True
# Copy statewide numbers into breakdown and precincts for easier
# retrieval.
race_data["breakdown"] = race_data["areas"][state_name]["data"]
race_data["precincts"] = race_data["areas"][state_name]["precincts"]
output_dict["races"][race_number] = race_data
return output_dict
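# For reference, the per-state dict this builds (and that later gets wrapped
# in JSONP) is shaped roughly like this; the IDs and counts are illustrative:
#
# {
#     "candidates": {"1234": "Barack Obama", ...},
#     "parties": {"1234": "Dem", ...},
#     "raceNames": {"10000": "President", ...},
#     "races": {
#         "10000": {
#             "areas": {
#                 "Ohio": {"data": [["1234", 500000], ...],
#                          "precincts": [88, 100]},
#                 "39001": {...},
#             },
#             "breakdown": <copy of areas["Ohio"]["data"]>,
#             "precincts": <copy of areas["Ohio"]["precincts"]>,
#             "winners": {"Ohio": "1234", "39001": None, ...},
#         },
#     },
#     "lastUpdated": [2012, 11, 6, 23, 59],
#     "test": False,
# }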
def create_nationwide_dict(state_dicts, last_updated):
output_dict = {
"areas": {},
"lastUpdated": [
last_updated.year, last_updated.month, last_updated.day,
last_updated.hour, last_updated.minute],
"parties": {},
"test": False,
}
for state_name, state_dict in state_dicts.iteritems():
if state_dict["test"]:
output_dict["test"] = True
output_dict["parties"][state_name] = dict([
(state_dict["candidates"][x[0]], x[1]) for x in
state_dict["parties"].iteritems()])
state_summary = {}
for race_id, race_name in state_dict["raceNames"].iteritems():
state_summary[race_name] = {
"breakdown": [
[state_dict["candidates"][x[0]], x[1]]
for x in state_dict["races"][race_id]["breakdown"]],
"precincts": state_dict["races"][race_id]["precincts"],
}
winner_id = state_dict["races"][race_id]["winners"][state_name]
if winner_id is None:
state_summary[race_name]["winner"] = None
else:
state_summary[race_name]["winner"] = state_dict["candidates"][
winner_id]
output_dict["areas"][state_name] = state_summary
return output_dict
def create_electoral_dict(ec_contents, last_updated):
output_dict = {
"areas": {},
"lastUpdated": [
last_updated.year, last_updated.month, last_updated.day,
last_updated.hour, last_updated.minute],
"test": False,
}
states = parse_electoral_file(ec_contents)
for state_name, state in states.iteritems():
state_dict = {
"breakdown": [],
"winner": None,
"precincts": []
}
for candidate in state.get_candidates(True):
if candidate.test:
output_dict["test"] = True
state_dict["precincts"] = [
candidate.precincts_reporting, candidate.total_precincts]
state_dict["breakdown"].append([
candidate.last_name, candidate.vote_count,
candidate.electoral_votes])
if candidate.winner:
state_dict["winner"] = candidate.last_name
output_dict["areas"][state_name] = state_dict
return output_dict
# Retrieving raw data, uploading processed data and archiving both
def process_states(states=[], download=True, upload=True, timestamp='latest'):
# Get current date and time.
if timestamp == 'latest':
last_updated = timezone(TIME_ZONE_TO_USE).localize(datetime.now())
file_timestamp = last_updated.strftime(TIMESTAMP_FORMAT)
logging.info("Current timestamp is %s" % file_timestamp)
else:
last_updated = timezone(TIME_ZONE_TO_USE).localize(datetime.strptime(
timestamp, TIMESTAMP_FORMAT))
file_timestamp = timestamp
logging.info("Using provided timestamp %s" % file_timestamp)
# Open the archive storage.
jsonp_zip = zipfile.ZipFile(JSONP_ZIP_PATH, 'a', zipfile.ZIP_DEFLATED)
raw_zip = None
if download:
raw_zip = zipfile.ZipFile(RAW_ZIP_PATH, 'a', zipfile.ZIP_DEFLATED)
elif timestamp != 'latest':
raw_zip = zipfile.ZipFile(RAW_ZIP_PATH, 'r', zipfile.ZIP_DEFLATED)
# Hang onto the files here for later FTP use.
to_upload_str = {}
all_dicts = {}
# Retrieve and parse the electoral data.
if timestamp != 'latest':
logging.debug("Reading archived data")
ec_contents = raw_zip.read("pres_electoral-%s.txt" % file_timestamp)
electoral_dict = create_electoral_dict(ec_contents, last_updated)
elif download:
logging.debug("Downloading raw data")
attempts = 0
while attempts < RETRY_ATTEMPTS:
r = requests.get(
FLAT_FILE_REMOTE_ROOT + "Pres_Reports/flat/pres_electoral.txt")
ec_contents = r.text
try:
electoral_dict = create_electoral_dict(
ec_contents, last_updated)
break
except Exception, e:
attempts += 1
logging.warning(
"Error parsing Electoral College: %s" % e.message)
sleep(RETRY_DELAY)
continue
if attempts >= RETRY_ATTEMPTS:
raise Exception("Out of retries for Electoral College")
raw_zip.writestr('pres_electoral-%s.txt' % file_timestamp, ec_contents)
else:
logging.debug("Reading test data")
ec_flat_file = open(
FLAT_FILE_LOCAL_ROOT + "Pres_Reports/flat/pres_electoral.txt", 'r')
ec_contents = ec_flat_file.read()
ec_flat_file.close()
electoral_dict = create_electoral_dict(ec_contents, last_updated)
# Generate the electoral votes file.
logging.debug("Generating electoral dict")
if upload:
logging.debug("Rendering JSONP")
ec_jsonp = "US(%s)" % json.dumps(electoral_dict)
jsonp_zip.writestr("us_electoral-%s.json" % file_timestamp, ec_jsonp)
to_upload_str["us_electoral.json"] = ec_jsonp
# Generate each of the state results files.
for state_postal in states:
# Normalize state name and abbreviation.
state = state_postal.upper()
state_name = USPS_TO_STATE[state]
logging.debug("Starting %s" % state_name)
# Retrieve and parse the state's data.
if timestamp != 'latest':
logging.debug("Reading archived data")
file_contents = raw_zip.read("%s-%s.txt" % (state, file_timestamp))
races, race_names = parse_file(unicode(file_contents))
elif download:
logging.debug("Downloading raw data")
races = {}
race_names = {}
attempts = 0
while attempts < RETRY_ATTEMPTS:
r = requests.get(FLAT_FILE_REMOTE % {"state": state})
file_contents = r.text
try:
races, race_names = parse_file(file_contents)
break
except Exception, e:
attempts += 1
logging.warning(
"Error parsing state %s: %s" % (state_name, e.message))
sleep(RETRY_DELAY)
continue
if attempts >= RETRY_ATTEMPTS:
raise Exception("Out of retries for %s" % state_name)
raw_zip.writestr(
'%s-%s.txt' % (state, file_timestamp), file_contents)
else:
logging.debug("Reading test data")
races, race_names = parse_file(FLAT_FILE_LOCAL % {"state": state})
# Prepare the results object.
logging.debug("Generating results dict")
output_dict = create_results_dict(
races, race_names, state, state_name, last_updated)
all_dicts[state_name] = output_dict
# Replace the statewide presidential results with those from the
# national electoral file for consistency.
#
# First, find the number for the presidential race.
pres_race_number = None
for race_number, race_name in output_dict["raceNames"].iteritems():
if race_name == "President":
pres_race_number = race_number
break
if pres_race_number:
state_flat = output_dict["races"][pres_race_number]
state_flat_area = state_flat["areas"][state_name]
state_electoral = electoral_dict["areas"][state_name]
# Find the candidate numbers for each candidate described in the
# electoral file's breakdown.
candidate_numbers = {}
candidate_names = dict([
(output_dict["candidates"][x[0]], x[0])
for x in state_flat_area["data"]])
for i in xrange(len(state_electoral["breakdown"])):
cand_electoral = state_electoral["breakdown"][i]
cand_last_name = cand_electoral[0]
cand_popular = state_flat_area["data"][i]
cand_full_name = output_dict["candidates"][cand_popular[0]]
if (cand_electoral[1] == cand_popular[1] or
cand_full_name.endswith(cand_last_name)):
# This should be the same person.
candidate_numbers[cand_last_name] = cand_popular[0]
else:
# We're guessing here, but we'll go with someone out of
# this state's presidential candidates who has the same
# last name. Because of cases where AP does really silly
# things (such as the candidate "Gloria La Riva" being
# referred to as "LaRiva" (no space) in the electoral
# file), we normalize the names by lowercasing them and
# removing spaces. This should also help with the last name
# "Noneofthesecandidates" in Nevada.
for cand_full_name in candidate_names:
if cand_full_name.lower().replace(' ', '').endswith(
cand_last_name.lower().replace(' ', '')):
candidate_numbers[cand_last_name] = (
candidate_names[cand_full_name])
break
# Rewrite the breakdown.
new_breakdown = [
[candidate_numbers[x[0]], x[1]]
for x in state_electoral["breakdown"]]
# Replace the actual dict contents.
state_flat_area["precincts"] = state_electoral["precincts"]
state_flat_area["data"] = new_breakdown
state_flat["precincts"] = state_flat_area["precincts"]
state_flat["breakdown"] = state_flat_area["data"]
if state_electoral["winner"] is not None:
state_flat["winners"][state_name] = candidate_numbers[
state_electoral["winner"]]
else:
state_flat["winners"][state_name] = None
# Generate, archive and queue the JSONP for upload if needed.
if upload:
logging.debug("Rendering JSONP")
jsonp = "%s(%s)" % (state, json.dumps(output_dict))
jsonp_filename = '%s_general.json' % state.lower()
jsonp_zip_filename = '%s_general-%s.json' % (
state.lower(), file_timestamp)
jsonp_zip.writestr(jsonp_zip_filename, jsonp)
to_upload_str[jsonp_filename] = jsonp
# Generate the nationwide results file.
logging.debug("Generating nationwide dict")
nationwide_dict = create_nationwide_dict(all_dicts, last_updated)
nationwide_dict["electoralData"] = electoral_dict["areas"]
all_dicts["United States"] = nationwide_dict
if upload:
logging.debug("Rendering JSONP")
us_jsonp = "US(%s)" % json.dumps(nationwide_dict)
jsonp_zip.writestr("us_general-%s.json" % file_timestamp, us_jsonp)
to_upload_str["us_general.json"] = us_jsonp
# Upload all queued JSONP files.
if upload and to_upload_str:
logging.debug("Uploading files")
conn = S3Connection(AMAZON_ACCESS_KEY_ID, AMAZON_SECRET_ACCESS_KEY)
bucket = conn.create_bucket(S3_UPLOAD_BUCKET)
for jsonp_filename in to_upload_str:
k = Key(bucket)
k.key = S3_UPLOAD_PATH + jsonp_filename
k.set_contents_from_string(to_upload_str[jsonp_filename], {
'Content-Type': 'application/json',
})
k.set_acl('public-read')
# Close the archive storage. (raw_zip stays None when we read test data
# straight from the local filesystem, so only close it if it was opened.)
if raw_zip is not None: raw_zip.close()
jsonp_zip.close()
logging.info("Complete.")
return all_dicts
# Thanks: http://code.activestate.com/recipes/
# 578022-wait-for-pid-and-check-for-pid-existance-posix/
def pid_exists(pid):
"""Check whether pid exists in the current process table."""
if pid < 0:
return False
try:
os.kill(pid, 0)
except OSError, e:
return e.errno == errno.EPERM
else:
return True
if __name__ == '__main__':
# Run the retrieval function, of course, but let's use a lock file to make
# sure we aren't already running. (If the FTP server's being a bit slow,
# the solution probably isn't to inadvertently have several instances of
# our script hitting it simultaneously just because cron called it again.)
#
# Lock file implementation modified from here:
# http://shoaibmir.wordpress.com/2009/12/14/pid-lock-file-in-python/
if os.access(LOCK_FILE_PATH, os.F_OK):
# The lock file exists. It should contain the process ID number, so
# let's read that...
lock_file = open(LOCK_FILE_PATH, 'r')
lock_file.seek(0)
old_pid = lock_file.readline()
lock_file.close()
# ...and see if a process with that ID exists. If so, exit with an
# error message. If not, remove the lock file and continue on our
# merry way.
if pid_exists(int(old_pid)):
# This situation causes some weird errors of its own, since we'd end up
# with two processes writing to the same file. We should do something
# about this.
logging.error(
"This script is already running with process ID " +
old_pid + ".")
# Before we exit, let's see how many times this has happened.
unlock_count = 0
try:
unlock_file = open(UNLOCK_WAIT_PATH, 'r')
unlock_file.seek(0)
unlock_count = int(unlock_file.readline())
unlock_file.close()
except (IOError, ValueError):
pass
# If we've had this problem fewer than UNLOCK_WAIT_COUNT times,
# increase the counter and get out of here. If not, kill the
# offending process, reset the counter, remove the lock file and
# get out of here anyway to avoid getting off schedule.
if unlock_count < UNLOCK_WAIT_COUNT:
logging.error("On wait %s of %s before killing. Exiting." % (
unlock_count, UNLOCK_WAIT_COUNT))
unlock_file = open(UNLOCK_WAIT_PATH, 'w')
unlock_file.write(str(unlock_count + 1))
unlock_file.close()
sys.exit(1)
else:
logging.error("Done waiting. Killing old process.")
os.kill(int(old_pid), signal.SIGKILL)
unlock_file = open(UNLOCK_WAIT_PATH, 'w')
unlock_file.write(str(0))
unlock_file.close()
sys.exit(2)
else:
logging.debug(
"Lock file exists, but process " + old_pid +
" doesn't exist. Removing.")
os.remove(LOCK_FILE_PATH)
# Now that we've checked whether the lock file exists (and removed it if
# it's safe to do so), let's create a new lock file...
lock_file = open(LOCK_FILE_PATH, 'w')
lock_file.write(str(os.getpid()))
lock_file.close()
# ...reset the unlock counter...
unlock_file = open(UNLOCK_WAIT_PATH, 'w')
unlock_file.write(str(0))
unlock_file.close()
# ...and process the data.
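# Command-line behavior: no arguments processes every state; a single
# two-letter argument processes just that state; a single longer argument is
# treated as an archive timestamp (in TIMESTAMP_FORMAT) to replay from the
# zip archives; any other argument list is treated as a list of state
# abbreviations.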
try:
if len(sys.argv) == 1:
process_states(sorted(STATE_TO_FIPS.keys()))
elif len(sys.argv) == 2:
if len(sys.argv[1]) == 2:
process_states([sys.argv[1]])
else:
process_states(
sorted(STATE_TO_FIPS.keys()), timestamp=sys.argv[1])
else:
process_states(sys.argv[1:])
except Exception:
logging.exception("Exception!")
# When we're done, remove the lock file.
os.remove(LOCK_FILE_PATH)