Skip to content

Instantly share code, notes, and snippets.

@mappingvermont
Last active January 23, 2020 19:12
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mappingvermont/6357bd2ad8dbc35ee67f8ba107cc0432 to your computer and use it in GitHub Desktop.
Save mappingvermont/6357bd2ad8dbc35ee67f8ba107cc0432 to your computer and use it in GitHub Desktop.
script to format tabular election CSV to GIS attribute table - all joining to county FIPS code
import pandas as pd
import geopandas as gpd
def clean_source_data():
df = pd.read_csv('countypres_2000-2016.csv')
# remove year 2000 election results
df = df[df.year != 2000]
# rename state_po --> state_abbrev, and county --> name
# (better to standardize for county vs. state tilesets)
df = df.rename(columns={'state_po': 'state_abbrev', 'county': 'name'})
# remove any rows that don't have a FIPS code- three such entities:
# 'Statewide writein' (CT) 'Maine UOCAVA' (ME), 'Federal Precinct' (RI)
# total of 78,759 votes across 2004 - 2016 elections
df = df[~df.FIPS.isnull()]
# make sure we read FIPS as an integer- not float
df['FIPS'] = df.FIPS.astype(int)
# correct the FIPS code for Oglala Lakota County, SD
# should be 46102, not 46113
df.loc[df['FIPS'] == 46113, 'FIPS'] = 46102
# fix Bedford, VA issue- it was an independent city (with FIPS 51515)
# from 2004 - 2012; merged back in to county 51019 in 2016
# as our shapes are from 2016, we'll tabulate everything in 51019
df.loc[df['FIPS'] == 51515, 'FIPS'] = 51019
# the city of Kansas City, MO partially covers multiple counties, with the city
# itself reporting election results separate from any of the counties it touches.
# it appears that the NYT adds the Kansas City results to Jackson County MO, the
# county it covers most thoroughly. We'll do that as well.
# (Change weird 'Kansas City' FIPS to Jackson County MO FIPS
df.loc[df['FIPS'] == 36000, 'FIPS'] = 29095
df.loc[df['FIPS'] == 29095, 'name'] = 'Jackson'
# Alaska reports their election results by State Rep district rather than by county
# Given redistricting after the 2012 election, these shapes aren't consistent for
# our time period. We'll report AK results only at the state level, just as the NYT does
df.loc[df['state'] == 'Alaska', 'FIPS'] = 2000
df.loc[df['state'] == 'Alaska', 'name'] = 'Alaska'
# Alaska rows for District 99 (overseas / abroad) have a null value
# for state_abbrev- fix this as this is something we want in the final output
df.loc[df['state'] == 'Alaska', 'state_abbrev'] = 'AK'
# clean up / abbreviate party names
df.loc[df['party'].isnull(), 'party'] = 'other'
df['party'] = df.party.str[0:3]
# set all candidates votes to 0 if previously null
df.loc[df['candidatevotes'].isnull(), 'candidatevotes'] = 0
return df.copy()
def build_land_area_df():
census_county_src = gpd.read_file('tiger_source/tl_2019_us_county.shp')
census_county_src['FIPS'] = census_county_src.GEOID.astype(int)
# remove all territories / Puerto Rico, etc
census_county_src = census_county_src[census_county_src.FIPS < 57000]
# treat all of AK as the same FIPS code, merge Kalawao County, HI
census_county_src.loc[census_county_src['STATEFP'] == '02', 'FIPS'] = 2000
census_county_src.loc[census_county_src['FIPS'] == 15005, 'FIPS'] = 15009
# redenominate land_area from m2 --> km2
census_county_src.ALAND = census_county_src.ALAND / 1e6
# group and sum to combine AK data into one
census_county_src = census_county_src.groupby('FIPS').sum()['ALAND'].reset_index()
census_county_src.columns = ['FIPS', 'land_area_km2']
return census_county_src
def build_results_df(df):
df = df.copy()
# turn year into a string so we can use it in our field names
df['year'] = df.year.astype('str')
df = df.groupby(['FIPS', 'name', 'state_abbrev', 'year', 'party']) \
.sum()['candidatevotes'].reset_index()
# find the winning vote total for each FIPS / year combination
max_votes = df.groupby(['FIPS', 'year']).max()['candidatevotes'].reset_index()
max_votes.columns = ['FIPS', 'year', 'max_votes']
# join max_votes for each year back to our main DF
# then use this to identify the winner for each FIPS / year
df = pd.merge(df, max_votes, on=['FIPS', 'year'])
df.loc[df.candidatevotes == df.max_votes, 'winner'] = df['party']
# store winning party as a numeric value so we can properly group it
party_dict = {'dem': 0, 'oth': 1, 'rep': 2}
df['winner_numeric'] = df['winner'].map(party_dict)
# find second place votes to tabulate margin of victory
second_highest_votes = df[df.winner.isnull()].groupby(['FIPS', 'year']) \
.max()['candidatevotes'].reset_index()
second_highest_votes.columns = ['FIPS', 'year', 'second_highest_votes']
df = pd.merge(df, second_highest_votes, on=['FIPS', 'year'])
# save this summary df of party winners, max votes
# and runner up votes to ultimately join back to
summary_df = df[~df.winner.isnull()].copy()
summary_df = summary_df[['FIPS', 'year', 'max_votes', 'winner_numeric', 'second_highest_votes']]
# rename the 'party' columns to be {party}_count to help with column naming later
df['party'] = df['party'] + '_count'
# start by grouping by FIPS / year to tabulate winners / percentages
by_year = df.pivot_table(index=['FIPS', 'year'], columns=['party'], values='candidatevotes') \
.reset_index()
# total all votes for each election
by_year['total_votes'] = by_year['dem_count'] + by_year['rep_count'] + by_year['oth_count']
# join back to summary stats
by_year = pd.merge(by_year, summary_df, on=['FIPS', 'year'])
# margin of victory based on https://ballotpedia.org/Margin-of-victory_(MOV)
by_year['mov_count'] = by_year.max_votes - by_year.second_highest_votes
by_year['mov_pct'] = (( by_year.max_votes / by_year.total_votes ) - \
( by_year.second_highest_votes / by_year.total_votes )) * 100
# calculate percentages for each party
by_year['dem_pct'] = (by_year.dem_count / by_year.total_votes) * 100
by_year['rep_pct'] = (by_year.rep_count / by_year.total_votes) * 100
by_year['oth_pct'] = (by_year.oth_count / by_year.total_votes) * 100
# pivot our data to our desired format- group by FIPS and year with many result columns
value_list = ['dem_count', 'rep_count', 'oth_count',
'dem_pct', 'rep_pct', 'oth_pct', 'total_votes',
'winner_numeric', 'mov_count', 'mov_pct']
grouped = by_year.pivot_table(index=['FIPS'], columns=['year'], values=value_list)
# flatten our index to generate reasonable column names (2004_dem, 2004_rep, etc)
grouped.columns = ['_'.join([col[-1], col[0]]) for col in grouped.columns.values]
# map back to our actual party labels
party_dict_inverse = {v: k for k, v in party_dict.items()}
grouped['2004_winner'] = grouped['2004_winner_numeric'].map(party_dict_inverse)
grouped['2008_winner'] = grouped['2008_winner_numeric'].map(party_dict_inverse)
grouped['2012_winner'] = grouped['2012_winner_numeric'].map(party_dict_inverse)
grouped['2016_winner'] = grouped['2016_winner_numeric'].map(party_dict_inverse)
# add back in county_name and state
name_df = df.groupby(['FIPS', 'name', 'state_abbrev']).size().reset_index()
del name_df[0]
grouped = grouped.merge(name_df, on='FIPS')
return grouped
def write_csv(df, csv_name, is_state=False):
# rename to avoid QGIS conflict
df = df.rename(columns={'name': 'census_name'})
field_list = ['FIPS', 'census_name', 'state_abbrev', 'land_area_km2']
for year in range(2004, 2020, 4):
template_list = ['{}_winner', '{}_mov_count', '{}_mov_pct',
'{}_dem_count', '{}_dem_pct', '{}_rep_count',
'{}_rep_pct', '{}_oth_count', '{}_oth_pct',
'{}_total_votes', '{}_pop_count']
if is_state:
template_list += ['{}_voter_turnout']
field_list += [x.format(year) for x in template_list]
df = df[field_list]
df.to_csv(csv_name, header=True, index=False, float_format='%.1f')
import pandas as pd
def build_pop_df():
# pull intercensal estimates from here:
# https://www2.census.gov/programs-surveys/popest/datasets/2000-2010/intercensal/county/co-est00int-tot.csv
y2000_2010_df = pd.read_csv('census_pop/co-est00int-tot.csv', encoding='latin')
# and current estimates from here:
# https://www2.census.gov/programs-surveys/popest/datasets/2010-2018/counties/totals/co-est2018-alldata.csv
y2010_2018_df = pd.read_csv('census_pop/co-est2018-alldata.csv', encoding='latin')
# remove all state level entries from these tables
y2000_2010_df = y2000_2010_df[y2000_2010_df.COUNTY != 0]
y2010_2018_df = y2010_2018_df[y2010_2018_df.COUNTY != 0]
# calculate FIPS, zfilling as necessary
y2000_2010_df['FIPS'] = (y2000_2010_df.STATE.astype(str).str.zfill(2) +
y2000_2010_df.COUNTY.astype(str).str.zfill(3)).astype(int)
y2010_2018_df['FIPS'] = (y2010_2018_df.STATE.astype(str).str.zfill(2) +
y2010_2018_df.COUNTY.astype(str).str.zfill(3)).astype(int)
# treat AK as one FIPS code
y2000_2010_df.loc[y2000_2010_df['STATE'] == 2, 'FIPS'] = 2000
y2010_2018_df.loc[y2010_2018_df['STATE'] == 2, 'FIPS'] = 2000
# merge Kalawao County with Maui county- election data only has Maui
y2000_2010_df.loc[y2000_2010_df['FIPS'] == 15005, 'FIPS'] = 15009
y2010_2018_df.loc[y2010_2018_df['FIPS'] == 15005, 'FIPS'] = 15009
# fix Oglala Lakota County, SD and Bedford, VA
y2000_2010_df.loc[y2000_2010_df['FIPS'] == 51515, 'FIPS'] = 51019
y2000_2010_df.loc[y2000_2010_df['FIPS'] == 46113, 'FIPS'] = 46102
# group and sum our population columns of interest after fixing FIPS issues
y2000_2010_df = y2000_2010_df.groupby('FIPS').sum()[['POPESTIMATE2004', 'POPESTIMATE2008']]
y2010_2018_df = y2010_2018_df.groupby('FIPS').sum()[['POPESTIMATE2012', 'POPESTIMATE2016']]
merged = pd.merge(y2000_2010_df, y2010_2018_df, on='FIPS').reset_index()
merged.columns = ['FIPS', '2004_pop_count', '2008_pop_count', '2012_pop_count', '2016_pop_count']
return merged
import pandas as pd
def build_turnout_df():
df_1980_2014 = pd.read_csv('voter_turnout_data/1980_to_2014.csv')
df_2016 = pd.read_csv('voter_turnout_data/2016.csv').reset_index()
df_1980_2014 = df_1980_2014[df_1980_2014.Year.isin([2004, 2008, 2012])]
df_1980_2014 = df_1980_2014[['Year', 'Unnamed: 3', 'VEP Highest Office']]
df_1980_2014.columns = ['year', 'name', 'voter_turnout_pct']
df_2016['Year'] = 2016
df_2016 = df_2016[['Year', 'index', 'VEP Highest Office']]
df_2016.columns = ['year', 'name', 'voter_turnout_pct']
df = pd.concat([df_1980_2014, df_2016])
# remove all US results and note in 2016 data
df = df[~(df.name == 'United States')]
df = df[~df.voter_turnout_pct.isnull()]
# add proper column name to year column before pivoting
df['year'] = df['year'].astype(str) + '_voter_turnout'
# convert voter_turnout_pct to float
df.voter_turnout_pct = df.voter_turnout_pct.str.rstrip('%').astype(float)
return df.pivot_table(index='name', columns='year', values='voter_turnout_pct').reset_index()
import pandas as pd
def qc_county_results():
print("QCing county output")
df = pd.read_csv('historical_election_data_by_county.csv')
# confirm that all our percentages add to ~100
df['2004_total_pct'] = df['2004_dem_pct'] + df['2004_rep_pct'] + df['2004_oth_pct']
df['2008_total_pct'] = df['2008_dem_pct'] + df['2008_rep_pct'] + df['2008_oth_pct']
df['2012_total_pct'] = df['2012_dem_pct'] + df['2012_rep_pct'] + df['2012_oth_pct']
df['2016_total_pct'] = df['2016_dem_pct'] + df['2016_rep_pct'] + df['2016_oth_pct']
assert df[( df['2004_total_pct'] <= 99.8 ) | ( df['2004_total_pct'] >= 100.2 ) ].empty
assert df[( df['2008_total_pct'] <= 99.8 ) | ( df['2008_total_pct'] >= 100.2 ) ].empty
assert df[( df['2012_total_pct'] <= 99.8 ) | ( df['2012_total_pct'] >= 100.2 ) ].empty
assert df[( df['2016_total_pct'] <= 99.8 ) | ( df['2016_total_pct'] >= 100.2 ) ].empty
# expecting the normal 3142 US / state counties
# minus 29 AK counties
# minus 1 tiny HI county
# plus 1 record for all of AK
assert df.shape[0] == 3113
# add votes by county compare to total votes in source df
source_df = pd.read_csv('countypres_2000-2016.csv')
source_df = source_df[source_df.year != 2000]
# group to get only total votes, year and FIPS (for some reason total votes in this normalized form)
source_df_g = source_df.groupby(['year', 'FIPS', 'totalvotes']).size().reset_index()
# pivot to match our data format
pivoted = source_df_g.pivot_table(index=['FIPS'], columns='year', values='totalvotes')
pivoted.columns = ['2004_qc_total', '2008_qc_total', '2012_qc_total', '2016_qc_total']
merged = df.join(pivoted, on='FIPS')
merged['2004_new_total'] = merged['2004_dem_count'] + merged['2004_rep_count'] + merged['2004_oth_count']
merged['2008_new_total'] = merged['2008_dem_count'] + merged['2008_rep_count'] + merged['2008_oth_count']
merged['2012_new_total'] = merged['2012_dem_count'] + merged['2012_rep_count'] + merged['2012_oth_count']
merged['2016_new_total'] = merged['2016_dem_count'] + merged['2016_rep_count'] + merged['2016_oth_count']
# we expect two mismatches here due to county combinations- moving Kansas City MO into Jackson MO,
# and the city of Bedford VA into the county of Bedford
assert merged[(merged['2004_new_total'] != merged['2004_qc_total']) & (~merged['2004_qc_total'].isnull())].shape[0] == 2
assert merged[(merged['2008_new_total'] != merged['2008_qc_total']) & (~merged['2004_qc_total'].isnull())].shape[0] == 2
assert merged[(merged['2012_new_total'] != merged['2012_qc_total']) & (~merged['2004_qc_total'].isnull())].shape[0] == 2
# 2016 total votes != the sum of candidate votes for Gila AZ, Graham AZ, Greenlee AZ and Keya Paha NE
# Not immediately clear why this is; we'll stick with using the candidate votes to display our results
# we're looking for 5 mismatches here - Bedford VA was merged back into the county in 2016, so it's
# just the 4 mismatches, plus Kansas City --> Jackson MO
assert merged[(merged['2016_new_total'] != merged['2016_qc_total']) & (~merged['2004_qc_total'].isnull())].shape[0] == 5
# we'll have two rows we can't join to our source data- combined AK, and the Oglala Lakota County in SD,
# which has a new / changed FIPS code
assert merged[
(merged['2004_qc_total'].isnull()) |
(merged['2008_qc_total'].isnull()) |
(merged['2012_qc_total'].isnull()) |
(merged['2016_qc_total'].isnull())
].shape[0] == 2
# finally, check counties we dropped from the source data
# should be 41 AK districts, plus Kansas City, Bedford VA and Oglala Lakota
pivoted = pivoted.reset_index()
joined_back_to_source = pd.merge(pivoted, df, on='FIPS', how='left')
# look for anything in the AK FIPS code range. AK state FIPS is 02, which gets truncated to 2,
# with the remaining 3 digits for county / district FIPS
assert joined_back_to_source[
(joined_back_to_source.census_name.isnull()) &
(joined_back_to_source.FIPS > 2000) &
(joined_back_to_source.FIPS < 3000)
].shape[0] == 41
# KC with that weird 36000 FIPS code
# Oglala Lakota county FIPS change
# Bedford VA merging back to the larger surrounding county
expected_missing_fips = [36000, 46113, 51515]
assert joined_back_to_source[
(joined_back_to_source.census_name.isnull()) &
(joined_back_to_source.FIPS.isin(expected_missing_fips) )
].shape[0] == 3
import pandas as pd
import format_election_results
from format_population_data import build_pop_df
from format_voter_turnout import build_turnout_df
from qc_checks import qc_county_results
county_source_df = format_election_results.clean_source_data()
county_results_df = format_election_results.build_results_df(county_source_df)
county_pop_df = build_pop_df()
county_land_area_df = format_election_results.build_land_area_df()
# join to population data and write to CSV
final_county_df = pd.merge(county_results_df, county_pop_df, on='FIPS') \
.merge(county_land_area_df, on='FIPS')
format_election_results.write_csv(final_county_df, 'historical_election_data_by_county.csv')
# start building out state results up from county data
state_source_df = county_source_df.copy()
state_pop_df = county_pop_df.copy()
state_land_area_df = county_land_area_df.copy()
# build state FIPS codes by zfilling integer values to 5, then taking first two digits
state_source_df.FIPS = state_source_df.FIPS.astype(str).str.zfill(5).str[0:2].astype(int)
state_pop_df.FIPS = state_pop_df.FIPS.astype(str).str.zfill(5).str[0:2].astype(int)
state_land_area_df.FIPS = state_land_area_df.FIPS.astype(str).str.zfill(5).str[0:2].astype(int)
# group by state, summing county population for our target years
state_pop_df = state_pop_df.groupby('FIPS').sum()[state_pop_df.columns[1:]]
state_land_area_df = state_land_area_df.groupby('FIPS').sum()['land_area_km2'].reset_index()
# use state for the name instead of county
state_source_df.name = state_source_df.state
# group and sum results, assign winners
state_results_df = format_election_results.build_results_df(state_source_df)
# combine with population data
state_with_pop_and_land = pd.merge(state_results_df, state_pop_df, on='FIPS') \
.merge(state_land_area_df, on='FIPS')
# add voter turnout data
voter_turnout_df = build_turnout_df()
final_state_df = pd.merge(state_with_pop_and_land, voter_turnout_df, on='name')
format_election_results.write_csv(final_state_df, 'historical_election_data_by_state.csv', is_state=True)
# run QC on the county table
qc_county_results()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment