Last active
January 23, 2020 19:12
-
-
Save mappingvermont/6357bd2ad8dbc35ee67f8ba107cc0432 to your computer and use it in GitHub Desktop.
script to format tabular election CSV to GIS attribute table - all joining to county FIPS code
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import geopandas as gpd | |
def clean_source_data(): | |
df = pd.read_csv('countypres_2000-2016.csv') | |
# remove year 2000 election results | |
df = df[df.year != 2000] | |
# rename state_po --> state_abbrev, and county --> name | |
# (better to standardize for county vs. state tilesets) | |
df = df.rename(columns={'state_po': 'state_abbrev', 'county': 'name'}) | |
# remove any rows that don't have a FIPS code- three such entities: | |
# 'Statewide writein' (CT) 'Maine UOCAVA' (ME), 'Federal Precinct' (RI) | |
# total of 78,759 votes across 2004 - 2016 elections | |
df = df[~df.FIPS.isnull()] | |
# make sure we read FIPS as an integer- not float | |
df['FIPS'] = df.FIPS.astype(int) | |
# correct the FIPS code for Oglala Lakota County, SD | |
# should be 46102, not 46113 | |
df.loc[df['FIPS'] == 46113, 'FIPS'] = 46102 | |
# fix Bedford, VA issue- it was an independent city (with FIPS 51515) | |
# from 2004 - 2012; merged back in to county 51019 in 2016 | |
# as our shapes are from 2016, we'll tabulate everything in 51019 | |
df.loc[df['FIPS'] == 51515, 'FIPS'] = 51019 | |
# the city of Kansas City, MO partially covers multiple counties, with the city | |
# itself reporting election results separate from any of the counties it touches. | |
# it appears that the NYT adds the Kansas City results to Jackson County MO, the | |
# county it covers most thoroughly. We'll do that as well. | |
# (Change weird 'Kansas City' FIPS to Jackson County MO FIPS | |
df.loc[df['FIPS'] == 36000, 'FIPS'] = 29095 | |
df.loc[df['FIPS'] == 29095, 'name'] = 'Jackson' | |
# Alaska reports their election results by State Rep district rather than by county | |
# Given redistricting after the 2012 election, these shapes aren't consistent for | |
# our time period. We'll report AK results only at the state level, just as the NYT does | |
df.loc[df['state'] == 'Alaska', 'FIPS'] = 2000 | |
df.loc[df['state'] == 'Alaska', 'name'] = 'Alaska' | |
# Alaska rows for District 99 (overseas / abroad) have a null value | |
# for state_abbrev- fix this as this is something we want in the final output | |
df.loc[df['state'] == 'Alaska', 'state_abbrev'] = 'AK' | |
# clean up / abbreviate party names | |
df.loc[df['party'].isnull(), 'party'] = 'other' | |
df['party'] = df.party.str[0:3] | |
# set all candidates votes to 0 if previously null | |
df.loc[df['candidatevotes'].isnull(), 'candidatevotes'] = 0 | |
return df.copy() | |
def build_land_area_df(): | |
census_county_src = gpd.read_file('tiger_source/tl_2019_us_county.shp') | |
census_county_src['FIPS'] = census_county_src.GEOID.astype(int) | |
# remove all territories / Puerto Rico, etc | |
census_county_src = census_county_src[census_county_src.FIPS < 57000] | |
# treat all of AK as the same FIPS code, merge Kalawao County, HI | |
census_county_src.loc[census_county_src['STATEFP'] == '02', 'FIPS'] = 2000 | |
census_county_src.loc[census_county_src['FIPS'] == 15005, 'FIPS'] = 15009 | |
# redenominate land_area from m2 --> km2 | |
census_county_src.ALAND = census_county_src.ALAND / 1e6 | |
# group and sum to combine AK data into one | |
census_county_src = census_county_src.groupby('FIPS').sum()['ALAND'].reset_index() | |
census_county_src.columns = ['FIPS', 'land_area_km2'] | |
return census_county_src | |
def build_results_df(df): | |
df = df.copy() | |
# turn year into a string so we can use it in our field names | |
df['year'] = df.year.astype('str') | |
df = df.groupby(['FIPS', 'name', 'state_abbrev', 'year', 'party']) \ | |
.sum()['candidatevotes'].reset_index() | |
# find the winning vote total for each FIPS / year combination | |
max_votes = df.groupby(['FIPS', 'year']).max()['candidatevotes'].reset_index() | |
max_votes.columns = ['FIPS', 'year', 'max_votes'] | |
# join max_votes for each year back to our main DF | |
# then use this to identify the winner for each FIPS / year | |
df = pd.merge(df, max_votes, on=['FIPS', 'year']) | |
df.loc[df.candidatevotes == df.max_votes, 'winner'] = df['party'] | |
# store winning party as a numeric value so we can properly group it | |
party_dict = {'dem': 0, 'oth': 1, 'rep': 2} | |
df['winner_numeric'] = df['winner'].map(party_dict) | |
# find second place votes to tabulate margin of victory | |
second_highest_votes = df[df.winner.isnull()].groupby(['FIPS', 'year']) \ | |
.max()['candidatevotes'].reset_index() | |
second_highest_votes.columns = ['FIPS', 'year', 'second_highest_votes'] | |
df = pd.merge(df, second_highest_votes, on=['FIPS', 'year']) | |
# save this summary df of party winners, max votes | |
# and runner up votes to ultimately join back to | |
summary_df = df[~df.winner.isnull()].copy() | |
summary_df = summary_df[['FIPS', 'year', 'max_votes', 'winner_numeric', 'second_highest_votes']] | |
# rename the 'party' columns to be {party}_count to help with column naming later | |
df['party'] = df['party'] + '_count' | |
# start by grouping by FIPS / year to tabulate winners / percentages | |
by_year = df.pivot_table(index=['FIPS', 'year'], columns=['party'], values='candidatevotes') \ | |
.reset_index() | |
# total all votes for each election | |
by_year['total_votes'] = by_year['dem_count'] + by_year['rep_count'] + by_year['oth_count'] | |
# join back to summary stats | |
by_year = pd.merge(by_year, summary_df, on=['FIPS', 'year']) | |
# margin of victory based on https://ballotpedia.org/Margin-of-victory_(MOV) | |
by_year['mov_count'] = by_year.max_votes - by_year.second_highest_votes | |
by_year['mov_pct'] = (( by_year.max_votes / by_year.total_votes ) - \ | |
( by_year.second_highest_votes / by_year.total_votes )) * 100 | |
# calculate percentages for each party | |
by_year['dem_pct'] = (by_year.dem_count / by_year.total_votes) * 100 | |
by_year['rep_pct'] = (by_year.rep_count / by_year.total_votes) * 100 | |
by_year['oth_pct'] = (by_year.oth_count / by_year.total_votes) * 100 | |
# pivot our data to our desired format- group by FIPS and year with many result columns | |
value_list = ['dem_count', 'rep_count', 'oth_count', | |
'dem_pct', 'rep_pct', 'oth_pct', 'total_votes', | |
'winner_numeric', 'mov_count', 'mov_pct'] | |
grouped = by_year.pivot_table(index=['FIPS'], columns=['year'], values=value_list) | |
# flatten our index to generate reasonable column names (2004_dem, 2004_rep, etc) | |
grouped.columns = ['_'.join([col[-1], col[0]]) for col in grouped.columns.values] | |
# map back to our actual party labels | |
party_dict_inverse = {v: k for k, v in party_dict.items()} | |
grouped['2004_winner'] = grouped['2004_winner_numeric'].map(party_dict_inverse) | |
grouped['2008_winner'] = grouped['2008_winner_numeric'].map(party_dict_inverse) | |
grouped['2012_winner'] = grouped['2012_winner_numeric'].map(party_dict_inverse) | |
grouped['2016_winner'] = grouped['2016_winner_numeric'].map(party_dict_inverse) | |
# add back in county_name and state | |
name_df = df.groupby(['FIPS', 'name', 'state_abbrev']).size().reset_index() | |
del name_df[0] | |
grouped = grouped.merge(name_df, on='FIPS') | |
return grouped | |
def write_csv(df, csv_name, is_state=False): | |
# rename to avoid QGIS conflict | |
df = df.rename(columns={'name': 'census_name'}) | |
field_list = ['FIPS', 'census_name', 'state_abbrev', 'land_area_km2'] | |
for year in range(2004, 2020, 4): | |
template_list = ['{}_winner', '{}_mov_count', '{}_mov_pct', | |
'{}_dem_count', '{}_dem_pct', '{}_rep_count', | |
'{}_rep_pct', '{}_oth_count', '{}_oth_pct', | |
'{}_total_votes', '{}_pop_count'] | |
if is_state: | |
template_list += ['{}_voter_turnout'] | |
field_list += [x.format(year) for x in template_list] | |
df = df[field_list] | |
df.to_csv(csv_name, header=True, index=False, float_format='%.1f') |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
def build_pop_df(): | |
# pull intercensal estimates from here: | |
# https://www2.census.gov/programs-surveys/popest/datasets/2000-2010/intercensal/county/co-est00int-tot.csv | |
y2000_2010_df = pd.read_csv('census_pop/co-est00int-tot.csv', encoding='latin') | |
# and current estimates from here: | |
# https://www2.census.gov/programs-surveys/popest/datasets/2010-2018/counties/totals/co-est2018-alldata.csv | |
y2010_2018_df = pd.read_csv('census_pop/co-est2018-alldata.csv', encoding='latin') | |
# remove all state level entries from these tables | |
y2000_2010_df = y2000_2010_df[y2000_2010_df.COUNTY != 0] | |
y2010_2018_df = y2010_2018_df[y2010_2018_df.COUNTY != 0] | |
# calculate FIPS, zfilling as necessary | |
y2000_2010_df['FIPS'] = (y2000_2010_df.STATE.astype(str).str.zfill(2) + | |
y2000_2010_df.COUNTY.astype(str).str.zfill(3)).astype(int) | |
y2010_2018_df['FIPS'] = (y2010_2018_df.STATE.astype(str).str.zfill(2) + | |
y2010_2018_df.COUNTY.astype(str).str.zfill(3)).astype(int) | |
# treat AK as one FIPS code | |
y2000_2010_df.loc[y2000_2010_df['STATE'] == 2, 'FIPS'] = 2000 | |
y2010_2018_df.loc[y2010_2018_df['STATE'] == 2, 'FIPS'] = 2000 | |
# merge Kalawao County with Maui county- election data only has Maui | |
y2000_2010_df.loc[y2000_2010_df['FIPS'] == 15005, 'FIPS'] = 15009 | |
y2010_2018_df.loc[y2010_2018_df['FIPS'] == 15005, 'FIPS'] = 15009 | |
# fix Oglala Lakota County, SD and Bedford, VA | |
y2000_2010_df.loc[y2000_2010_df['FIPS'] == 51515, 'FIPS'] = 51019 | |
y2000_2010_df.loc[y2000_2010_df['FIPS'] == 46113, 'FIPS'] = 46102 | |
# group and sum our population columns of interest after fixing FIPS issues | |
y2000_2010_df = y2000_2010_df.groupby('FIPS').sum()[['POPESTIMATE2004', 'POPESTIMATE2008']] | |
y2010_2018_df = y2010_2018_df.groupby('FIPS').sum()[['POPESTIMATE2012', 'POPESTIMATE2016']] | |
merged = pd.merge(y2000_2010_df, y2010_2018_df, on='FIPS').reset_index() | |
merged.columns = ['FIPS', '2004_pop_count', '2008_pop_count', '2012_pop_count', '2016_pop_count'] | |
return merged |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
def build_turnout_df(): | |
df_1980_2014 = pd.read_csv('voter_turnout_data/1980_to_2014.csv') | |
df_2016 = pd.read_csv('voter_turnout_data/2016.csv').reset_index() | |
df_1980_2014 = df_1980_2014[df_1980_2014.Year.isin([2004, 2008, 2012])] | |
df_1980_2014 = df_1980_2014[['Year', 'Unnamed: 3', 'VEP Highest Office']] | |
df_1980_2014.columns = ['year', 'name', 'voter_turnout_pct'] | |
df_2016['Year'] = 2016 | |
df_2016 = df_2016[['Year', 'index', 'VEP Highest Office']] | |
df_2016.columns = ['year', 'name', 'voter_turnout_pct'] | |
df = pd.concat([df_1980_2014, df_2016]) | |
# remove all US results and note in 2016 data | |
df = df[~(df.name == 'United States')] | |
df = df[~df.voter_turnout_pct.isnull()] | |
# add proper column name to year column before pivoting | |
df['year'] = df['year'].astype(str) + '_voter_turnout' | |
# convert voter_turnout_pct to float | |
df.voter_turnout_pct = df.voter_turnout_pct.str.rstrip('%').astype(float) | |
return df.pivot_table(index='name', columns='year', values='voter_turnout_pct').reset_index() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
def qc_county_results(): | |
print("QCing county output") | |
df = pd.read_csv('historical_election_data_by_county.csv') | |
# confirm that all our percentages add to ~100 | |
df['2004_total_pct'] = df['2004_dem_pct'] + df['2004_rep_pct'] + df['2004_oth_pct'] | |
df['2008_total_pct'] = df['2008_dem_pct'] + df['2008_rep_pct'] + df['2008_oth_pct'] | |
df['2012_total_pct'] = df['2012_dem_pct'] + df['2012_rep_pct'] + df['2012_oth_pct'] | |
df['2016_total_pct'] = df['2016_dem_pct'] + df['2016_rep_pct'] + df['2016_oth_pct'] | |
assert df[( df['2004_total_pct'] <= 99.8 ) | ( df['2004_total_pct'] >= 100.2 ) ].empty | |
assert df[( df['2008_total_pct'] <= 99.8 ) | ( df['2008_total_pct'] >= 100.2 ) ].empty | |
assert df[( df['2012_total_pct'] <= 99.8 ) | ( df['2012_total_pct'] >= 100.2 ) ].empty | |
assert df[( df['2016_total_pct'] <= 99.8 ) | ( df['2016_total_pct'] >= 100.2 ) ].empty | |
# expecting the normal 3142 US / state counties | |
# minus 29 AK counties | |
# minus 1 tiny HI county | |
# plus 1 record for all of AK | |
assert df.shape[0] == 3113 | |
# add votes by county compare to total votes in source df | |
source_df = pd.read_csv('countypres_2000-2016.csv') | |
source_df = source_df[source_df.year != 2000] | |
# group to get only total votes, year and FIPS (for some reason total votes in this normalized form) | |
source_df_g = source_df.groupby(['year', 'FIPS', 'totalvotes']).size().reset_index() | |
# pivot to match our data format | |
pivoted = source_df_g.pivot_table(index=['FIPS'], columns='year', values='totalvotes') | |
pivoted.columns = ['2004_qc_total', '2008_qc_total', '2012_qc_total', '2016_qc_total'] | |
merged = df.join(pivoted, on='FIPS') | |
merged['2004_new_total'] = merged['2004_dem_count'] + merged['2004_rep_count'] + merged['2004_oth_count'] | |
merged['2008_new_total'] = merged['2008_dem_count'] + merged['2008_rep_count'] + merged['2008_oth_count'] | |
merged['2012_new_total'] = merged['2012_dem_count'] + merged['2012_rep_count'] + merged['2012_oth_count'] | |
merged['2016_new_total'] = merged['2016_dem_count'] + merged['2016_rep_count'] + merged['2016_oth_count'] | |
# we expect two mismatches here due to county combinations- moving Kansas City MO into Jackson MO, | |
# and the city of Bedford VA into the county of Bedford | |
assert merged[(merged['2004_new_total'] != merged['2004_qc_total']) & (~merged['2004_qc_total'].isnull())].shape[0] == 2 | |
assert merged[(merged['2008_new_total'] != merged['2008_qc_total']) & (~merged['2004_qc_total'].isnull())].shape[0] == 2 | |
assert merged[(merged['2012_new_total'] != merged['2012_qc_total']) & (~merged['2004_qc_total'].isnull())].shape[0] == 2 | |
# 2016 total votes != the sum of candidate votes for Gila AZ, Graham AZ, Greenlee AZ and Keya Paha NE | |
# Not immediately clear why this is; we'll stick with using the candidate votes to display our results | |
# we're looking for 5 mismatches here - Bedford VA was merged back into the county in 2016, so it's | |
# just the 4 mismatches, plus Kansas City --> Jackson MO | |
assert merged[(merged['2016_new_total'] != merged['2016_qc_total']) & (~merged['2004_qc_total'].isnull())].shape[0] == 5 | |
# we'll have two rows we can't join to our source data- combined AK, and the Oglala Lakota County in SD, | |
# which has a new / changed FIPS code | |
assert merged[ | |
(merged['2004_qc_total'].isnull()) | | |
(merged['2008_qc_total'].isnull()) | | |
(merged['2012_qc_total'].isnull()) | | |
(merged['2016_qc_total'].isnull()) | |
].shape[0] == 2 | |
# finally, check counties we dropped from the source data | |
# should be 41 AK districts, plus Kansas City, Bedford VA and Oglala Lakota | |
pivoted = pivoted.reset_index() | |
joined_back_to_source = pd.merge(pivoted, df, on='FIPS', how='left') | |
# look for anything in the AK FIPS code range. AK state FIPS is 02, which gets truncated to 2, | |
# with the remaining 3 digits for county / district FIPS | |
assert joined_back_to_source[ | |
(joined_back_to_source.census_name.isnull()) & | |
(joined_back_to_source.FIPS > 2000) & | |
(joined_back_to_source.FIPS < 3000) | |
].shape[0] == 41 | |
# KC with that weird 36000 FIPS code | |
# Oglala Lakota county FIPS change | |
# Bedford VA merging back to the larger surrounding county | |
expected_missing_fips = [36000, 46113, 51515] | |
assert joined_back_to_source[ | |
(joined_back_to_source.census_name.isnull()) & | |
(joined_back_to_source.FIPS.isin(expected_missing_fips) ) | |
].shape[0] == 3 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import format_election_results | |
from format_population_data import build_pop_df | |
from format_voter_turnout import build_turnout_df | |
from qc_checks import qc_county_results | |
county_source_df = format_election_results.clean_source_data() | |
county_results_df = format_election_results.build_results_df(county_source_df) | |
county_pop_df = build_pop_df() | |
county_land_area_df = format_election_results.build_land_area_df() | |
# join to population data and write to CSV | |
final_county_df = pd.merge(county_results_df, county_pop_df, on='FIPS') \ | |
.merge(county_land_area_df, on='FIPS') | |
format_election_results.write_csv(final_county_df, 'historical_election_data_by_county.csv') | |
# start building out state results up from county data | |
state_source_df = county_source_df.copy() | |
state_pop_df = county_pop_df.copy() | |
state_land_area_df = county_land_area_df.copy() | |
# build state FIPS codes by zfilling integer values to 5, then taking first two digits | |
state_source_df.FIPS = state_source_df.FIPS.astype(str).str.zfill(5).str[0:2].astype(int) | |
state_pop_df.FIPS = state_pop_df.FIPS.astype(str).str.zfill(5).str[0:2].astype(int) | |
state_land_area_df.FIPS = state_land_area_df.FIPS.astype(str).str.zfill(5).str[0:2].astype(int) | |
# group by state, summing county population for our target years | |
state_pop_df = state_pop_df.groupby('FIPS').sum()[state_pop_df.columns[1:]] | |
state_land_area_df = state_land_area_df.groupby('FIPS').sum()['land_area_km2'].reset_index() | |
# use state for the name instead of county | |
state_source_df.name = state_source_df.state | |
# group and sum results, assign winners | |
state_results_df = format_election_results.build_results_df(state_source_df) | |
# combine with population data | |
state_with_pop_and_land = pd.merge(state_results_df, state_pop_df, on='FIPS') \ | |
.merge(state_land_area_df, on='FIPS') | |
# add voter turnout data | |
voter_turnout_df = build_turnout_df() | |
final_state_df = pd.merge(state_with_pop_and_land, voter_turnout_df, on='name') | |
format_election_results.write_csv(final_state_df, 'historical_election_data_by_state.csv', is_state=True) | |
# run QC on the county table | |
qc_county_results() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment