mappingvermont/format_election_results.py

## format_election_results.py
import pandas as pd
import geopandas as gpd


def clean_source_data():

    df = pd.read_csv('countypres_2000-2016.csv')

    # remove year 2000 election results
    df = df[df.year != 2000]

    # rename state_po --> state_abbrev, and county --> name
    # (better to standardize for county vs. state tilesets)
    df = df.rename(columns={'state_po': 'state_abbrev', 'county': 'name'})

    # remove any rows that don't have a FIPS code- three such entities:
    # 'Statewide writein' (CT) 'Maine UOCAVA' (ME), 'Federal Precinct' (RI)
    # total of 78,759 votes across 2004 - 2016 elections
    df = df[~df.FIPS.isnull()]

    # make sure we read FIPS as an integer- not float
    df['FIPS'] = df.FIPS.astype(int)

    # correct the FIPS code for Oglala Lakota County, SD
    # should be 46102, not 46113
    df.loc[df['FIPS'] == 46113, 'FIPS'] = 46102

    # fix Bedford, VA issue- it was an independent city (with FIPS 51515)
    # from 2004 - 2012; merged back in to county 51019 in 2016
    # as our shapes are from 2016, we'll tabulate everything in 51019
    df.loc[df['FIPS'] == 51515, 'FIPS'] = 51019

    # the city of Kansas City, MO partially covers multiple counties, with the city
    # itself reporting election results separate from any of the counties it touches.
    # it appears that the NYT adds the Kansas City results to Jackson County MO, the
    # county it covers most thoroughly. We'll do that as well.
    # (Change weird 'Kansas City' FIPS to Jackson County MO FIPS
    df.loc[df['FIPS'] == 36000, 'FIPS'] = 29095
    df.loc[df['FIPS'] == 29095, 'name'] = 'Jackson'

    # Alaska reports their election results by State Rep district rather than by county
    # Given redistricting after the 2012 election, these shapes aren't consistent for
    # our time period. We'll report AK results only at the state level, just as the NYT does
    df.loc[df['state'] == 'Alaska', 'FIPS'] = 2000
    df.loc[df['state'] == 'Alaska', 'name'] = 'Alaska'

    # Alaska rows for District 99 (overseas / abroad) have a null value
    # for state_abbrev- fix this as this is something we want in the final output
    df.loc[df['state'] == 'Alaska', 'state_abbrev'] = 'AK'

    # clean up / abbreviate party names
    df.loc[df['party'].isnull(), 'party'] = 'other'
    df['party'] = df.party.str[0:3]

    # set all candidates votes to 0 if previously null
    df.loc[df['candidatevotes'].isnull(), 'candidatevotes'] = 0

    return df.copy()


def build_land_area_df():

    census_county_src = gpd.read_file('tiger_source/tl_2019_us_county.shp')
    census_county_src['FIPS'] = census_county_src.GEOID.astype(int)

    # remove all territories / Puerto Rico, etc
    census_county_src = census_county_src[census_county_src.FIPS < 57000]

    # treat all of AK as the same FIPS code, merge Kalawao County, HI
    census_county_src.loc[census_county_src['STATEFP'] == '02', 'FIPS'] = 2000
    census_county_src.loc[census_county_src['FIPS'] == 15005, 'FIPS'] = 15009

    # redenominate land_area from m2 --> km2
    census_county_src.ALAND = census_county_src.ALAND / 1e6

    # group and sum to combine AK data into one
    census_county_src = census_county_src.groupby('FIPS').sum()['ALAND'].reset_index()
    census_county_src.columns = ['FIPS', 'land_area_km2']

    return census_county_src


def build_results_df(df):

    df = df.copy()

    # turn year into a string so we can use it in our field names
    df['year'] = df.year.astype('str')

    df = df.groupby(['FIPS', 'name', 'state_abbrev', 'year', 'party']) \
           .sum()['candidatevotes'].reset_index()

    # find the winning vote total for each FIPS / year combination
    max_votes = df.groupby(['FIPS', 'year']).max()['candidatevotes'].reset_index()
    max_votes.columns = ['FIPS', 'year', 'max_votes']

    # join max_votes for each year back to our main DF
    # then use this to identify the winner for each FIPS / year
    df = pd.merge(df, max_votes, on=['FIPS', 'year'])
    df.loc[df.candidatevotes == df.max_votes, 'winner'] = df['party']

    # store winning party as a numeric value so we can properly group it
    party_dict = {'dem': 0, 'oth': 1, 'rep': 2}
    df['winner_numeric'] = df['winner'].map(party_dict)

    # find second place votes to tabulate margin of victory
    second_highest_votes = df[df.winner.isnull()].groupby(['FIPS', 'year']) \
                             .max()['candidatevotes'].reset_index()
    second_highest_votes.columns = ['FIPS', 'year', 'second_highest_votes']
    df = pd.merge(df, second_highest_votes, on=['FIPS', 'year'])

    # save this summary df of party winners, max votes
    # and runner up votes to ultimately join back to
    summary_df = df[~df.winner.isnull()].copy()
    summary_df = summary_df[['FIPS', 'year', 'max_votes', 'winner_numeric', 'second_highest_votes']]

    # rename the 'party' columns to be {party}_count to help with column naming later
    df['party'] = df['party'] + '_count'

    # start by grouping by FIPS / year to tabulate winners / percentages
    by_year = df.pivot_table(index=['FIPS', 'year'], columns=['party'], values='candidatevotes') \
                .reset_index()

    # total all votes for each election
    by_year['total_votes'] = by_year['dem_count'] + by_year['rep_count'] + by_year['oth_count']

    # join back to summary stats
    by_year = pd.merge(by_year, summary_df, on=['FIPS', 'year'])

    # margin of victory based on https://ballotpedia.org/Margin-of-victory_(MOV)
    by_year['mov_count'] = by_year.max_votes - by_year.second_highest_votes
    by_year['mov_pct'] = (( by_year.max_votes / by_year.total_votes ) - \
                                       ( by_year.second_highest_votes / by_year.total_votes )) * 100

    # calculate percentages for each party
    by_year['dem_pct'] = (by_year.dem_count / by_year.total_votes) * 100
    by_year['rep_pct'] = (by_year.rep_count / by_year.total_votes) * 100
    by_year['oth_pct'] = (by_year.oth_count / by_year.total_votes) * 100

    # pivot our data to our desired format- group by FIPS and year with many result columns
    value_list = ['dem_count', 'rep_count', 'oth_count',
                  'dem_pct', 'rep_pct', 'oth_pct', 'total_votes',
                  'winner_numeric', 'mov_count', 'mov_pct']
    grouped = by_year.pivot_table(index=['FIPS'], columns=['year'], values=value_list)

    # flatten our index to generate reasonable column names (2004_dem, 2004_rep, etc)
    grouped.columns = ['_'.join([col[-1], col[0]]) for col in grouped.columns.values]

    # map back to our actual party labels
    party_dict_inverse = {v: k for k, v in party_dict.items()}
    grouped['2004_winner'] = grouped['2004_winner_numeric'].map(party_dict_inverse)
    grouped['2008_winner'] = grouped['2008_winner_numeric'].map(party_dict_inverse)
    grouped['2012_winner'] = grouped['2012_winner_numeric'].map(party_dict_inverse)
    grouped['2016_winner'] = grouped['2016_winner_numeric'].map(party_dict_inverse)

    # add back in county_name and state
    name_df = df.groupby(['FIPS', 'name', 'state_abbrev']).size().reset_index()
    del name_df[0]

    grouped = grouped.merge(name_df, on='FIPS')

    return grouped


def write_csv(df, csv_name, is_state=False):

    # rename to avoid QGIS conflict
    df = df.rename(columns={'name': 'census_name'})

    field_list = ['FIPS', 'census_name', 'state_abbrev', 'land_area_km2']

    for year in range(2004, 2020, 4):
        template_list = ['{}_winner', '{}_mov_count', '{}_mov_pct',
                         '{}_dem_count', '{}_dem_pct', '{}_rep_count',
                         '{}_rep_pct', '{}_oth_count', '{}_oth_pct',
                         '{}_total_votes', '{}_pop_count']

        if is_state:
            template_list += ['{}_voter_turnout']

        field_list += [x.format(year) for x in template_list]

    df = df[field_list]

    df.to_csv(csv_name, header=True, index=False, float_format='%.1f')


## format_population_data.py
import pandas as pd


def build_pop_df():

    # pull intercensal estimates from here:
    # https://www2.census.gov/programs-surveys/popest/datasets/2000-2010/intercensal/county/co-est00int-tot.csv
    y2000_2010_df = pd.read_csv('census_pop/co-est00int-tot.csv', encoding='latin')

    # and current estimates from here:
    # https://www2.census.gov/programs-surveys/popest/datasets/2010-2018/counties/totals/co-est2018-alldata.csv
    y2010_2018_df = pd.read_csv('census_pop/co-est2018-alldata.csv', encoding='latin')

    # remove all state level entries from these tables
    y2000_2010_df = y2000_2010_df[y2000_2010_df.COUNTY != 0]
    y2010_2018_df = y2010_2018_df[y2010_2018_df.COUNTY != 0]

    # calculate FIPS, zfilling as necessary
    y2000_2010_df['FIPS'] = (y2000_2010_df.STATE.astype(str).str.zfill(2) +
                             y2000_2010_df.COUNTY.astype(str).str.zfill(3)).astype(int)

    y2010_2018_df['FIPS'] = (y2010_2018_df.STATE.astype(str).str.zfill(2) +
                             y2010_2018_df.COUNTY.astype(str).str.zfill(3)).astype(int)

    # treat AK as one FIPS code
    y2000_2010_df.loc[y2000_2010_df['STATE'] == 2, 'FIPS'] = 2000
    y2010_2018_df.loc[y2010_2018_df['STATE'] == 2, 'FIPS'] = 2000

    # merge Kalawao County with Maui county- election data only has Maui
    y2000_2010_df.loc[y2000_2010_df['FIPS'] == 15005, 'FIPS'] = 15009
    y2010_2018_df.loc[y2010_2018_df['FIPS'] == 15005, 'FIPS'] = 15009

    # fix Oglala Lakota County, SD and Bedford, VA
    y2000_2010_df.loc[y2000_2010_df['FIPS'] == 51515, 'FIPS'] = 51019
    y2000_2010_df.loc[y2000_2010_df['FIPS'] == 46113, 'FIPS'] = 46102

    # group and sum our population columns of interest after fixing FIPS issues
    y2000_2010_df = y2000_2010_df.groupby('FIPS').sum()[['POPESTIMATE2004', 'POPESTIMATE2008']]
    y2010_2018_df = y2010_2018_df.groupby('FIPS').sum()[['POPESTIMATE2012', 'POPESTIMATE2016']]

    merged = pd.merge(y2000_2010_df, y2010_2018_df, on='FIPS').reset_index()
    merged.columns = ['FIPS', '2004_pop_count', '2008_pop_count', '2012_pop_count', '2016_pop_count']

    return merged

## format_voter_turnout.py
import pandas as pd


def build_turnout_df():

    df_1980_2014 = pd.read_csv('voter_turnout_data/1980_to_2014.csv')
    df_2016 = pd.read_csv('voter_turnout_data/2016.csv').reset_index()

    df_1980_2014 = df_1980_2014[df_1980_2014.Year.isin([2004, 2008, 2012])]
    df_1980_2014 = df_1980_2014[['Year', 'Unnamed: 3', 'VEP Highest Office']]
    df_1980_2014.columns = ['year', 'name', 'voter_turnout_pct']

    df_2016['Year'] = 2016
    df_2016 = df_2016[['Year', 'index', 'VEP Highest Office']]
    df_2016.columns = ['year', 'name', 'voter_turnout_pct']

    df = pd.concat([df_1980_2014, df_2016])

    # remove all US results and note in 2016 data
    df = df[~(df.name == 'United States')]
    df = df[~df.voter_turnout_pct.isnull()]

    # add proper column name to year column before pivoting
    df['year'] = df['year'].astype(str) + '_voter_turnout'

    # convert voter_turnout_pct to float
    df.voter_turnout_pct = df.voter_turnout_pct.str.rstrip('%').astype(float)

    return df.pivot_table(index='name', columns='year', values='voter_turnout_pct').reset_index()


## qc_checks.py
import pandas as pd


def qc_county_results():

    print("QCing county output")

    df = pd.read_csv('historical_election_data_by_county.csv')

    # confirm that all our percentages add to ~100
    df['2004_total_pct'] = df['2004_dem_pct'] + df['2004_rep_pct'] + df['2004_oth_pct']
    df['2008_total_pct'] = df['2008_dem_pct'] + df['2008_rep_pct'] + df['2008_oth_pct']
    df['2012_total_pct'] = df['2012_dem_pct'] + df['2012_rep_pct'] + df['2012_oth_pct']
    df['2016_total_pct'] = df['2016_dem_pct'] + df['2016_rep_pct'] + df['2016_oth_pct']

    assert df[( df['2004_total_pct'] <= 99.8 ) | ( df['2004_total_pct'] >= 100.2 ) ].empty
    assert df[( df['2008_total_pct'] <= 99.8 ) | ( df['2008_total_pct'] >= 100.2 ) ].empty
    assert df[( df['2012_total_pct'] <= 99.8 ) | ( df['2012_total_pct'] >= 100.2 ) ].empty
    assert df[( df['2016_total_pct'] <= 99.8 ) | ( df['2016_total_pct'] >= 100.2 ) ].empty

    # expecting the normal 3142 US / state counties
    # minus 29 AK counties
    # minus 1 tiny HI county
    # plus 1 record for all of AK
    assert df.shape[0] == 3113

    # add votes by county compare to total votes in source df
    source_df = pd.read_csv('countypres_2000-2016.csv')
    source_df = source_df[source_df.year != 2000]

    # group to get only total votes, year and FIPS (for some reason total votes in this normalized form)
    source_df_g = source_df.groupby(['year', 'FIPS', 'totalvotes']).size().reset_index()

    # pivot to match our data format
    pivoted = source_df_g.pivot_table(index=['FIPS'], columns='year', values='totalvotes')
    pivoted.columns = ['2004_qc_total', '2008_qc_total', '2012_qc_total', '2016_qc_total']

    merged = df.join(pivoted, on='FIPS')
    merged['2004_new_total'] = merged['2004_dem_count'] + merged['2004_rep_count'] + merged['2004_oth_count']
    merged['2008_new_total'] = merged['2008_dem_count'] + merged['2008_rep_count'] + merged['2008_oth_count']
    merged['2012_new_total'] = merged['2012_dem_count'] + merged['2012_rep_count'] + merged['2012_oth_count']
    merged['2016_new_total'] = merged['2016_dem_count'] + merged['2016_rep_count'] + merged['2016_oth_count']

    # we expect two mismatches here due to county combinations- moving Kansas City MO into Jackson MO,
    # and the city of Bedford VA into the county of Bedford
    assert merged[(merged['2004_new_total'] != merged['2004_qc_total']) & (~merged['2004_qc_total'].isnull())].shape[0] == 2
    assert merged[(merged['2008_new_total'] != merged['2008_qc_total']) & (~merged['2004_qc_total'].isnull())].shape[0] == 2
    assert merged[(merged['2012_new_total'] != merged['2012_qc_total']) & (~merged['2004_qc_total'].isnull())].shape[0] == 2

    # 2016 total votes != the sum of candidate votes for Gila AZ, Graham AZ, Greenlee AZ and Keya Paha NE
    # Not immediately clear why this is; we'll stick with using the candidate votes to display our results
    # we're looking for 5 mismatches here - Bedford VA was merged back into the county in 2016, so it's
    # just the 4 mismatches, plus Kansas City --> Jackson MO
    assert merged[(merged['2016_new_total'] != merged['2016_qc_total']) & (~merged['2004_qc_total'].isnull())].shape[0] == 5

    # we'll have two rows we can't join to our source data- combined AK, and the Oglala Lakota County in SD,
    # which has a new / changed FIPS code
    assert merged[
                (merged['2004_qc_total'].isnull()) |
                (merged['2008_qc_total'].isnull()) |
                (merged['2012_qc_total'].isnull()) |
                (merged['2016_qc_total'].isnull())
            ].shape[0] == 2

    # finally, check counties we dropped from the source data
    # should be 41 AK districts, plus Kansas City, Bedford VA and Oglala Lakota
    pivoted = pivoted.reset_index()
    joined_back_to_source = pd.merge(pivoted, df, on='FIPS', how='left')

    # look for anything in the AK FIPS code range. AK state FIPS is 02, which gets truncated to 2,
    # with the remaining 3 digits for county / district FIPS
    assert joined_back_to_source[
                (joined_back_to_source.census_name.isnull()) &
                (joined_back_to_source.FIPS > 2000) &
                (joined_back_to_source.FIPS < 3000)
            ].shape[0] == 41

    # KC with that weird 36000 FIPS code
    # Oglala Lakota county FIPS change
    # Bedford VA merging back to the larger surrounding county
    expected_missing_fips = [36000, 46113, 51515]
    assert joined_back_to_source[
                (joined_back_to_source.census_name.isnull()) &
                (joined_back_to_source.FIPS.isin(expected_missing_fips) )
            ].shape[0] == 3


## write_final_table.py
import pandas as pd

import format_election_results
from format_population_data import build_pop_df
from format_voter_turnout import build_turnout_df
from qc_checks import qc_county_results


county_source_df = format_election_results.clean_source_data()
county_results_df = format_election_results.build_results_df(county_source_df)
county_pop_df = build_pop_df()
county_land_area_df = format_election_results.build_land_area_df()

# join to population data and write to CSV
final_county_df = pd.merge(county_results_df, county_pop_df, on='FIPS') \
                    .merge(county_land_area_df, on='FIPS')
format_election_results.write_csv(final_county_df, 'historical_election_data_by_county.csv')

# start building out state results up from county data
state_source_df = county_source_df.copy()
state_pop_df = county_pop_df.copy()
state_land_area_df = county_land_area_df.copy()

# build state FIPS codes by zfilling integer values to 5, then taking first two digits
state_source_df.FIPS = state_source_df.FIPS.astype(str).str.zfill(5).str[0:2].astype(int)
state_pop_df.FIPS = state_pop_df.FIPS.astype(str).str.zfill(5).str[0:2].astype(int)
state_land_area_df.FIPS = state_land_area_df.FIPS.astype(str).str.zfill(5).str[0:2].astype(int)

# group by state, summing county population for our target years
state_pop_df = state_pop_df.groupby('FIPS').sum()[state_pop_df.columns[1:]]
state_land_area_df = state_land_area_df.groupby('FIPS').sum()['land_area_km2'].reset_index()

# use state for the name instead of county
state_source_df.name = state_source_df.state

# group and sum results, assign winners
state_results_df = format_election_results.build_results_df(state_source_df)

# combine with population data
state_with_pop_and_land = pd.merge(state_results_df, state_pop_df, on='FIPS') \
                            .merge(state_land_area_df, on='FIPS')

# add voter turnout data
voter_turnout_df = build_turnout_df()
final_state_df = pd.merge(state_with_pop_and_land, voter_turnout_df, on='name')

format_election_results.write_csv(final_state_df, 'historical_election_data_by_state.csv', is_state=True)

# run QC on the county table
qc_county_results()
	import pandas as pd
	import geopandas as gpd


	def clean_source_data():

	df = pd.read_csv('countypres_2000-2016.csv')

	# remove year 2000 election results
	df = df[df.year != 2000]

	# rename state_po --> state_abbrev, and county --> name
	# (better to standardize for county vs. state tilesets)
	df = df.rename(columns={'state_po': 'state_abbrev', 'county': 'name'})

	# remove any rows that don't have a FIPS code- three such entities:
	# 'Statewide writein' (CT) 'Maine UOCAVA' (ME), 'Federal Precinct' (RI)
	# total of 78,759 votes across 2004 - 2016 elections
	df = df[~df.FIPS.isnull()]

	# make sure we read FIPS as an integer- not float
	df['FIPS'] = df.FIPS.astype(int)

	# correct the FIPS code for Oglala Lakota County, SD
	# should be 46102, not 46113
	df.loc[df['FIPS'] == 46113, 'FIPS'] = 46102

	# fix Bedford, VA issue- it was an independent city (with FIPS 51515)
	# from 2004 - 2012; merged back in to county 51019 in 2016
	# as our shapes are from 2016, we'll tabulate everything in 51019
	df.loc[df['FIPS'] == 51515, 'FIPS'] = 51019

	# the city of Kansas City, MO partially covers multiple counties, with the city
	# itself reporting election results separate from any of the counties it touches.
	# it appears that the NYT adds the Kansas City results to Jackson County MO, the
	# county it covers most thoroughly. We'll do that as well.
	# (Change weird 'Kansas City' FIPS to Jackson County MO FIPS
	df.loc[df['FIPS'] == 36000, 'FIPS'] = 29095
	df.loc[df['FIPS'] == 29095, 'name'] = 'Jackson'

	# Alaska reports their election results by State Rep district rather than by county
	# Given redistricting after the 2012 election, these shapes aren't consistent for
	# our time period. We'll report AK results only at the state level, just as the NYT does
	df.loc[df['state'] == 'Alaska', 'FIPS'] = 2000
	df.loc[df['state'] == 'Alaska', 'name'] = 'Alaska'

	# Alaska rows for District 99 (overseas / abroad) have a null value
	# for state_abbrev- fix this as this is something we want in the final output
	df.loc[df['state'] == 'Alaska', 'state_abbrev'] = 'AK'

	# clean up / abbreviate party names
	df.loc[df['party'].isnull(), 'party'] = 'other'
	df['party'] = df.party.str[0:3]

	# set all candidates votes to 0 if previously null
	df.loc[df['candidatevotes'].isnull(), 'candidatevotes'] = 0

	return df.copy()


	def build_land_area_df():

	census_county_src = gpd.read_file('tiger_source/tl_2019_us_county.shp')
	census_county_src['FIPS'] = census_county_src.GEOID.astype(int)

	# remove all territories / Puerto Rico, etc
	census_county_src = census_county_src[census_county_src.FIPS < 57000]

	# treat all of AK as the same FIPS code, merge Kalawao County, HI
	census_county_src.loc[census_county_src['STATEFP'] == '02', 'FIPS'] = 2000
	census_county_src.loc[census_county_src['FIPS'] == 15005, 'FIPS'] = 15009

	# redenominate land_area from m2 --> km2
	census_county_src.ALAND = census_county_src.ALAND / 1e6

	# group and sum to combine AK data into one
	census_county_src = census_county_src.groupby('FIPS').sum()['ALAND'].reset_index()
	census_county_src.columns = ['FIPS', 'land_area_km2']

	return census_county_src


	def build_results_df(df):

	df = df.copy()

	# turn year into a string so we can use it in our field names
	df['year'] = df.year.astype('str')

	df = df.groupby(['FIPS', 'name', 'state_abbrev', 'year', 'party']) \
	.sum()['candidatevotes'].reset_index()

	# find the winning vote total for each FIPS / year combination
	max_votes = df.groupby(['FIPS', 'year']).max()['candidatevotes'].reset_index()
	max_votes.columns = ['FIPS', 'year', 'max_votes']

	# join max_votes for each year back to our main DF
	# then use this to identify the winner for each FIPS / year
	df = pd.merge(df, max_votes, on=['FIPS', 'year'])
	df.loc[df.candidatevotes == df.max_votes, 'winner'] = df['party']

	# store winning party as a numeric value so we can properly group it
	party_dict = {'dem': 0, 'oth': 1, 'rep': 2}
	df['winner_numeric'] = df['winner'].map(party_dict)

	# find second place votes to tabulate margin of victory
	second_highest_votes = df[df.winner.isnull()].groupby(['FIPS', 'year']) \
	.max()['candidatevotes'].reset_index()
	second_highest_votes.columns = ['FIPS', 'year', 'second_highest_votes']
	df = pd.merge(df, second_highest_votes, on=['FIPS', 'year'])

	# save this summary df of party winners, max votes
	# and runner up votes to ultimately join back to
	summary_df = df[~df.winner.isnull()].copy()
	summary_df = summary_df[['FIPS', 'year', 'max_votes', 'winner_numeric', 'second_highest_votes']]

	# rename the 'party' columns to be {party}_count to help with column naming later
	df['party'] = df['party'] + '_count'

	# start by grouping by FIPS / year to tabulate winners / percentages
	by_year = df.pivot_table(index=['FIPS', 'year'], columns=['party'], values='candidatevotes') \
	.reset_index()

	# total all votes for each election
	by_year['total_votes'] = by_year['dem_count'] + by_year['rep_count'] + by_year['oth_count']

	# join back to summary stats
	by_year = pd.merge(by_year, summary_df, on=['FIPS', 'year'])

	# margin of victory based on https://ballotpedia.org/Margin-of-victory_(MOV)
	by_year['mov_count'] = by_year.max_votes - by_year.second_highest_votes
	by_year['mov_pct'] = (( by_year.max_votes / by_year.total_votes ) - \
	( by_year.second_highest_votes / by_year.total_votes )) * 100

	# calculate percentages for each party
	by_year['dem_pct'] = (by_year.dem_count / by_year.total_votes) * 100
	by_year['rep_pct'] = (by_year.rep_count / by_year.total_votes) * 100
	by_year['oth_pct'] = (by_year.oth_count / by_year.total_votes) * 100

	# pivot our data to our desired format- group by FIPS and year with many result columns
	value_list = ['dem_count', 'rep_count', 'oth_count',
	'dem_pct', 'rep_pct', 'oth_pct', 'total_votes',
	'winner_numeric', 'mov_count', 'mov_pct']
	grouped = by_year.pivot_table(index=['FIPS'], columns=['year'], values=value_list)

	# flatten our index to generate reasonable column names (2004_dem, 2004_rep, etc)
	grouped.columns = ['_'.join([col[-1], col[0]]) for col in grouped.columns.values]

	# map back to our actual party labels
	party_dict_inverse = {v: k for k, v in party_dict.items()}
	grouped['2004_winner'] = grouped['2004_winner_numeric'].map(party_dict_inverse)
	grouped['2008_winner'] = grouped['2008_winner_numeric'].map(party_dict_inverse)
	grouped['2012_winner'] = grouped['2012_winner_numeric'].map(party_dict_inverse)
	grouped['2016_winner'] = grouped['2016_winner_numeric'].map(party_dict_inverse)

	# add back in county_name and state
	name_df = df.groupby(['FIPS', 'name', 'state_abbrev']).size().reset_index()
	del name_df[0]

	grouped = grouped.merge(name_df, on='FIPS')

	return grouped


	def write_csv(df, csv_name, is_state=False):

	# rename to avoid QGIS conflict
	df = df.rename(columns={'name': 'census_name'})

	field_list = ['FIPS', 'census_name', 'state_abbrev', 'land_area_km2']

	for year in range(2004, 2020, 4):
	template_list = ['{}_winner', '{}_mov_count', '{}_mov_pct',
	'{}_dem_count', '{}_dem_pct', '{}_rep_count',
	'{}_rep_pct', '{}_oth_count', '{}_oth_pct',
	'{}_total_votes', '{}_pop_count']

	if is_state:
	template_list += ['{}_voter_turnout']

	field_list += [x.format(year) for x in template_list]

	df = df[field_list]

	df.to_csv(csv_name, header=True, index=False, float_format='%.1f')
	import pandas as pd


	def build_pop_df():

	# pull intercensal estimates from here:
	# https://www2.census.gov/programs-surveys/popest/datasets/2000-2010/intercensal/county/co-est00int-tot.csv
	y2000_2010_df = pd.read_csv('census_pop/co-est00int-tot.csv', encoding='latin')

	# and current estimates from here:
	# https://www2.census.gov/programs-surveys/popest/datasets/2010-2018/counties/totals/co-est2018-alldata.csv
	y2010_2018_df = pd.read_csv('census_pop/co-est2018-alldata.csv', encoding='latin')

	# remove all state level entries from these tables
	y2000_2010_df = y2000_2010_df[y2000_2010_df.COUNTY != 0]
	y2010_2018_df = y2010_2018_df[y2010_2018_df.COUNTY != 0]

	# calculate FIPS, zfilling as necessary
	y2000_2010_df['FIPS'] = (y2000_2010_df.STATE.astype(str).str.zfill(2) +
	y2000_2010_df.COUNTY.astype(str).str.zfill(3)).astype(int)

	y2010_2018_df['FIPS'] = (y2010_2018_df.STATE.astype(str).str.zfill(2) +
	y2010_2018_df.COUNTY.astype(str).str.zfill(3)).astype(int)

	# treat AK as one FIPS code
	y2000_2010_df.loc[y2000_2010_df['STATE'] == 2, 'FIPS'] = 2000
	y2010_2018_df.loc[y2010_2018_df['STATE'] == 2, 'FIPS'] = 2000

	# merge Kalawao County with Maui county- election data only has Maui
	y2000_2010_df.loc[y2000_2010_df['FIPS'] == 15005, 'FIPS'] = 15009
	y2010_2018_df.loc[y2010_2018_df['FIPS'] == 15005, 'FIPS'] = 15009

	# fix Oglala Lakota County, SD and Bedford, VA
	y2000_2010_df.loc[y2000_2010_df['FIPS'] == 51515, 'FIPS'] = 51019
	y2000_2010_df.loc[y2000_2010_df['FIPS'] == 46113, 'FIPS'] = 46102

	# group and sum our population columns of interest after fixing FIPS issues
	y2000_2010_df = y2000_2010_df.groupby('FIPS').sum()[['POPESTIMATE2004', 'POPESTIMATE2008']]
	y2010_2018_df = y2010_2018_df.groupby('FIPS').sum()[['POPESTIMATE2012', 'POPESTIMATE2016']]

	merged = pd.merge(y2000_2010_df, y2010_2018_df, on='FIPS').reset_index()
	merged.columns = ['FIPS', '2004_pop_count', '2008_pop_count', '2012_pop_count', '2016_pop_count']

	return merged
	import pandas as pd


	def build_turnout_df():

	df_1980_2014 = pd.read_csv('voter_turnout_data/1980_to_2014.csv')
	df_2016 = pd.read_csv('voter_turnout_data/2016.csv').reset_index()

	df_1980_2014 = df_1980_2014[df_1980_2014.Year.isin([2004, 2008, 2012])]
	df_1980_2014 = df_1980_2014[['Year', 'Unnamed: 3', 'VEP Highest Office']]
	df_1980_2014.columns = ['year', 'name', 'voter_turnout_pct']

	df_2016['Year'] = 2016
	df_2016 = df_2016[['Year', 'index', 'VEP Highest Office']]
	df_2016.columns = ['year', 'name', 'voter_turnout_pct']

	df = pd.concat([df_1980_2014, df_2016])

	# remove all US results and note in 2016 data
	df = df[~(df.name == 'United States')]
	df = df[~df.voter_turnout_pct.isnull()]

	# add proper column name to year column before pivoting
	df['year'] = df['year'].astype(str) + '_voter_turnout'

	# convert voter_turnout_pct to float
	df.voter_turnout_pct = df.voter_turnout_pct.str.rstrip('%').astype(float)

	return df.pivot_table(index='name', columns='year', values='voter_turnout_pct').reset_index()
	import pandas as pd


	def qc_county_results():

	print("QCing county output")

	df = pd.read_csv('historical_election_data_by_county.csv')

	# confirm that all our percentages add to ~100
	df['2004_total_pct'] = df['2004_dem_pct'] + df['2004_rep_pct'] + df['2004_oth_pct']
	df['2008_total_pct'] = df['2008_dem_pct'] + df['2008_rep_pct'] + df['2008_oth_pct']
	df['2012_total_pct'] = df['2012_dem_pct'] + df['2012_rep_pct'] + df['2012_oth_pct']
	df['2016_total_pct'] = df['2016_dem_pct'] + df['2016_rep_pct'] + df['2016_oth_pct']

	assert df[( df['2004_total_pct'] <= 99.8 ) \| ( df['2004_total_pct'] >= 100.2 ) ].empty
	assert df[( df['2008_total_pct'] <= 99.8 ) \| ( df['2008_total_pct'] >= 100.2 ) ].empty
	assert df[( df['2012_total_pct'] <= 99.8 ) \| ( df['2012_total_pct'] >= 100.2 ) ].empty
	assert df[( df['2016_total_pct'] <= 99.8 ) \| ( df['2016_total_pct'] >= 100.2 ) ].empty

	# expecting the normal 3142 US / state counties
	# minus 29 AK counties
	# minus 1 tiny HI county
	# plus 1 record for all of AK
	assert df.shape[0] == 3113

	# add votes by county compare to total votes in source df
	source_df = pd.read_csv('countypres_2000-2016.csv')
	source_df = source_df[source_df.year != 2000]

	# group to get only total votes, year and FIPS (for some reason total votes in this normalized form)
	source_df_g = source_df.groupby(['year', 'FIPS', 'totalvotes']).size().reset_index()

	# pivot to match our data format
	pivoted = source_df_g.pivot_table(index=['FIPS'], columns='year', values='totalvotes')
	pivoted.columns = ['2004_qc_total', '2008_qc_total', '2012_qc_total', '2016_qc_total']

	merged = df.join(pivoted, on='FIPS')
	merged['2004_new_total'] = merged['2004_dem_count'] + merged['2004_rep_count'] + merged['2004_oth_count']
	merged['2008_new_total'] = merged['2008_dem_count'] + merged['2008_rep_count'] + merged['2008_oth_count']
	merged['2012_new_total'] = merged['2012_dem_count'] + merged['2012_rep_count'] + merged['2012_oth_count']
	merged['2016_new_total'] = merged['2016_dem_count'] + merged['2016_rep_count'] + merged['2016_oth_count']

	# we expect two mismatches here due to county combinations- moving Kansas City MO into Jackson MO,
	# and the city of Bedford VA into the county of Bedford
	assert merged[(merged['2004_new_total'] != merged['2004_qc_total']) & (~merged['2004_qc_total'].isnull())].shape[0] == 2
	assert merged[(merged['2008_new_total'] != merged['2008_qc_total']) & (~merged['2004_qc_total'].isnull())].shape[0] == 2
	assert merged[(merged['2012_new_total'] != merged['2012_qc_total']) & (~merged['2004_qc_total'].isnull())].shape[0] == 2

	# 2016 total votes != the sum of candidate votes for Gila AZ, Graham AZ, Greenlee AZ and Keya Paha NE
	# Not immediately clear why this is; we'll stick with using the candidate votes to display our results
	# we're looking for 5 mismatches here - Bedford VA was merged back into the county in 2016, so it's
	# just the 4 mismatches, plus Kansas City --> Jackson MO
	assert merged[(merged['2016_new_total'] != merged['2016_qc_total']) & (~merged['2004_qc_total'].isnull())].shape[0] == 5

	# we'll have two rows we can't join to our source data- combined AK, and the Oglala Lakota County in SD,
	# which has a new / changed FIPS code
	assert merged[
	(merged['2004_qc_total'].isnull()) \|
	(merged['2008_qc_total'].isnull()) \|
	(merged['2012_qc_total'].isnull()) \|
	(merged['2016_qc_total'].isnull())
	].shape[0] == 2

	# finally, check counties we dropped from the source data
	# should be 41 AK districts, plus Kansas City, Bedford VA and Oglala Lakota
	pivoted = pivoted.reset_index()
	joined_back_to_source = pd.merge(pivoted, df, on='FIPS', how='left')

	# look for anything in the AK FIPS code range. AK state FIPS is 02, which gets truncated to 2,
	# with the remaining 3 digits for county / district FIPS
	assert joined_back_to_source[
	(joined_back_to_source.census_name.isnull()) &
	(joined_back_to_source.FIPS > 2000) &
	(joined_back_to_source.FIPS < 3000)
	].shape[0] == 41

	# KC with that weird 36000 FIPS code
	# Oglala Lakota county FIPS change
	# Bedford VA merging back to the larger surrounding county
	expected_missing_fips = [36000, 46113, 51515]
	assert joined_back_to_source[
	(joined_back_to_source.census_name.isnull()) &
	(joined_back_to_source.FIPS.isin(expected_missing_fips) )
	].shape[0] == 3
	import pandas as pd

	import format_election_results
	from format_population_data import build_pop_df
	from format_voter_turnout import build_turnout_df
	from qc_checks import qc_county_results


	county_source_df = format_election_results.clean_source_data()
	county_results_df = format_election_results.build_results_df(county_source_df)
	county_pop_df = build_pop_df()
	county_land_area_df = format_election_results.build_land_area_df()

	# join to population data and write to CSV
	final_county_df = pd.merge(county_results_df, county_pop_df, on='FIPS') \
	.merge(county_land_area_df, on='FIPS')
	format_election_results.write_csv(final_county_df, 'historical_election_data_by_county.csv')

	# start building out state results up from county data
	state_source_df = county_source_df.copy()
	state_pop_df = county_pop_df.copy()
	state_land_area_df = county_land_area_df.copy()

	# build state FIPS codes by zfilling integer values to 5, then taking first two digits
	state_source_df.FIPS = state_source_df.FIPS.astype(str).str.zfill(5).str[0:2].astype(int)
	state_pop_df.FIPS = state_pop_df.FIPS.astype(str).str.zfill(5).str[0:2].astype(int)
	state_land_area_df.FIPS = state_land_area_df.FIPS.astype(str).str.zfill(5).str[0:2].astype(int)

	# group by state, summing county population for our target years
	state_pop_df = state_pop_df.groupby('FIPS').sum()[state_pop_df.columns[1:]]
	state_land_area_df = state_land_area_df.groupby('FIPS').sum()['land_area_km2'].reset_index()

	# use state for the name instead of county
	state_source_df.name = state_source_df.state

	# group and sum results, assign winners
	state_results_df = format_election_results.build_results_df(state_source_df)

	# combine with population data
	state_with_pop_and_land = pd.merge(state_results_df, state_pop_df, on='FIPS') \
	.merge(state_land_area_df, on='FIPS')

	# add voter turnout data
	voter_turnout_df = build_turnout_df()
	final_state_df = pd.merge(state_with_pop_and_land, voter_turnout_df, on='name')

	format_election_results.write_csv(final_state_df, 'historical_election_data_by_state.csv', is_state=True)

	# run QC on the county table
	qc_county_results()