benzipperer/clean_qcew.R

## clean_qcew.R
library(tidyverse)
library(haven)
library(data.table)


clean_qcew_annual <- function(x) {
  # QCEW data from https://www.bls.gov/cew/downloadable-data-files.htm
  fread(cmd = paste0("unzip -p ", x, "_annual_singlefile.zip")) %>%
    # keep only county-level private sector
    filter(agglvl_code == 71 & own_code == 5) %>%
    # drop PR and VI
    filter(as.numeric(str_sub(area_fips, 1, 2)) < 72) %>%
    transmute(
      fips = as.numeric(area_fips),
      year,
      qcew_pay_avg = avg_annual_pay,
      qcew_emp_avg = annual_avg_emplvl
    )
}

map_dfr(2013:2019, clean_qcew_annual) %>%
  write_dta("qcew_2013_2019.dta")

## cpi.csv

          
            year
            cpi_u_x1
            cpi_u
            cpi_u_core
            cpiurs
            cpiurs_core

            
              1947
              24.200001
              22.299999
              
              37.4

            
              1948
              26.200001
              24
              
              40.5

            
              1949
              25.9
              23.799999
              
              40.1

            
              1950
              26.200001
              24.1
              
              40.5

            
              1951
              28.299999
              26
              
              43.8

            
              1952
              28.799999
              26.6
              
              44.5

            
              1953
              29
              26.799999
              
              44.90000000000001

            
              1954
              29.200001
              26.799999
              
              45.2

            
              1955
              29.1
              26.799999
              
              45

            
              1956
              29.6
              27.200001
              
              45.8

            
              1957
              30.5
              28.1
              28.9
              47.2

            
              1958
              31.4
              28.9
              29.6
              48.6

            
              1959
              31.6
              29.1
              30.2
              48.90000000000001

            
              1960
              32.200001
              29.6
              30.6
              49.8

            
              1961
              32.5
              29.9
              31
              50.3

            
              1962
              32.799999
              30.200001
              31.4
              50.7

            
              1963
              33.299999
              30.6
              31.8
              51.5

            
              1964
              33.700001
              31
              32.3
              52.1

            
              1965
              34.200001
              31.5
              32.7
              52.90000000000001

            
              1966
              35.200001
              32.5
              33.5
              54.40000000000001

            
              1967
              36.299999
              33.400002
              34.7
              56.1

            
              1968
              37.700001
              34.799999
              36.3
              58.3

            
              1969
              39.400002
              36.700001
              38.4
              60.90000000000001

            
              1970
              41.299999
              38.799999
              40.8
              63.90000000000001

            
              1971
              43.099998
              40.5
              42.7
              66.7

            
              1972
              44.400002
              41.799999
              44
              68.7

            
              1973
              47.200001
              44.400002
              45.6
              73

            
              1974
              51.900002
              49.299999
              49.3
              80.30000000000001

            
              1975
              56.200001
              53.799999
              53.9
              86.90000000000001

            
              1976
              59.400002
              56.900002
              57.4
              91.90000000000001

            
              1977
              63.200001
              60.599998
              61
              97.7

            
              1978
              67.5
              65.199997
              65.5
              104.4
              103.6

            
              1979
              74
              72.599998
              71.90000000000001
              114.3
              111

            
              1980
              82.300003
              82.400002
              80.8
              127.1
              120.9

            
              1981
              90.099998
              90.900002
              89.2
              139.1
              132.2

            
              1982
              95.599998
              96.5
              95.8
              147.5
              142.4

            
              1983
              
              99.599998
              99.59999999999999
              153.8
              150.4

            
              1984
              
              103.9
              104.5
              160.2
              157.9

            
              1985
              
              107.6
              109.1
              165.7
              164.8

            
              1986
              
              109.6
              113.5
              168.6
              171.4

            
              1987
              
              113.6
              118.2
              174.4
              178.1

            
              1988
              
              118.3
              123.4
              180.7
              185.2

            
              1989
              
              124
              129
              188.6
              192.6

            
              1990
              
              130.7
              135.5
              197.9
              201.4

            
              1991
              
              136.2
              142.1
              205.1
              209.9

            
              1992
              
              140.3
              147.3
              210.2
              216.4

            
              1993
              
              144.5
              152.2
              215.5
              222.5

            
              1994
              
              148.2
              156.5
              220
              227.7

            
              1995
              
              152.39999
              161.2
              225.3
              233.4

            
              1996
              
              156.8
              165.6
              231.3
              239.1

            
              1997
              
              160.5
              169.5
              236.3
              244.4

            
              1998
              
              163
              173.4
              239.5
              249.6

            
              1999
              
              166.60001
              177
              244.6
              254.6

            
              2000
              
              172.2
              181.3
              252.9
              260.9

            
              2001
              
              177.10001
              186.1
              260.1
              267.9

            
              2002
              
              179.89999
              190.4
              264.2
              274.1

            
              2003
              
              184
              193.2
              270.2
              278.1

            
              2004
              
              188.89999
              196.6
              277.5
              283.1

            
              2005
              
              195.3
              200.9
              286.9
              289.2

            
              2006
              
              201.60001
              205.9
              296.2
              296.5

            
              2007
              
              207.3
              210.7
              304.6
              303.4

            
              2008
              
              215.3
              215.6
              316.3
              310.3

            
              2009
              
              214.5
              219.2
              315.2
              315.6

            
              2010
              
              218.10001
              221.3
              320.4
              318.7

            
              2011
              
              224.89999
              225
              330.5
              324

            
              2012
              
              229.60001
              229.8
              337.5
              331

            
              2013
              
              233
              233.8
              342.5
              337

            
              2014
              
              236.7
              237.9
              348.3
              343.1

            
              2015
              
              237
              242.2
              348.9
              349.7

            
              2016
              
              240
              247.6
              353.4
              357.5

            
              2017
              
              245.10001
              252.2
              361
              364.2

            
              2018
              
              251.10001
              257.6
              369.8
              372

            
              2019
              
              255.7
              263.2
              376.5
              380.2

## master.do
set more off
clear all

* cpi data
import delimited cpi.csv, clear
keep year cpiurs
keep if cpiurs != .
sum cpiurs if year == 2019
local cpiurs2019 = r(mean)
tempfile cpiurs
save `cpiurs'

* clean QCEW data
global Rterm_path "/usr/bin/R"
* use R script to convert raw QCEW to Stata format,
* because Stata is maddeningly slow
* rsource using clean_qcew.R, roptions(--no-save)

* deal with some QCEW county issues
use qcew_2013_2019.dta, clear
tostring fips, gen(fips_string) format(%05.0f)

* problem: won't be able to match "Unknown Or Undefined" QCEW counties to electoral returns
* solution: drop them from QCEW data
drop if substr(fips_string, 3, 5) == "999"

* problem:  Bedford city (51515) added to Bedford County (FIPS 51019) in 2013
* solution: drop entries for Bedford City in 2013
drop if fips == 51515 & year == 2013

* problem: mid-year Shannon County (46113) renamed to Oglala Lakota County (46102) in mid-2015
* solution: drop the QCEW entries for 2015 Shannon, keep 2015 Oglala
*           then rename Oglala to Shannon to match electoral returns
drop if fips == 46113 & year == 2015
replace fips = 46113 if fips == 46102

* problem: can't match many Alaska returns and QCEW data
* solution: drop Alaska
drop if substr(fips_string, 1, 2) == "02"

drop fips_string
tempfile qcewdata
save `qcewdata'

* clean electoral data
* from https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/VOQCHQ
import delimited countypres_2000-2016.csv, clear
replace candidatevotes = "" if candidatevotes == "NA"
destring candidatevotes, replace
replace fips = "" if fips == "NA"
destring fips, replace
encode candidate, gen(candidate_code)
rename state_po state_abb
rename county county_name
rename totalvotes total_votes
rename candidatevotes candidate_votes
keep year state_abb fips candidate_code candidate_votes total_votes county_name

* deal with electoral county issues
* remove entries with no fips
drop if fips == .

* problem:  Kansas city entry in electoral (FIPS 36000) not a county in QCEW
*           weird that it has FIPS of 36000 when 36 is the FIPS code for NY
* solution: going to drop it because there is no clean way to allocate
*           Kansas City votes to other counties
drop if fips == 36000

* deal with mismatches
tostring fips, gen(fips_string) format(%05.0f)
* problem: can't match many Alaska returns and QCEW data
* solution: drop Alaska
drop if substr(fips_string, 1, 2) == "02"
drop fips_string

* problem:  Bedford city (51515) added to Bedford County (51019) in 2013
*           but returns have both
* solution: add all of Bedford city returns to Bedford County
gen new_fips = fips
replace new_fips = 51019 if fips == 51515
assert county_name == "Bedford" if new_fips == 51019
collapse (sum) candidate_votes total_votes (first) state_abb county_name, by(new_fips year candidate_code)
rename new_fips fips

* create Obama => Obama => Trump indicator
keep if year >= 2008
gen vote_share = candidate_votes/total_votes
egen rank = rank(vote_share), field by(year fips)
* check to make sure there is really only one winner in year X fips
duplicates report year fips if rank == 1
assert r(unique_value) == r(N)
* determine who won county
gen obama = candidate_code == 2 & rank == 1
gen trump = candidate_code == 3 & rank == 1
gen mccain = candidate_code == 7 & rank == 1
gen romney = candidate_code == 8 & rank == 1
gen clinton = candidate_code == 5 & rank == 1
collapse (max) obama trump mccain romney clinton (first) county_name total_votes, by(year state_abb fips)

* identify swing counties
reshape wide obama trump mccain romney clinton total_votes, i(fips) j(year)
gen pivot_to_trump = obama2008 == 1 & obama2012 == 1 & trump2016 == 1
gen pivot_to_clinton = mccain2008 == 1 & romney2012 == 1 & clinton2016 == 1
tab pivot_to_trump pivot_to_clinton, m
egen mean_total_votes = rowmean(total_votes*)
keep fips county_name state_abb pivot* mean_total_votes trump2016 clinton2016

* merge QCEW data
merge 1:m fips using `qcewdata', assert(3) nogenerate
* confirm this is a balanced panel
tsset fips year
assert r(balanced) == "strongly balanced"

* some counties have no QCEW employment; drop counties with any zeros
tab fips, nofreq
local before_count = r(r)
tab fips if pivot_to_trump == 1, nofreq
local before_count_pivot = r(r)
gen _zeroemp = qcew_emp_avg == 0
egen zeroemp = max(_zeroemp), by(fips)
drop if zeroemp == 1
drop *zeroemp
tab fips, nofreq
local after_count = r(r)
tab fips if pivot_to_trump == 1, nofreq
local after_count_pivot = r(r)
di "Removing zero emp counties dropped"
di `before_count_pivot' - `after_count_pivot' " pivot-to-Trump counties"
di `before_count' - `after_count' " counties overall"

merge m:1 year using `cpiurs', keep(3) nogenerate
gen real_avg_pay = qcew_pay_avg * `cpiurs2019' / cpiurs
rename qcew_pay_avg nom_avg_pay
rename qcew_emp_avg emp
drop cpiurs

keep if year == 2013 | year == 2016 | year == 2019
egen period = group(year)
lab def period 2 "2013 - 2016" 3 "2016 - 2019"
lab val period period
tsset fips period
foreach var of varlist *avg_pay emp {
    gen Dpct_`var' = `var' / L.`var' - 1
}

* analysis of pivot vs others
keep if period > 1
keep fips period Dpct* mean_total_votes pivot_to_trump
reshape wide Dpct_real_avg_pay Dpct_nom_avg_pay Dpct_emp, i(fips) j(period)

foreach x in real_avg_pay nom_avg_pay emp {
    gen faster_`x' = Dpct_`x'3 > Dpct_`x'2
}

table pivot_to_trump, c(mean faster_nom_avg_pay mean faster_real_avg_pay)

collapse (mean) Dpct_real_avg_pay2 Dpct_real_avg_pay3 Dpct_nom_avg_pay2 Dpct_nom_avg_pay3 [aw=mean_total_votes], by(pivot_to_trump)
foreach x of varlist Dpct* {
    gen `x'_string = string(round(`x' * 100, 0.1), "%3.1f") + "%"
    drop `x'
    rename `x'_string `x'
}
lab var Dpct_real_avg_pay2 "2013-2016"
lab var Dpct_real_avg_pay3 "2016-2019"
lab var Dpct_nom_avg_pay2 "2013-2016"
lab var Dpct_nom_avg_pay3 "2016-2019"
lab def pivot_to_trump 0 "Other counties" 1 "Pivot counties"
lab val pivot_to_trump pivot_to_trump

list

keep pivot_to_trump Dpct_real_avg_pay*
export excel using average_growth.xlsx, firstrow(varlabels) replace
	library(tidyverse)
	library(haven)
	library(data.table)


	clean_qcew_annual <- function(x) {
	# QCEW data from https://www.bls.gov/cew/downloadable-data-files.htm
	fread(cmd = paste0("unzip -p ", x, "_annual_singlefile.zip")) %>%
	# keep only county-level private sector
	filter(agglvl_code == 71 & own_code == 5) %>%
	# drop PR and VI
	filter(as.numeric(str_sub(area_fips, 1, 2)) < 72) %>%
	transmute(
	fips = as.numeric(area_fips),
	year,
	qcew_pay_avg = avg_annual_pay,
	qcew_emp_avg = annual_avg_emplvl
	)
	}

	map_dfr(2013:2019, clean_qcew_annual) %>%
	write_dta("qcew_2013_2019.dta")
year	cpi_u_x1	cpi_u	cpi_u_core	cpiurs	cpiurs_core
1947	24.200001	22.299999		37.4
1948	26.200001	24		40.5
1949	25.9	23.799999		40.1
1950	26.200001	24.1		40.5
1951	28.299999	26		43.8
1952	28.799999	26.6		44.5
1953	29	26.799999		44.90000000000001
1954	29.200001	26.799999		45.2
1955	29.1	26.799999		45
1956	29.6	27.200001		45.8
1957	30.5	28.1	28.9	47.2
1958	31.4	28.9	29.6	48.6
1959	31.6	29.1	30.2	48.90000000000001
1960	32.200001	29.6	30.6	49.8
1961	32.5	29.9	31	50.3
1962	32.799999	30.200001	31.4	50.7
1963	33.299999	30.6	31.8	51.5
1964	33.700001	31	32.3	52.1
1965	34.200001	31.5	32.7	52.90000000000001
1966	35.200001	32.5	33.5	54.40000000000001
1967	36.299999	33.400002	34.7	56.1
1968	37.700001	34.799999	36.3	58.3
1969	39.400002	36.700001	38.4	60.90000000000001
1970	41.299999	38.799999	40.8	63.90000000000001
1971	43.099998	40.5	42.7	66.7
1972	44.400002	41.799999	44	68.7
1973	47.200001	44.400002	45.6	73
1974	51.900002	49.299999	49.3	80.30000000000001
1975	56.200001	53.799999	53.9	86.90000000000001
1976	59.400002	56.900002	57.4	91.90000000000001
1977	63.200001	60.599998	61	97.7
1978	67.5	65.199997	65.5	104.4	103.6
1979	74	72.599998	71.90000000000001	114.3	111
1980	82.300003	82.400002	80.8	127.1	120.9
1981	90.099998	90.900002	89.2	139.1	132.2
1982	95.599998	96.5	95.8	147.5	142.4
1983		99.599998	99.59999999999999	153.8	150.4
1984		103.9	104.5	160.2	157.9
1985		107.6	109.1	165.7	164.8
1986		109.6	113.5	168.6	171.4
1987		113.6	118.2	174.4	178.1
1988		118.3	123.4	180.7	185.2
1989		124	129	188.6	192.6
1990		130.7	135.5	197.9	201.4
1991		136.2	142.1	205.1	209.9
1992		140.3	147.3	210.2	216.4
1993		144.5	152.2	215.5	222.5
1994		148.2	156.5	220	227.7
1995		152.39999	161.2	225.3	233.4
1996		156.8	165.6	231.3	239.1
1997		160.5	169.5	236.3	244.4
1998		163	173.4	239.5	249.6
1999		166.60001	177	244.6	254.6
2000		172.2	181.3	252.9	260.9
2001		177.10001	186.1	260.1	267.9
2002		179.89999	190.4	264.2	274.1
2003		184	193.2	270.2	278.1
2004		188.89999	196.6	277.5	283.1
2005		195.3	200.9	286.9	289.2
2006		201.60001	205.9	296.2	296.5
2007		207.3	210.7	304.6	303.4
2008		215.3	215.6	316.3	310.3
2009		214.5	219.2	315.2	315.6
2010		218.10001	221.3	320.4	318.7
2011		224.89999	225	330.5	324
2012		229.60001	229.8	337.5	331
2013		233	233.8	342.5	337
2014		236.7	237.9	348.3	343.1
2015		237	242.2	348.9	349.7
2016		240	247.6	353.4	357.5
2017		245.10001	252.2	361	364.2
2018		251.10001	257.6	369.8	372
2019		255.7	263.2	376.5	380.2
	set more off
	clear all

	* cpi data
	import delimited cpi.csv, clear
	keep year cpiurs
	keep if cpiurs != .
	sum cpiurs if year == 2019
	local cpiurs2019 = r(mean)
	tempfile cpiurs
	save `cpiurs'

	* clean QCEW data
	global Rterm_path "/usr/bin/R"
	* use R script to convert raw QCEW to Stata format,
	* because Stata is maddeningly slow
	* rsource using clean_qcew.R, roptions(--no-save)

	* deal with some QCEW county issues
	use qcew_2013_2019.dta, clear
	tostring fips, gen(fips_string) format(%05.0f)

	* problem: won't be able to match "Unknown Or Undefined" QCEW counties to electoral returns
	* solution: drop them from QCEW data
	drop if substr(fips_string, 3, 5) == "999"

	* problem: Bedford city (51515) added to Bedford County (FIPS 51019) in 2013
	* solution: drop entries for Bedford City in 2013
	drop if fips == 51515 & year == 2013

	* problem: mid-year Shannon County (46113) renamed to Oglala Lakota County (46102) in mid-2015
	* solution: drop the QCEW entries for 2015 Shannon, keep 2015 Oglala
	* then rename Oglala to Shannon to match electoral returns
	drop if fips == 46113 & year == 2015
	replace fips = 46113 if fips == 46102

	* problem: can't match many Alaska returns and QCEW data
	* solution: drop Alaska
	drop if substr(fips_string, 1, 2) == "02"

	drop fips_string
	tempfile qcewdata
	save `qcewdata'

	* clean electoral data
	* from https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/VOQCHQ
	import delimited countypres_2000-2016.csv, clear
	replace candidatevotes = "" if candidatevotes == "NA"
	destring candidatevotes, replace
	replace fips = "" if fips == "NA"
	destring fips, replace
	encode candidate, gen(candidate_code)
	rename state_po state_abb
	rename county county_name
	rename totalvotes total_votes
	rename candidatevotes candidate_votes
	keep year state_abb fips candidate_code candidate_votes total_votes county_name

	* deal with electoral county issues
	* remove entries with no fips
	drop if fips == .

	* problem: Kansas city entry in electoral (FIPS 36000) not a county in QCEW
	* weird that it has FIPS of 36000 when 36 is the FIPS code for NY
	* solution: going to drop it because there is no clean way to allocate
	* Kansas City votes to other counties
	drop if fips == 36000

	* deal with mismatches
	tostring fips, gen(fips_string) format(%05.0f)
	* problem: can't match many Alaska returns and QCEW data
	* solution: drop Alaska
	drop if substr(fips_string, 1, 2) == "02"
	drop fips_string

	* problem: Bedford city (51515) added to Bedford County (51019) in 2013
	* but returns have both
	* solution: add all of Bedford city returns to Bedford County
	gen new_fips = fips
	replace new_fips = 51019 if fips == 51515
	assert county_name == "Bedford" if new_fips == 51019
	collapse (sum) candidate_votes total_votes (first) state_abb county_name, by(new_fips year candidate_code)
	rename new_fips fips

	* create Obama => Obama => Trump indicator
	keep if year >= 2008
	gen vote_share = candidate_votes/total_votes
	egen rank = rank(vote_share), field by(year fips)
	* check to make sure there is really only one winner in year X fips
	duplicates report year fips if rank == 1
	assert r(unique_value) == r(N)
	* determine who won county
	gen obama = candidate_code == 2 & rank == 1
	gen trump = candidate_code == 3 & rank == 1
	gen mccain = candidate_code == 7 & rank == 1
	gen romney = candidate_code == 8 & rank == 1
	gen clinton = candidate_code == 5 & rank == 1
	collapse (max) obama trump mccain romney clinton (first) county_name total_votes, by(year state_abb fips)

	* identify swing counties
	reshape wide obama trump mccain romney clinton total_votes, i(fips) j(year)
	gen pivot_to_trump = obama2008 == 1 & obama2012 == 1 & trump2016 == 1
	gen pivot_to_clinton = mccain2008 == 1 & romney2012 == 1 & clinton2016 == 1
	tab pivot_to_trump pivot_to_clinton, m
	egen mean_total_votes = rowmean(total_votes*)
	keep fips county_name state_abb pivot* mean_total_votes trump2016 clinton2016

	* merge QCEW data
	merge 1:m fips using `qcewdata', assert(3) nogenerate
	* confirm this is a balanced panel
	tsset fips year
	assert r(balanced) == "strongly balanced"

	* some counties have no QCEW employment; drop counties with any zeros
	tab fips, nofreq
	local before_count = r(r)
	tab fips if pivot_to_trump == 1, nofreq
	local before_count_pivot = r(r)
	gen _zeroemp = qcew_emp_avg == 0
	egen zeroemp = max(_zeroemp), by(fips)
	drop if zeroemp == 1
	drop *zeroemp
	tab fips, nofreq
	local after_count = r(r)
	tab fips if pivot_to_trump == 1, nofreq
	local after_count_pivot = r(r)
	di "Removing zero emp counties dropped"
	di `before_count_pivot' - `after_count_pivot' " pivot-to-Trump counties"
	di `before_count' - `after_count' " counties overall"

	merge m:1 year using `cpiurs', keep(3) nogenerate
	gen real_avg_pay = qcew_pay_avg * `cpiurs2019' / cpiurs
	rename qcew_pay_avg nom_avg_pay
	rename qcew_emp_avg emp
	drop cpiurs

	keep if year == 2013 \| year == 2016 \| year == 2019
	egen period = group(year)
	lab def period 2 "2013 - 2016" 3 "2016 - 2019"
	lab val period period
	tsset fips period
	foreach var of varlist *avg_pay emp {
	gen Dpct_`var' = `var' / L.`var' - 1
	}

	* analysis of pivot vs others
	keep if period > 1
	keep fips period Dpct* mean_total_votes pivot_to_trump
	reshape wide Dpct_real_avg_pay Dpct_nom_avg_pay Dpct_emp, i(fips) j(period)

	foreach x in real_avg_pay nom_avg_pay emp {
	gen faster_`x' = Dpct_`x'3 > Dpct_`x'2
	}

	table pivot_to_trump, c(mean faster_nom_avg_pay mean faster_real_avg_pay)

	collapse (mean) Dpct_real_avg_pay2 Dpct_real_avg_pay3 Dpct_nom_avg_pay2 Dpct_nom_avg_pay3 [aw=mean_total_votes], by(pivot_to_trump)
	foreach x of varlist Dpct* {
	gen `x'_string = string(round(`x' * 100, 0.1), "%3.1f") + "%"
	drop `x'
	rename `x'_string `x'
	}
	lab var Dpct_real_avg_pay2 "2013-2016"
	lab var Dpct_real_avg_pay3 "2016-2019"
	lab var Dpct_nom_avg_pay2 "2013-2016"
	lab var Dpct_nom_avg_pay3 "2016-2019"
	lab def pivot_to_trump 0 "Other counties" 1 "Pivot counties"
	lab val pivot_to_trump pivot_to_trump

	list

	keep pivot_to_trump Dpct_real_avg_pay*
	export excel using average_growth.xlsx, firstrow(varlabels) replace