Skip to content

Instantly share code, notes, and snippets.

@benzipperer
Last active January 27, 2022 02:05
Show Gist options
  • Save benzipperer/aee8bfe40173328354f40bcb6182f825 to your computer and use it in GitHub Desktop.
Save benzipperer/aee8bfe40173328354f40bcb6182f825 to your computer and use it in GitHub Desktop.
demonstrate the effects of Census rounding wages on Atlanta Fed Wage Tracker
library(tidyverse)
library(lubridate)
library(haven)
library(assertr)
library(slider)
library(hrbrthemes)
# function to bin hourly wages of hourly workers
bin_hourly <- function(df, new, old) {
df %>%
mutate({{new}} := case_when(
# hourly workers
# assign missing for "N < 15"
{{old}} < 0.15 & paidhrly82 == 1 ~ NA_real_,
# nearest $0.10
{{old}} >= 0.15 & {{old}} < 1 & paidhrly82 == 1 ~ round({{old}}, 1),
# nearest $0.50
{{old}} >= 1 & {{old}} < 10 & paidhrly82 == 1 ~ round({{old}} * 2, 0) / 2,
# nearest $1
{{old}} >= 10 & {{old}} < 100 & paidhrly82 == 1 ~ round({{old}}, 0),
# nearest $5
{{old}}>= 100 & {{old}} < 1000 & paidhrly82 == 1 ~ round({{old}} * 1/5, 0) * 5,
# nearest $10
{{old}} >= 1000 & {{old}} < 10000 & paidhrly82 == 1 ~ round({{old}} * 1/10, 0) * 10,
# top-coded
{{old}} >= 10000 & paidhrly82 == 1 ~ NA_real_,
))
}
# function to bin weekly wages of non-hourly workers
bin_weekly <- function(df, new, old) {
df %>%
mutate({{new}} := case_when(
# assign missing for "N < 15"
{{old}} < 15 & paidhrly82 == 0 ~ NA_real_,
# nearest $10
{{old}} >= 15 & {{old}} < 100 & paidhrly82 == 0 ~ round({{old}} / 10, 0) * 10,
# nearest $50
{{old}} >= 100 & {{old}} < 1000 & paidhrly82 == 0 ~ round({{old}} / 50, 0) * 50,
# nearest $100
{{old}} >= 1000 & {{old}} < 10000 & paidhrly82 == 0 ~ round({{old}} / 100, 0) * 100,
# don't need to do higher categories since weekly wages seem top-coded at $2884.61
{{old}} >= 10000 & paidhrly82 == 0 ~ NA_real_,
))
}
# get a subset of the raw data
# https://cps.kansascityfed.org/cpsdata/full/CPS_harmonized_variable_longitudinally_matched_age16plus.dta.gz
# WARNING: importing the data this way is slow
wgt_data_raw_subset <- read_dta(
"CPS_harmonized_variable_longitudinally_matched_age16plus.dta.gz",
col_select = c(date, personid, paidhrly82, age76, matches("wage|hrs|hours|perwk"))
) %>%
mutate(year = year(date)) %>%
filter(year >= 1996 & age76 >= 16) %>%
select(year, date, personid, paidhrly82, matches("wage|hrs|hours|perwk")) %>%
# keep only wage earners
filter(!is.na(wageperhrclean82))
# prep microdata and create new binned wage
wgt_data_prepped <- wgt_data_raw_subset %>%
# these appear to be the WGT and underlying wages used by ATL Fed
rename(
wgt_original = wagegrowthtracker83,
hourly_wage = wageperhrclean82,
weekly_wage = wageperwkclean82
) %>%
mutate(wage_old = hourly_wage) %>%
# now check that ATL Fed uses something like the following to construct wage
mutate(wage_new = case_when(
# ensure ATL Fed top-code / allocated restrictions
wageperhrtopcoded89 == 1 | wageperhrallocated89 == 1 ~ NA_real_,
# paid hourly
paidhrly82 == 1 ~ hourly_wage,
# paid weekly
paidhrly82 == 0 & hours82 != 0 ~ weekly_wage / hours82
)) %>%
# censor data like ATL Fed does
mutate(wage_new = if_else(wage_new > 100, 100, wage_new)) %>%
# drop obs in Atl Fed data that don't have a wage
filter(!is.na(wage_old)) %>%
# drop small number (< 0.1%) of obs with no reconstructed wage
filter(!is.na(wage_new)) %>%
# verify new reconstructed wage is essentially same as Atl Fed wage
verify(abs(wage_new - wage_old) < 0.0001) %>%
# now that we know how ATL Fed creates wage, now let us create binned wages
# following Census plan:
# https://www.census.gov/content/dam/Census/programs-surveys/cps/updated-2022-cps-puf-changes.pdf
# create binned hourly wage for hourly workers
bin_hourly(new = hourly_wage_binned, old = hourly_wage) %>%
# create binned weekly earnings for non-hourly workers
bin_weekly(new = weekly_wage_binned, old = weekly_wage) %>%
# create final wage for all workers
mutate(wage_binned = case_when(
# ensure ATL Fed top-code / allocation restrictions
wageperhrtopcoded89 == 1 | wageperhrallocated89 == 1 ~ NA_real_,
# hourly-workers
paidhrly82 == 1 ~ hourly_wage_binned,
# non-hourly workers
paidhrly82 == 0 & hours82 != 0 ~ weekly_wage_binned / hours82
)) %>%
# censor data like ATL Fed does
mutate(wage_binned = if_else(wage_binned > 100, 100, wage_binned)) %>%
# create within person wage growth in logs
arrange(personid, date) %>%
group_by(personid) %>%
mutate(
wgt_old = (log(wage_old) - lag(log(wage_old))) * 100,
wgt_binned = (log(wage_binned) - lag(log(wage_binned))) * 100
) %>%
ungroup()
# create wgt variable in microdata
wgt_data_cleaned <- wgt_data_prepped %>%
# only look at 1997+
filter(date >= ymd("1997-01-01")) %>%
# enforce ATL wage tracker restrictions, for replication
filter(!is.na(wgt_original)) %>%
# about 0.2% of the sample doesn't have wgt_old but has wgt_original
# going to drop those obs: doesn't seem to have a sizable effect on replication
filter(!is.na(wgt_old)) %>%
# very small number of wgt_binned missing because in (only) one period their wages
# sufficiently low to be censored by binning procedure
filter(!is.na(wgt_binned))
# some more microdata checks
wgt_data_cleaned %>%
# check that we are using the appropriate wage variable construction
# by verifying wgt_old and wgt_original are close to equal
verify(abs(wgt_old - wgt_original) < 0.0001)
# aggregate and smooth data
wgt_data_aggregated <- wgt_data_cleaned %>%
group_by(date) %>%
summarize(across(wgt_original|wgt_binned, list("median" = median))) %>%
mutate(across(
-date,
~ slide_dbl(.x, mean, .before = 2, .complete = TRUE),
.names = "{.col}_3ma"
))
# aggregate data check: compare with published WGT
# confirm replication only off by at most 0.1 percentage points
read_csv("wgt_published.csv") %>%
inner_join(wgt_data_aggregated, by = "date") %>%
mutate(wgt_replication = round(wgt_original_median_3ma, 1)) %>%
verify(abs(wgt_replication - wgt_published_overall) < 0.11)
wgt_data_aggregated %>%
select(date, "Original data" = wgt_original_median_3ma, "Rounded data" = wgt_binned_median_3ma) %>%
pivot_longer(-date) %>%
mutate(value = value / 100) %>%
mutate(name = ordered(name, levels = c("Rounded data", "Original data"))) %>%
ggplot(aes(x = date, y = value, group = name, color = name)) +
geom_point(alpha = 0.9) +
geom_line() +
scale_y_continuous(
limits = c(0, 0.07),
labels = scales::label_percent(accuracy = 1.0)
) +
scale_colour_hue(l = 50) +
scale_x_date(
date_labels = "%Y",
breaks = seq(from = ymd("2000-01-01"), to = ymd("2020-01-01"), by = "4 years")
) +
theme_ipsum() +
theme(
legend.title = element_blank(),
axis.title.x = element_blank(),
axis.title.y = element_blank()
) +
labs(
title = "Atlanta Fed Wage Tracker data with and without rounded wages",
subtitle = "Median annual log wage change, three-month moving average"
)
date wgt_published_overall wgt_published_hourly
1997/03/01 4.5 4.2
1997/04/01 4.6 4.3
1997/05/01 4.5 4.1
1997/06/01 4.6 4.4
1997/07/01 4.8 4.5
1997/08/01 4.9 4.7
1997/09/01 4.8 4.5
1997/10/01 4.8 4.5
1997/11/01 4.8 4.7
1997/12/01 4.9 4.7
1998/01/01 4.9 4.7
1998/02/01 4.8 4.6
1998/03/01 4.6 4.7
1998/04/01 4.9 4.7
1998/05/01 5.2 4.9
1998/06/01 5.4 5
1998/07/01 5.4 5.2
1998/08/01 5.3 5.2
1998/09/01 5.1 4.9
1998/10/01 5 4.7
1998/11/01 4.9 4.6
1998/12/01 5 4.8
1999/01/01 5 4.7
1999/02/01 5 4.7
1999/03/01 5.2 4.8
1999/04/01 5.2 4.9
1999/05/01 5.2 4.9
1999/06/01 4.9 4.7
1999/07/01 5 4.7
1999/08/01 4.8 4.6
1999/09/01 5 4.6
1999/10/01 5.1 4.7
1999/11/01 5.2 4.7
1999/12/01 5 4.8
2000/01/01 4.9 4.6
2000/02/01 4.8 4.6
2000/03/01 4.8 4.4
2000/04/01 4.9 4.6
2000/05/01 4.9 4.6
2000/06/01 5 4.7
2000/07/01 5.1 4.9
2000/08/01 5.1 5
2000/09/01 5.2 5
2000/10/01 5.2 4.9
2000/11/01 5.4 5.1
2000/12/01 5.4 5.1
2001/01/01 5.4 5.1
2001/02/01 5.3 4.9
2001/03/01 5.3 4.9
2001/04/01 5.3 5.1
2001/05/01 5.1 4.9
2001/06/01 5.2 4.8
2001/07/01 5.1 4.6
2001/08/01 5.2 5
2001/09/01 5 5
2001/10/01 5 5
2001/11/01 5 4.8
2001/12/01 5 4.7
2002/01/01 4.9 4.7
2002/02/01 4.7 4.5
2002/03/01 4.6 4.3
2002/04/01 4.3 4.1
2002/05/01 4.4 4
2002/06/01 4.2 4.1
2002/07/01 4.1 3.8
2002/08/01 4 3.9
2002/09/01 3.9 3.8
2002/10/01 3.8 3.8
2002/11/01 3.5 3.6
2002/12/01 3.6 3.7
2003/01/01 3.7 3.9
2003/02/01 3.9 4
2003/03/01 3.7 3.7
2003/04/01 3.5 3.5
2003/05/01 3.5 3.5
2003/06/01 3.6 3.6
2003/07/01 3.7 3.5
2003/08/01 3.5 3.3
2003/09/01 3.5 3.2
2003/10/01 3.4 3.2
2003/11/01 3.4 3.3
2003/12/01 3.3 3.1
2004/01/01 3.3 3.1
2004/02/01 3.4 3.1
2004/03/01 3.4 3.2
2004/04/01 3.4 3.2
2004/05/01 3.4 3.2
2004/06/01 3.3 3.1
2004/07/01 3.4 3.2
2004/08/01 3.5 3.3
2004/09/01 3.7 3.4
2004/10/01 3.7 3.2
2004/11/01 3.7 3.3
2004/12/01 3.5 3.2
2005/01/01 3.6 3.3
2005/02/01 3.5 3.3
2005/03/01 3.6 3.4
2005/04/01 3.5 3.5
2005/05/01 3.6 3.5
2005/06/01 3.8 3.5
2005/07/01 4 3.7
2005/08/01 4 3.8
2005/09/01 4.1 3.9
2005/10/01 4 3.8
2005/11/01 4.2 3.8
2005/12/01 4.1 3.7
2006/01/01 4 3.7
2006/02/01 3.8 3.6
2006/03/01 3.8 3.7
2006/04/01 3.8 3.6
2006/05/01 3.9 3.7
2006/06/01 3.8 3.7
2006/07/01 3.7 3.7
2006/08/01 3.8 3.7
2006/09/01 3.8 3.6
2006/10/01 3.8 3.6
2006/11/01 3.9 3.8
2006/12/01 4 4.1
2007/01/01 4.2 4.2
2007/02/01 4.1 4.1
2007/03/01 4.3 4.1
2007/04/01 4.1 4
2007/05/01 4.1 3.9
2007/06/01 4 3.7
2007/07/01 4.2 3.9
2007/08/01 4.3 4
2007/09/01 4.4 4.1
2007/10/01 4.3 4.1
2007/11/01 4.3 4
2007/12/01 4.1 4
2008/01/01 3.9 3.8
2008/02/01 3.9 3.8
2008/03/01 4 3.7
2008/04/01 4.1 3.8
2008/05/01 4.1 3.6
2008/06/01 4.1 3.7
2008/07/01 4.1 3.8
2008/08/01 4 3.9
2008/09/01 4 3.8
2008/10/01 4 3.7
2008/11/01 4 3.7
2008/12/01 3.8 3.6
2009/01/01 3.7 3.5
2009/02/01 3.4 3.2
2009/03/01 3.3 3.2
2009/04/01 3.2 2.9
2009/05/01 3.3 3
2009/06/01 3.1 2.9
2009/07/01 3 2.9
2009/08/01 2.5 2.5
2009/09/01 2.3 2.4
2009/10/01 2.1 2.3
2009/11/01 2.1 2.3
2009/12/01 1.7 1.9
2010/01/01 1.6 1.8
2010/02/01 1.7 1.6
2010/03/01 1.9 1.8
2010/04/01 1.9 1.7
2010/05/01 1.6 1.6
2010/06/01 1.7 1.6
2010/07/01 1.8 1.5
2010/08/01 2 1.7
2010/09/01 1.9 1.6
2010/10/01 1.7 1.6
2010/11/01 1.7 1.7
2010/12/01 1.7 1.7
2011/01/01 1.9 1.7
2011/02/01 1.9 1.6
2011/03/01 1.9 1.6
2011/04/01 1.9 1.8
2011/05/01 2 1.8
2011/06/01 2 1.6
2011/07/01 2.1 1.7
2011/08/01 2.2 1.9
2011/09/01 2.1 2
2011/10/01 2.2 1.9
2011/11/01 2 1.7
2011/12/01 2 1.8
2012/01/01 1.9 1.8
2012/02/01 1.9 1.8
2012/03/01 2 1.8
2012/04/01 2.2 1.9
2012/05/01 2.1 1.9
2012/06/01 2.3 2
2012/07/01 2.1 1.9
2012/08/01 2.1 1.9
2012/09/01 2.1 1.9
2012/10/01 2.1 1.8
2012/11/01 2.3 1.9
2012/12/01 2.3 1.9
2013/01/01 2.3 2
2013/02/01 2.3 1.9
2013/03/01 2.2 1.9
2013/04/01 2.2 1.9
2013/05/01 2.2 2.1
2013/06/01 2.1 1.8
2013/07/01 2.3 1.9
2013/08/01 2.3 1.9
2013/09/01 2.4 2.2
2013/10/01 2.2 2.1
2013/11/01 2 2.1
2013/12/01 2.2 2.1
2014/01/01 2.3 2.1
2014/02/01 2.5 2.4
2014/03/01 2.4 2.3
2014/04/01 2.3 2.2
2014/05/01 2.3 2.1
2014/06/01 2.3 2.1
2014/07/01 2.4 2.3
2014/08/01 2.4 2.3
2014/09/01 2.6 2.6
2014/10/01 2.8 2.7
2014/11/01 2.9 2.9
2014/12/01 2.9 2.7
2015/01/01 3 2.8
2015/02/01 3 2.7
2015/03/01 3.2 2.8
2015/04/01 3.3 2.7
2015/05/01 3.3 2.7
2015/06/01 3.2 3
2015/07/01 3.1 3.2
2015/08/01 3.1 3.2
2015/09/01 3 2.9
2015/10/01 2.9 2.7
2015/11/01 3.1 2.9
2015/12/01 3.1 2.9
2016/01/01 3.1 2.9
2016/02/01 3.2 2.8
2016/03/01 3.2 3
2016/04/01 3.4 3
2016/05/01 3.5 3
2016/06/01 3.6 3
2016/07/01 3.4 2.9
2016/08/01 3.3 3
2016/09/01 3.6 3.3
2016/10/01 3.9 3.6
2016/11/01 3.9 3.5
2016/12/01 3.5 3
2017/01/01 3.2 2.8
2017/02/01 3.2 2.7
2017/03/01 3.4 3
2017/04/01 3.5 2.9
2017/05/01 3.4 3.2
2017/06/01 3.2 3
2017/07/01 3.3 3.1
2017/08/01 3.4 3.2
2017/09/01 3.6 3.4
2017/10/01 3.4 3.2
2017/11/01 3.2 2.9
2017/12/01 2.9 2.6
2018/01/01 3 2.5
2018/02/01 2.9 2.6
2018/03/01 3.3 3
2018/04/01 3.3 3.1
2018/05/01 3.2 3.1
2018/06/01 3.2 3.1
2018/07/01 3.3 3.2
2018/08/01 3.5 3.4
2018/09/01 3.5 3.5
2018/10/01 3.7 3.7
2018/11/01 3.9 3.5
2018/12/01 3.8 3.3
2019/01/01 3.8 3.2
2019/02/01 3.4 3.2
2019/03/01 3.5 3.3
2019/04/01 3.6 3.5
2019/05/01 3.7 3.6
2019/06/01 3.9 4
2019/07/01 3.9 3.6
2019/08/01 3.7 3.5
2019/09/01 3.6 3.3
2019/10/01 3.5 3.5
2019/11/01 3.7 3.7
2019/12/01 3.7 3.5
2020/01/01 3.8 3.5
2020/02/01 3.7 3.7
2020/03/01 3.5 3.5
2020/04/01 3.3 3.3
2020/05/01 3.5 3.1
2020/06/01 3.8 3.5
2020/07/01 3.9 3.4
2020/08/01 3.5 3.2
2020/09/01 3.5 3.2
2020/10/01 3.5 3.4
2020/11/01 3.7 3.7
2020/12/01 3.4 3.6
2021/01/01 3.4 3.5
2021/02/01 3.4 3.4
2021/03/01 3.4 3.3
2021/04/01 3.2 3.2
2021/05/01 3 3.1
2021/06/01 3.2 3.5
2021/07/01 3.7 3.9
2021/08/01 3.9 4.1
2021/09/01 4.2 4.3
2021/10/01 4.1 4.2
2021/11/01 4.3 4.4
2021/12/01 4.5 4.6
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment