plnnr/long_to_wide_acs.R

## long_to_wide_acs.R
## Helpful web pages:
# https://dcl-wrangle.stanford.edu/census.html
# https://tarakc02.github.io/dot-density/

## Run "install.packages("pacman")" in R console if you do not have pacman installed
pacman::p_load(tidyverse, tidycensus, tigris, sf, mapview)

## Optional if you want to write to a file geodatabase, uncomment and run code
# library(arcgisbinding)
# arc.check_product()

## Set 7-county MSA definition of state-county FIPS code
pdx_msa_definition <- c('41005', '41009', '41051', '41067', '41071', '53011', '53059')

## Variable codebook for ACS 5-year estimates; more of an FYI
acs19 <- load_variables(2019, "acs5", cache = TRUE)

## Download data for entire table as "long format" for all tracts in Oregon and Washington
edu_query <- get_acs(geography = "tract",
                     table = "B15003",
                     year = 2019,
                     state = c("OR", "WA"))

## Summarize the education data, adding new variables as needed
edu <- edu_query %>%
  ## Create a new field that uses a regular expression to extract
  ## the last three digits from the variable name field. E.G., "B15003_002" becomes "002" becomes 2
  mutate(id = str_extract(variable, "[0-9]{3}$") %>% as.integer) %>%
  ## Create a new field "education" that recodes the variable IDs based on educational levels. You can
  ## use the `acs19` codebook to figure out how you want to re-code the data.
  mutate(education = case_when(
    id == 1 ~ "pop25", ## Summary variable for calculating proportions
    id %>% between(2, 18) ~ "hs_less",
    id %>% between(19, 21) ~ "some_colg",
    id >= 22 ~ "ba_higher",
  )) %>%
  ## Group by tract ID and the newly created education field to summarize the data
  group_by(GEOID, education) %>%
  summarise(E = sum(estimate),
            M = moe_sum(moe, estimate = estimate)) %>%
  # Ungroup since we now have summarized data
  ungroup() %>%
  ## Arrange by tract ID and education
  arrange(GEOID, education)

## Create a "wide format" table
edu_final <- edu %>%
  ## Filter for only Portland MSA tracts using first 5 characters of GEOID
  filter(substr(GEOID, 1, 5) %in% pdx_msa_definition) %>%
  ## Pivot from long to wide format, so that one tract record is kept and multiple education fields are
  ## created, inluding the summary variable we created (`pop25`)
  pivot_wider(id_cols = GEOID,
              names_from = education,
              values_from = c(E, M))

## Optionally write table to file geodatabase, uncomment and run code
# arc.write("data.gdb/educational_attainment_2019", edu_final)

## Since we retained the summary variable, we can calculate the proportion if we wish, for example:
edu_final_proportion <- edu_final %>%
  mutate(EP_ba_higher = E_ba_higher / E_pop25, ## Share of population with BA or higher
         MP_ba_higher = moe_prop(num = E_ba_higher, ## use `moe_prop()` to calculate MoE
                                 denom = E_pop25,
                                 moe_num = M_ba_higher,
                                 moe_denom = M_pop25)
  )
	## Helpful web pages:
	# https://dcl-wrangle.stanford.edu/census.html
	# https://tarakc02.github.io/dot-density/

	## Run "install.packages("pacman")" in R console if you do not have pacman installed
	pacman::p_load(tidyverse, tidycensus, tigris, sf, mapview)

	## Optional if you want to write to a file geodatabase, uncomment and run code
	# library(arcgisbinding)
	# arc.check_product()

	## Set 7-county MSA definition of state-county FIPS code
	pdx_msa_definition <- c('41005', '41009', '41051', '41067', '41071', '53011', '53059')

	## Variable codebook for ACS 5-year estimates; more of an FYI
	acs19 <- load_variables(2019, "acs5", cache = TRUE)

	## Download data for entire table as "long format" for all tracts in Oregon and Washington
	edu_query <- get_acs(geography = "tract",
	table = "B15003",
	year = 2019,
	state = c("OR", "WA"))

	## Summarize the education data, adding new variables as needed
	edu <- edu_query %>%
	## Create a new field that uses a regular expression to extract
	## the last three digits from the variable name field. E.G., "B15003_002" becomes "002" becomes 2
	mutate(id = str_extract(variable, "[0-9]{3}$") %>% as.integer) %>%
	## Create a new field "education" that recodes the variable IDs based on educational levels. You can
	## use the `acs19` codebook to figure out how you want to re-code the data.
	mutate(education = case_when(
	id == 1 ~ "pop25", ## Summary variable for calculating proportions
	id %>% between(2, 18) ~ "hs_less",
	id %>% between(19, 21) ~ "some_colg",
	id >= 22 ~ "ba_higher",
	)) %>%
	## Group by tract ID and the newly created education field to summarize the data
	group_by(GEOID, education) %>%
	summarise(E = sum(estimate),
	M = moe_sum(moe, estimate = estimate)) %>%
	# Ungroup since we now have summarized data
	ungroup() %>%
	## Arrange by tract ID and education
	arrange(GEOID, education)

	## Create a "wide format" table
	edu_final <- edu %>%
	## Filter for only Portland MSA tracts using first 5 characters of GEOID
	filter(substr(GEOID, 1, 5) %in% pdx_msa_definition) %>%
	## Pivot from long to wide format, so that one tract record is kept and multiple education fields are
	## created, inluding the summary variable we created (`pop25`)
	pivot_wider(id_cols = GEOID,
	names_from = education,
	values_from = c(E, M))

	## Optionally write table to file geodatabase, uncomment and run code
	# arc.write("data.gdb/educational_attainment_2019", edu_final)

	## Since we retained the summary variable, we can calculate the proportion if we wish, for example:
	edu_final_proportion <- edu_final %>%
	mutate(EP_ba_higher = E_ba_higher / E_pop25, ## Share of population with BA or higher
	MP_ba_higher = moe_prop(num = E_ba_higher, ## use `moe_prop()` to calculate MoE
	denom = E_pop25,
	moe_num = M_ba_higher,
	moe_denom = M_pop25)
	)