Last active
December 21, 2020 22:29
-
-
Save plnnr/13bae162a08e30f6da37b984c7fa6e43 to your computer and use it in GitHub Desktop.
Summarize long ACS data and make into wide format, retaining the margins of error
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## Helpful web pages: | |
# https://dcl-wrangle.stanford.edu/census.html | |
# https://tarakc02.github.io/dot-density/ | |
## Run "install.packages("pacman")" in R console if you do not have pacman installed | |
pacman::p_load(tidyverse, tidycensus, tigris, sf, mapview) | |
## Optional if you want to write to a file geodatabase, uncomment and run code | |
# library(arcgisbinding) | |
# arc.check_product() | |
## Set 7-county MSA definition of state-county FIPS code | |
pdx_msa_definition <- c('41005', '41009', '41051', '41067', '41071', '53011', '53059') | |
## Variable codebook for ACS 5-year estimates; more of an FYI | |
acs19 <- load_variables(2019, "acs5", cache = TRUE) | |
## Download data for entire table as "long format" for all tracts in Oregon and Washington | |
edu_query <- get_acs(geography = "tract", | |
table = "B15003", | |
year = 2019, | |
state = c("OR", "WA")) | |
## Summarize the education data, adding new variables as needed | |
edu <- edu_query %>% | |
## Create a new field that uses a regular expression to extract | |
## the last three digits from the variable name field. E.G., "B15003_002" becomes "002" becomes 2 | |
mutate(id = str_extract(variable, "[0-9]{3}$") %>% as.integer) %>% | |
## Create a new field "education" that recodes the variable IDs based on educational levels. You can | |
## use the `acs19` codebook to figure out how you want to re-code the data. | |
mutate(education = case_when( | |
id == 1 ~ "pop25", ## Summary variable for calculating proportions | |
id %>% between(2, 18) ~ "hs_less", | |
id %>% between(19, 21) ~ "some_colg", | |
id >= 22 ~ "ba_higher", | |
)) %>% | |
## Group by tract ID and the newly created education field to summarize the data | |
group_by(GEOID, education) %>% | |
summarise(E = sum(estimate), | |
M = moe_sum(moe, estimate = estimate)) %>% | |
# Ungroup since we now have summarized data | |
ungroup() %>% | |
## Arrange by tract ID and education | |
arrange(GEOID, education) | |
## Create a "wide format" table | |
edu_final <- edu %>% | |
## Filter for only Portland MSA tracts using first 5 characters of GEOID | |
filter(substr(GEOID, 1, 5) %in% pdx_msa_definition) %>% | |
## Pivot from long to wide format, so that one tract record is kept and multiple education fields are | |
## created, inluding the summary variable we created (`pop25`) | |
pivot_wider(id_cols = GEOID, | |
names_from = education, | |
values_from = c(E, M)) | |
## Optionally write table to file geodatabase, uncomment and run code | |
# arc.write("data.gdb/educational_attainment_2019", edu_final) | |
## Since we retained the summary variable, we can calculate the proportion if we wish, for example: | |
edu_final_proportion <- edu_final %>% | |
mutate(EP_ba_higher = E_ba_higher / E_pop25, ## Share of population with BA or higher | |
MP_ba_higher = moe_prop(num = E_ba_higher, ## use `moe_prop()` to calculate MoE | |
denom = E_pop25, | |
moe_num = M_ba_higher, | |
moe_denom = M_pop25) | |
) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment