Skip to content

Instantly share code, notes, and snippets.

@plnnr
Last active March 23, 2020 17:01
Show Gist options
  • Save plnnr/7a7c2502e82f97497f25de9d59fd3046 to your computer and use it in GitHub Desktop.
Save plnnr/7a7c2502e82f97497f25de9d59fd3046 to your computer and use it in GitHub Desktop.
Calculate spatially aggregated estimate and margin of error using k nearest neighbor matrix
pacman::p_load(tidyverse, tigris, tidycensus, sf, sp, mapview, spdep, spdplyr, leaflet, RColorBrewer)
options(tigris_use_cache = T); options(tigris_class = "sf")
pdx_msa_definition <- c('41005', '41009', '41051', '41067', '41071', '53011', '53059')
pdxcnty <- substr(pdx_msa_definition, start = 3, stop = 5)
lep_languages <- c('lep_spanish' = 'C16001_005', 'lep_french_hatian' = 'C16001_008',
'lep_germanic' = 'C16001_011', 'lep_slavic' = 'C16001_014',
'lep_oth_indoeuro' = 'C16001_017', 'lep_korean' = 'C16001_020',
'lep_chinese' = 'C16001_023', 'lep_vietnamese' = 'C16001_026',
'lep_tagalog' = 'C16001_029', 'lep_oth_asian' = 'C16001_032',
'lep_arabic' = 'C16001_035', 'lep_oth_lang' = 'C16001_038')
total_languages <- c('ttl_speakers_base' = 'C16001_001',
'ttl_spanish' = 'C16001_003', 'ttl_french_hatian' = 'C16001_006',
'ttl_germanic' = 'C16001_009', 'ttl_slavic' = 'C16001_012',
'ttl_oth_indoeuro' = 'C16001_015', 'ttl_korean' = 'C16001_018',
'ttl_chinese' = 'C16001_021', 'ttl_vietnamese' = 'C16001_024',
'ttl_tagalog' = 'C16001_027', 'ttl_oth_asian' = 'C16001_030',
'ttl_arabic' = 'C16001_033', 'ttl_oth_lang' = 'C16001_036')
## Interpret CV
cv_interp <- function(cv) {
reliability <- case_when(cv > 0.4 ~ "3. Not reliable", cv >= 0.2 & cv < 0.4 ~ "2. Use caution",
cv < 0.2 ~ "1. Reliable")
return(reliability)
}
## Calculate spatially aggregated estimate and margin of error
## WARNING: Only works for aggregating by sum, not proportions
## TODO: 1) Allow vector of estimates; 2) Add method for proportions; 3) Add neighbor methods
calc_agg <- function(df.sf, est_var, moe_var, k = 5) {
est_var_enq <- enquo(est_var)
moe_var_enq <- enquo(moe_var)
## Grab nearest neighbor matrix
knn <- knearneigh(coordinates(as(df.sf, "Spatial")), k = k, longlat = TRUE)
nn <- knn$nn
## Square the margin of error and add placeholder cols
df.sf <- df.sf %>%
mutate(MM = !!moe_var_enq * !!moe_var_enq,
agg_est = 0,
agg_moe = 0)
## Iterate over the rows and column of the matrix to sum them together
for(row in 1:nrow(nn)) {
rowsum <- 0
MMsum <- 0
for(col in 1:ncol(nn)) {
contrib <- df.sf[nn[row,col],] %>% pull(!!est_var_enq)
MMcontrib <- df.sf[nn[row,col],]$MM
rowsum <- rowsum + contrib
MMsum <- MMsum + MMcontrib
}
df.sf[row,]$agg_est <- rowsum
df.sf[row,]$agg_moe <- MMsum
}
df.sf <- df.sf %>%
mutate(agg_est = agg_est + !!est_var_enq, ## Add the tract back to itself
agg_moe = sqrt(agg_moe + MM), ## Add the MoE_sq back to itself
cv = agg_moe / 1.645 / agg_est,
reliability = case_when(cv > 0.4 ~ "3. Not reliable", cv >= 0.2 & cv < 0.4 ~ "2. Use caution",
cv < 0.2 ~ "1. Reliable")) %>%
select(GEOID, !!est_var_enq, !!moe_var_enq, agg_est, agg_moe, reliability)
return(df.sf)
}
pdxtracts <- rbind(tracts("OR", cb = T), tracts("WA", cb = T)) %>%
filter(substr(GEOID, 1, 5) %in% pdx_msa_definition) %>% select(GEOID)
languages <- get_acs(geography = "tract",
variables = c(total_languages, lep_languages),
state = c("OR", "WA"),
county = pdxcnty,
survey = "acs5",
year = 2018,
cache_table = TRUE,
geometry = TRUE) %>%
filter(substr(GEOID, 1, 5) %in% pdx_msa_definition) %>%
mutate(area = as.numeric(st_area(st_transform(.,2913))/27880000)) %>%
group_by(GEOID) %>%
mutate(cv = moe / 1.645 / estimate,
reliability = cv_interp(cv),
share = estimate / estimate[variable == "ttl_speakers_base"],
share_moe = moe_prop(num = estimate, denom = estimate[variable == "ttl_speakers_base"],
moe_num = moe, moe_denom = moe[variable == "ttl_speakers_base"]),
share_cv = share_moe / 1.645 / share,
share_reliability = cv_interp(share_cv),
density = estimate / area) %>%
select(GEOID:moe, cv:share_reliability, density, area, geometry, -c(cv, share_cv)) # reorder columns
## Turn all the variables into wide format
languages_wide <- languages %>%
rename(E = estimate, M = moe, CV = reliability, P = share, P_M = share_moe,
P_CV = share_reliability, dens = density) %>%
select(-c(area, NAME)) %>%
st_set_geometry(NULL) %>%
pivot_wider(id_cols = GEOID, names_from = variable, values_from = c(E:dens)) %>%
left_join(., pdxtracts, by = "GEOID") %>%
st_as_sf(.)
## Spatially aggregate LEP Spanish speakers
aggregated <- calc_agg(languages_wide,
est_var = E_lep_spanish,
moe_var = M_lep_spanish,
k = 3)
## View and compare results
mapview(aggregated, zcol = 'reliability') +
mapview(languages_wide, zcol = 'CV_lep_spanish')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment