Last active
March 23, 2020 17:01
-
-
Save plnnr/7a7c2502e82f97497f25de9d59fd3046 to your computer and use it in GitHub Desktop.
Calculate spatially aggregated estimate and margin of error using k nearest neighbor matrix
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
pacman::p_load(tidyverse, tigris, tidycensus, sf, sp, mapview, spdep, spdplyr, leaflet, RColorBrewer) | |
options(tigris_use_cache = T); options(tigris_class = "sf") | |
pdx_msa_definition <- c('41005', '41009', '41051', '41067', '41071', '53011', '53059') | |
pdxcnty <- substr(pdx_msa_definition, start = 3, stop = 5) | |
lep_languages <- c('lep_spanish' = 'C16001_005', 'lep_french_hatian' = 'C16001_008', | |
'lep_germanic' = 'C16001_011', 'lep_slavic' = 'C16001_014', | |
'lep_oth_indoeuro' = 'C16001_017', 'lep_korean' = 'C16001_020', | |
'lep_chinese' = 'C16001_023', 'lep_vietnamese' = 'C16001_026', | |
'lep_tagalog' = 'C16001_029', 'lep_oth_asian' = 'C16001_032', | |
'lep_arabic' = 'C16001_035', 'lep_oth_lang' = 'C16001_038') | |
total_languages <- c('ttl_speakers_base' = 'C16001_001', | |
'ttl_spanish' = 'C16001_003', 'ttl_french_hatian' = 'C16001_006', | |
'ttl_germanic' = 'C16001_009', 'ttl_slavic' = 'C16001_012', | |
'ttl_oth_indoeuro' = 'C16001_015', 'ttl_korean' = 'C16001_018', | |
'ttl_chinese' = 'C16001_021', 'ttl_vietnamese' = 'C16001_024', | |
'ttl_tagalog' = 'C16001_027', 'ttl_oth_asian' = 'C16001_030', | |
'ttl_arabic' = 'C16001_033', 'ttl_oth_lang' = 'C16001_036') | |
## Interpret CV | |
cv_interp <- function(cv) { | |
reliability <- case_when(cv > 0.4 ~ "3. Not reliable", cv >= 0.2 & cv < 0.4 ~ "2. Use caution", | |
cv < 0.2 ~ "1. Reliable") | |
return(reliability) | |
} | |
## Calculate spatially aggregated estimate and margin of error | |
## WARNING: Only works for aggregating by sum, not proportions | |
## TODO: 1) Allow vector of estimates; 2) Add method for proportions; 3) Add neighbor methods | |
calc_agg <- function(df.sf, est_var, moe_var, k = 5) { | |
est_var_enq <- enquo(est_var) | |
moe_var_enq <- enquo(moe_var) | |
## Grab nearest neighbor matrix | |
knn <- knearneigh(coordinates(as(df.sf, "Spatial")), k = k, longlat = TRUE) | |
nn <- knn$nn | |
## Square the margin of error and add placeholder cols | |
df.sf <- df.sf %>% | |
mutate(MM = !!moe_var_enq * !!moe_var_enq, | |
agg_est = 0, | |
agg_moe = 0) | |
## Iterate over the rows and column of the matrix to sum them together | |
for(row in 1:nrow(nn)) { | |
rowsum <- 0 | |
MMsum <- 0 | |
for(col in 1:ncol(nn)) { | |
contrib <- df.sf[nn[row,col],] %>% pull(!!est_var_enq) | |
MMcontrib <- df.sf[nn[row,col],]$MM | |
rowsum <- rowsum + contrib | |
MMsum <- MMsum + MMcontrib | |
} | |
df.sf[row,]$agg_est <- rowsum | |
df.sf[row,]$agg_moe <- MMsum | |
} | |
df.sf <- df.sf %>% | |
mutate(agg_est = agg_est + !!est_var_enq, ## Add the tract back to itself | |
agg_moe = sqrt(agg_moe + MM), ## Add the MoE_sq back to itself | |
cv = agg_moe / 1.645 / agg_est, | |
reliability = case_when(cv > 0.4 ~ "3. Not reliable", cv >= 0.2 & cv < 0.4 ~ "2. Use caution", | |
cv < 0.2 ~ "1. Reliable")) %>% | |
select(GEOID, !!est_var_enq, !!moe_var_enq, agg_est, agg_moe, reliability) | |
return(df.sf) | |
} | |
pdxtracts <- rbind(tracts("OR", cb = T), tracts("WA", cb = T)) %>% | |
filter(substr(GEOID, 1, 5) %in% pdx_msa_definition) %>% select(GEOID) | |
languages <- get_acs(geography = "tract", | |
variables = c(total_languages, lep_languages), | |
state = c("OR", "WA"), | |
county = pdxcnty, | |
survey = "acs5", | |
year = 2018, | |
cache_table = TRUE, | |
geometry = TRUE) %>% | |
filter(substr(GEOID, 1, 5) %in% pdx_msa_definition) %>% | |
mutate(area = as.numeric(st_area(st_transform(.,2913))/27880000)) %>% | |
group_by(GEOID) %>% | |
mutate(cv = moe / 1.645 / estimate, | |
reliability = cv_interp(cv), | |
share = estimate / estimate[variable == "ttl_speakers_base"], | |
share_moe = moe_prop(num = estimate, denom = estimate[variable == "ttl_speakers_base"], | |
moe_num = moe, moe_denom = moe[variable == "ttl_speakers_base"]), | |
share_cv = share_moe / 1.645 / share, | |
share_reliability = cv_interp(share_cv), | |
density = estimate / area) %>% | |
select(GEOID:moe, cv:share_reliability, density, area, geometry, -c(cv, share_cv)) # reorder columns | |
## Turn all the variables into wide format | |
languages_wide <- languages %>% | |
rename(E = estimate, M = moe, CV = reliability, P = share, P_M = share_moe, | |
P_CV = share_reliability, dens = density) %>% | |
select(-c(area, NAME)) %>% | |
st_set_geometry(NULL) %>% | |
pivot_wider(id_cols = GEOID, names_from = variable, values_from = c(E:dens)) %>% | |
left_join(., pdxtracts, by = "GEOID") %>% | |
st_as_sf(.) | |
## Spatially aggregate LEP Spanish speakers | |
aggregated <- calc_agg(languages_wide, | |
est_var = E_lep_spanish, | |
moe_var = M_lep_spanish, | |
k = 3) | |
## View and compare results | |
mapview(aggregated, zcol = 'reliability') + | |
mapview(languages_wide, zcol = 'CV_lep_spanish') |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment