plnnr/spatial_aggregation.R

## spatial_aggregation.R
pacman::p_load(tidyverse, tigris, tidycensus, sf, sp, mapview, spdep, spdplyr, leaflet, RColorBrewer)
options(tigris_use_cache = T); options(tigris_class = "sf")

pdx_msa_definition <- c('41005', '41009', '41051', '41067', '41071', '53011', '53059')
pdxcnty <- substr(pdx_msa_definition, start = 3, stop = 5)

lep_languages <- c('lep_spanish' = 'C16001_005', 'lep_french_hatian' = 'C16001_008',
                   'lep_germanic' = 'C16001_011', 'lep_slavic' = 'C16001_014',
                   'lep_oth_indoeuro' = 'C16001_017', 'lep_korean' = 'C16001_020',
                   'lep_chinese' = 'C16001_023', 'lep_vietnamese' = 'C16001_026',
                   'lep_tagalog' = 'C16001_029', 'lep_oth_asian' = 'C16001_032',
                   'lep_arabic' = 'C16001_035', 'lep_oth_lang' = 'C16001_038')

total_languages <- c('ttl_speakers_base' = 'C16001_001',
                     'ttl_spanish' = 'C16001_003', 'ttl_french_hatian' = 'C16001_006',
                     'ttl_germanic' = 'C16001_009', 'ttl_slavic' = 'C16001_012',
                     'ttl_oth_indoeuro' = 'C16001_015', 'ttl_korean' = 'C16001_018',
                     'ttl_chinese' = 'C16001_021', 'ttl_vietnamese' = 'C16001_024',
                     'ttl_tagalog' = 'C16001_027', 'ttl_oth_asian' = 'C16001_030',
                     'ttl_arabic' = 'C16001_033', 'ttl_oth_lang' = 'C16001_036')

## Interpret CV
cv_interp <- function(cv) {
  reliability <- case_when(cv > 0.4 ~ "3. Not reliable", cv >= 0.2 & cv < 0.4 ~ "2. Use caution",
                           cv < 0.2 ~ "1. Reliable")
  return(reliability)
}

## Calculate spatially aggregated estimate and margin of error
## WARNING: Only works for aggregating by sum, not proportions
## TODO: 1) Allow vector of estimates; 2) Add method for proportions; 3) Add neighbor methods
calc_agg <- function(df.sf, est_var, moe_var, k = 5) {
  est_var_enq <- enquo(est_var)
  moe_var_enq <- enquo(moe_var)

  ## Grab nearest neighbor matrix
  knn <- knearneigh(coordinates(as(df.sf, "Spatial")), k = k, longlat = TRUE)
  nn <- knn$nn

  ## Square the margin of error and add placeholder cols
  df.sf <- df.sf %>%
    mutate(MM = !!moe_var_enq * !!moe_var_enq,
           agg_est = 0,
           agg_moe = 0)

  ## Iterate over the rows and column of the matrix to sum them together
  for(row in 1:nrow(nn)) {
    rowsum <- 0
    MMsum <- 0
    for(col in 1:ncol(nn)) {
      contrib <- df.sf[nn[row,col],] %>% pull(!!est_var_enq)
      MMcontrib <- df.sf[nn[row,col],]$MM
      rowsum <- rowsum + contrib
      MMsum <- MMsum + MMcontrib
    }
    df.sf[row,]$agg_est <- rowsum
    df.sf[row,]$agg_moe <- MMsum
  }

  df.sf <- df.sf %>%
    mutate(agg_est = agg_est + !!est_var_enq, ## Add the tract back to itself
           agg_moe = sqrt(agg_moe + MM), ## Add the MoE_sq back to itself
           cv = agg_moe / 1.645 / agg_est,
           reliability = case_when(cv > 0.4 ~ "3. Not reliable", cv >= 0.2 & cv < 0.4 ~ "2. Use caution",
                                   cv < 0.2 ~ "1. Reliable")) %>%
    select(GEOID, !!est_var_enq, !!moe_var_enq, agg_est, agg_moe, reliability)

  return(df.sf)

}

pdxtracts <- rbind(tracts("OR", cb = T), tracts("WA", cb = T)) %>%
  filter(substr(GEOID, 1, 5) %in% pdx_msa_definition) %>% select(GEOID)


languages <- get_acs(geography = "tract",
                     variables = c(total_languages, lep_languages),
                     state = c("OR", "WA"),
                     county = pdxcnty,
                     survey = "acs5",
                     year = 2018,
                     cache_table = TRUE,
                     geometry = TRUE) %>%
  filter(substr(GEOID, 1, 5) %in% pdx_msa_definition) %>%
  mutate(area = as.numeric(st_area(st_transform(.,2913))/27880000)) %>%
  group_by(GEOID) %>%
  mutate(cv = moe / 1.645 / estimate,
         reliability = cv_interp(cv),
         share = estimate / estimate[variable == "ttl_speakers_base"],
         share_moe = moe_prop(num = estimate, denom = estimate[variable == "ttl_speakers_base"],
                              moe_num = moe, moe_denom = moe[variable == "ttl_speakers_base"]),
         share_cv = share_moe / 1.645 / share,
         share_reliability = cv_interp(share_cv),
         density = estimate / area) %>%
  select(GEOID:moe, cv:share_reliability, density, area, geometry, -c(cv, share_cv)) # reorder columns

## Turn all the variables into wide format
languages_wide <- languages %>%
  rename(E = estimate, M = moe, CV = reliability, P = share, P_M = share_moe,
         P_CV = share_reliability, dens = density) %>%
  select(-c(area, NAME)) %>%
  st_set_geometry(NULL) %>%
  pivot_wider(id_cols = GEOID, names_from = variable, values_from = c(E:dens)) %>%
  left_join(., pdxtracts, by = "GEOID") %>%
  st_as_sf(.)

## Spatially aggregate LEP Spanish speakers
aggregated <- calc_agg(languages_wide,
                       est_var = E_lep_spanish,
                       moe_var = M_lep_spanish,
                       k = 3)

## View and compare results
mapview(aggregated, zcol = 'reliability') +
mapview(languages_wide, zcol = 'CV_lep_spanish')
	pacman::p_load(tidyverse, tigris, tidycensus, sf, sp, mapview, spdep, spdplyr, leaflet, RColorBrewer)
	options(tigris_use_cache = T); options(tigris_class = "sf")

	pdx_msa_definition <- c('41005', '41009', '41051', '41067', '41071', '53011', '53059')
	pdxcnty <- substr(pdx_msa_definition, start = 3, stop = 5)

	lep_languages <- c('lep_spanish' = 'C16001_005', 'lep_french_hatian' = 'C16001_008',
	'lep_germanic' = 'C16001_011', 'lep_slavic' = 'C16001_014',
	'lep_oth_indoeuro' = 'C16001_017', 'lep_korean' = 'C16001_020',
	'lep_chinese' = 'C16001_023', 'lep_vietnamese' = 'C16001_026',
	'lep_tagalog' = 'C16001_029', 'lep_oth_asian' = 'C16001_032',
	'lep_arabic' = 'C16001_035', 'lep_oth_lang' = 'C16001_038')

	total_languages <- c('ttl_speakers_base' = 'C16001_001',
	'ttl_spanish' = 'C16001_003', 'ttl_french_hatian' = 'C16001_006',
	'ttl_germanic' = 'C16001_009', 'ttl_slavic' = 'C16001_012',
	'ttl_oth_indoeuro' = 'C16001_015', 'ttl_korean' = 'C16001_018',
	'ttl_chinese' = 'C16001_021', 'ttl_vietnamese' = 'C16001_024',
	'ttl_tagalog' = 'C16001_027', 'ttl_oth_asian' = 'C16001_030',
	'ttl_arabic' = 'C16001_033', 'ttl_oth_lang' = 'C16001_036')

	## Interpret CV
	cv_interp <- function(cv) {
	reliability <- case_when(cv > 0.4 ~ "3. Not reliable", cv >= 0.2 & cv < 0.4 ~ "2. Use caution",
	cv < 0.2 ~ "1. Reliable")
	return(reliability)
	}

	## Calculate spatially aggregated estimate and margin of error
	## WARNING: Only works for aggregating by sum, not proportions
	## TODO: 1) Allow vector of estimates; 2) Add method for proportions; 3) Add neighbor methods
	calc_agg <- function(df.sf, est_var, moe_var, k = 5) {
	est_var_enq <- enquo(est_var)
	moe_var_enq <- enquo(moe_var)

	## Grab nearest neighbor matrix
	knn <- knearneigh(coordinates(as(df.sf, "Spatial")), k = k, longlat = TRUE)
	nn <- knn$nn

	## Square the margin of error and add placeholder cols
	df.sf <- df.sf %>%
	mutate(MM = !!moe_var_enq * !!moe_var_enq,
	agg_est = 0,
	agg_moe = 0)

	## Iterate over the rows and column of the matrix to sum them together
	for(row in 1:nrow(nn)) {
	rowsum <- 0
	MMsum <- 0
	for(col in 1:ncol(nn)) {
	contrib <- df.sf[nn[row,col],] %>% pull(!!est_var_enq)
	MMcontrib <- df.sf[nn[row,col],]$MM
	rowsum <- rowsum + contrib
	MMsum <- MMsum + MMcontrib
	}
	df.sf[row,]$agg_est <- rowsum
	df.sf[row,]$agg_moe <- MMsum
	}

	df.sf <- df.sf %>%
	mutate(agg_est = agg_est + !!est_var_enq, ## Add the tract back to itself
	agg_moe = sqrt(agg_moe + MM), ## Add the MoE_sq back to itself
	cv = agg_moe / 1.645 / agg_est,
	reliability = case_when(cv > 0.4 ~ "3. Not reliable", cv >= 0.2 & cv < 0.4 ~ "2. Use caution",
	cv < 0.2 ~ "1. Reliable")) %>%
	select(GEOID, !!est_var_enq, !!moe_var_enq, agg_est, agg_moe, reliability)

	return(df.sf)

	}

	pdxtracts <- rbind(tracts("OR", cb = T), tracts("WA", cb = T)) %>%
	filter(substr(GEOID, 1, 5) %in% pdx_msa_definition) %>% select(GEOID)


	languages <- get_acs(geography = "tract",
	variables = c(total_languages, lep_languages),
	state = c("OR", "WA"),
	county = pdxcnty,
	survey = "acs5",
	year = 2018,
	cache_table = TRUE,
	geometry = TRUE) %>%
	filter(substr(GEOID, 1, 5) %in% pdx_msa_definition) %>%
	mutate(area = as.numeric(st_area(st_transform(.,2913))/27880000)) %>%
	group_by(GEOID) %>%
	mutate(cv = moe / 1.645 / estimate,
	reliability = cv_interp(cv),
	share = estimate / estimate[variable == "ttl_speakers_base"],
	share_moe = moe_prop(num = estimate, denom = estimate[variable == "ttl_speakers_base"],
	moe_num = moe, moe_denom = moe[variable == "ttl_speakers_base"]),
	share_cv = share_moe / 1.645 / share,
	share_reliability = cv_interp(share_cv),
	density = estimate / area) %>%
	select(GEOID:moe, cv:share_reliability, density, area, geometry, -c(cv, share_cv)) # reorder columns

	## Turn all the variables into wide format
	languages_wide <- languages %>%
	rename(E = estimate, M = moe, CV = reliability, P = share, P_M = share_moe,
	P_CV = share_reliability, dens = density) %>%
	select(-c(area, NAME)) %>%
	st_set_geometry(NULL) %>%
	pivot_wider(id_cols = GEOID, names_from = variable, values_from = c(E:dens)) %>%
	left_join(., pdxtracts, by = "GEOID") %>%
	st_as_sf(.)

	## Spatially aggregate LEP Spanish speakers
	aggregated <- calc_agg(languages_wide,
	est_var = E_lep_spanish,
	moe_var = M_lep_spanish,
	k = 3)

	## View and compare results
	mapview(aggregated, zcol = 'reliability') +
	mapview(languages_wide, zcol = 'CV_lep_spanish')