Skip to content

Instantly share code, notes, and snippets.

@plnnr
Last active August 20, 2018 19:36
Show Gist options
  • Save plnnr/d6809b5d2c9ee42401da26fe1ecfb2ec to your computer and use it in GitHub Desktop.
Save plnnr/d6809b5d2c9ee42401da26fe1ecfb2ec to your computer and use it in GitHub Desktop.
Get household income by race for ACS data (at least 2010 onward) by passing a list of tract FIPS codes to the function.
## Get median HH income by race for 2010 and 2016
# Returns an error that function parameter "acs_year" is not found.
get_median_hh_income_by_race_acs <- function(vector_of_tract_fips, compatibility_mode = "1990", acs_year = 2016, acs_state=c("OR", "WA"), acs_county = c("051", "005", "009", "067", "071", "011", "059")) {
if(!require(pacman)){install.packages(pacman);library(pacman)}
p_load(tidycensus, tidyverse)
race_income_vars_1016 <- c('B19001A_001', 'B19001A_002', 'B19001A_003', 'B19001A_004', 'B19001A_005', 'B19001A_006', 'B19001A_007',
'B19001A_008', 'B19001A_009', 'B19001A_010', 'B19001A_011', 'B19001A_012', 'B19001A_013', 'B19001A_014',
'B19001A_015', 'B19001A_016', 'B19001A_017', 'B19001B_001', 'B19001B_002', 'B19001B_003', 'B19001B_004',
'B19001B_005', 'B19001B_006', 'B19001B_007', 'B19001B_008', 'B19001B_009', 'B19001B_010', 'B19001B_011',
'B19001B_012', 'B19001B_013', 'B19001B_014', 'B19001B_015', 'B19001B_016', 'B19001B_017', 'B19001C_001',
'B19001C_002', 'B19001C_003', 'B19001C_004', 'B19001C_005', 'B19001C_006', 'B19001C_007', 'B19001C_008',
'B19001C_009', 'B19001C_010', 'B19001C_011', 'B19001C_012', 'B19001C_013', 'B19001C_014', 'B19001C_015',
'B19001C_016', 'B19001C_017', 'B19001D_001', 'B19001D_002', 'B19001D_003', 'B19001D_004', 'B19001D_005',
'B19001D_006', 'B19001D_007', 'B19001D_008', 'B19001D_009', 'B19001D_010', 'B19001D_011', 'B19001D_012',
'B19001D_013', 'B19001D_014', 'B19001D_015', 'B19001D_016', 'B19001D_017', 'B19001E_001', 'B19001E_002',
'B19001E_003', 'B19001E_004', 'B19001E_005', 'B19001E_006', 'B19001E_007', 'B19001E_008', 'B19001E_009',
'B19001E_010', 'B19001E_011', 'B19001E_012', 'B19001E_013', 'B19001E_014', 'B19001E_015', 'B19001E_016',
'B19001E_017', 'B19001F_001', 'B19001F_002', 'B19001F_003', 'B19001F_004', 'B19001F_005', 'B19001F_006',
'B19001F_007', 'B19001F_008', 'B19001F_009', 'B19001F_010', 'B19001F_011', 'B19001F_012', 'B19001F_013',
'B19001F_014', 'B19001F_015', 'B19001F_016', 'B19001F_017', 'B19001G_001', 'B19001G_002', 'B19001G_003',
'B19001G_004', 'B19001G_005', 'B19001G_006', 'B19001G_007', 'B19001G_008', 'B19001G_009', 'B19001G_010',
'B19001G_011', 'B19001G_012', 'B19001G_013', 'B19001G_014', 'B19001G_015', 'B19001G_016', 'B19001G_017',
'B19001H_001', 'B19001H_002', 'B19001H_003', 'B19001H_004', 'B19001H_005', 'B19001H_006', 'B19001H_007',
'B19001H_008', 'B19001H_009', 'B19001H_010', 'B19001H_011', 'B19001H_012', 'B19001H_013', 'B19001H_014',
'B19001H_015', 'B19001H_016', 'B19001H_017', 'B19001I_001', 'B19001I_002', 'B19001I_003', 'B19001I_004',
'B19001I_005', 'B19001I_006', 'B19001I_007', 'B19001I_008', 'B19001I_009', 'B19001I_010', 'B19001I_011',
'B19001I_012', 'B19001I_013', 'B19001I_014', 'B19001I_015', 'B19001I_016', 'B19001I_017')
incgroups <- c('Less than $10,000', '$10,000 to $14,999', '$15,000 to $19,999', '$20,000 to $24,999',
'$25,000 to $29,999', '$30,000 to $34,999', '$35,000 to $39,999', '$40,000 to $44,999',
'$45,000 to $49,999', '$50,000 to $59,999', '$60,000 to $74,999', '$75,000 to $99,999',
'$100,000 to $124,999', '$125,000 to $149,999', '$150,000 to $199,999', '$200,000 or more')
raceinc <- c(paste("White,", incgroups),
paste("Black,", incgroups),
paste("AIAN,", incgroups),
paste("Asian,", incgroups),
paste("NHPI,", incgroups),
paste("Other,", incgroups),
paste("Multi,", incgroups),
paste("White NH,", incgroups),
paste("Hispanic,", incgroups))
get_income_value_keys <- function(dframe) {
return_list_of_lists <- list(c(dframe[[1]], 0),
c(dframe[[2]], 10000),
c(dframe[[3]], 15000),
c(dframe[[4]], 20000),
c(dframe[[5]], 25000),
c(dframe[[6]], 30000),
c(dframe[[7]], 35000),
c(dframe[[8]], 40000),
c(dframe[[9]], 45000),
c(dframe[[10]], 50000),
c(dframe[[11]], 60000),
c(dframe[[12]], 75000),
c(dframe[[13]], 100000),
c(dframe[[14]], 125000),
c(dframe[[15]], 150000),
c(dframe[[16]], 200000))
return(return_list_of_lists)
}
get_pareto_median <- function(list_of_lists) {
# Make sure list is sorted in ascending order
data_list <- list_of_lists[order(sapply(list_of_lists, '[[', 2))]
# Pull out elements into separate lists to make things clearer later on.
counts <- lapply(data_list, '[[', 1)
bases <- lapply(data_list, '[[', 2)
# Calculate the "widths" of each group as the difference between the bases
widths = list()
for (i in seq_along(bases[1:length(names(bases))-1])){
val = bases[[i + 1]] - bases[[i]]
widths <- append(widths, val)
}
# The last element returns an error because i + 1 is out of bounds.
# Just append 1, since there is no effective upper bound.
# Could be fixed with a tryCatch().
widths <- append(widths, 1)
# Find the group containing the median, which will be the group at which
# the cumulative counts is greater than the sum of all of the counts divided by 2
target = Reduce("+",counts) / 2
cumulative_counts = 1
index = 1
while (cumulative_counts <= target) {
cumulative_counts = cumulative_counts + counts[[index]]
index = index + 1
}
index = index - 1
# Calculate the sum of all the groups prior to the one that contains the median
previous_groups_sum = cumulative_counts - counts[[index]]
# Last, estimate the median
return(bases[[index]] + ((Reduce("+",counts) / 2 - previous_groups_sum) / counts[[index]]) * widths[[index]])
}
# Query data
hhincome <- get_acs(geography = "tract", variables = race_income_vars_1016,
year = acs_year, state = acs_state, survey = 'acs5',
county = acs_county) %>%
mutate(stcnty = substr(x = GEOID, start = 1, stop = 5)) %>%
#filter(stcnty %in% c('41005', '41009', '41051', '41067', '41071', '53011', '53059'))
mutate(variable = str_replace(variable, 'B19001', ''),
race_code = substr(x = variable, start = 1, stop = 1),
variable = substr(x = variable, start = 3, stop = 5)) %>%
filter(variable != "001") %>%
arrange(GEOID, race_code, variable)
hhincome$group <- rep(raceinc, length(unique(hhincome$GEOID)))
hhincome2 <- hhincome %>%
separate(group, into = c("race", "income"), sep = ", ", remove = F) %>%
mutate(income = ordered(income, levels = c('Less than $10,000', '$10,000 to $14,999', '$15,000 to $19,999', '$20,000 to $24,999',
'$25,000 to $29,999', '$30,000 to $34,999', '$35,000 to $39,999', '$40,000 to $44,999',
'$45,000 to $49,999', '$50,000 to $59,999', '$60,000 to $74,999', '$75,000 to $99,999',
'$100,000 to $124,999', '$125,000 to $149,999', '$150,000 to $199,999', '$200,000 or more')),
race_rec = race,
race_rec = case_when(race == "Asian" | race == "NHPI" ~ "API",
race == "Other" | race == "Multi" ~ "Other",
TRUE ~ race),
poc_rec = case_when(race == "White" ~ "White",
race != "White NH" ~ "POC",
TRUE ~ "White NH")) %>%
filter(GEOID %in% vector_of_tract_fips)
### Return income for race codes that match 2010 values
hhincome_2010_compatible <- hhincome2 %>%
select(race, income, estimate) %>%
group_by(race, income) %>%
summarize_all(sum) %>%
ungroup() %>%
spread(key = income, value = estimate)
median_inc_2010_compatible_list <- list(list('AIAN'), list('Asian'), list('Black'), list('Hispanic'), list('Multi'), list('NHPI'), list('Other'), list('White'), list('White NH'))
print("Median household income by race (2010 version)")
for (i in seq_along(median_inc_2010_compatible_list[1:9])){
current_race <- as.character(median_inc_2010_compatible_list[[i]][[1]])
current_race_income <- hhincome_2010_compatible %>% filter(race == current_race) %>% select(-race)
current_income_value_keys <- get_income_value_keys(current_race_income)
current_med_hh_income <- get_pareto_median(current_income_value_keys)
median_inc_2010_compatible_list[[i]][2] <- round(current_med_hh_income, 0)
print(paste( median_inc_2010_compatible_list[[i]][1], ":", median_inc_2010_compatible_list[[i]][2]))
}
df_2010_compatible <- data.frame(matrix(unlist(median_inc_2010_compatible_list), nrow = 9, byrow = T))
names(df_2010_compatible)[1] <- "race"
names(df_2010_compatible)[2] <- "median_hh_income"
#return(df_2010_compatible)
### Return income for race codes that match 1990 values
hhincome_1990_compatible <- hhincome2 %>%
select(race_rec, income, estimate) %>%
group_by(race_rec, income) %>%
summarize_all(sum) %>%
ungroup() %>%
spread(key = income, value = estimate)
median_inc_1990_compatible_list <- list(list('AIAN'), list('API'), list('Black'), list('Hispanic'), list('Other'), list('White'), list('White NH'))
print("Median household income by race (1990 version)")
for (i in seq_along(median_inc_1990_compatible_list[1:7])){
current_race <- as.character(median_inc_1990_compatible_list[[i]][[1]])
current_race_income <- hhincome_1990_compatible %>% filter(race_rec == current_race) %>% select(-race_rec)
current_income_value_keys <- get_income_value_keys(current_race_income)
current_med_hh_income <- get_pareto_median(current_income_value_keys)
median_inc_1990_compatible_list[[i]][2] <- round(current_med_hh_income, 0)
print(paste(median_inc_1990_compatible_list[[i]][1], ":", median_inc_1990_compatible_list[[i]][2]))
}
df_1990_compatible <- data.frame(matrix(unlist(median_inc_1990_compatible_list), nrow = 7, byrow = T))
names(df_1990_compatible)[1] <- "race"
names(df_1990_compatible)[2] <- "median_hh_income"
#return(df_1990_compatible)
### Return income for white and PoC race categories (2010)
hhincome_2010_poc <- hhincome2 %>%
select(poc_rec, income, estimate) %>%
group_by(poc_rec, income) %>%
summarize_all(sum) %>%
ungroup() %>%
spread(key = income, value = estimate)
median_inc_2010_poc_list <- list(list('White'), list('POC'), list('White NH'))
print("Median household income by race (aggregated to PoC/White)")
for (i in seq_along(median_inc_2010_poc_list[1:3])){
current_race <- as.character(median_inc_2010_poc_list[[i]][[1]])
current_race_income <- hhincome_2010_poc %>% filter(poc_rec == current_race) %>% select(-poc_rec)
current_income_value_keys <- get_income_value_keys(current_race_income)
current_med_hh_income <- get_pareto_median(current_income_value_keys)
median_inc_2010_poc_list[[i]][2] <- round(current_med_hh_income, 0)
print(paste( median_inc_2010_poc_list[[i]][1], ":", median_inc_2010_poc_list[[i]][2]))
}
df_2010_poc <- data.frame(matrix(unlist(median_inc_2010_poc_list), nrow = 3, byrow = T))
names(df_2010_poc)[1] <- "race"
names(df_2010_poc)[2] <- "median_hh_income"
#return(df_2010_poc)
if (compatibility_mode == "1990"){
return(df_1990_compatible)
} else if (compatibility_mode == "2010"){
return(df_2010_compatible)
} else if (compatibility_mode == "POC"){
return(df_2010_poc)
} else {return("ERROR")}
}
icura_list <- c('41051002401' , '41051002402' , '41051002501' , '41051002502' , '41051003100' , '41051003200' , '41051003301' ,
'41051003302' , '41051003401' , '41051003402' , '41051003501' , '41051003502' , '41051003601' , '41051003602' ,
'41051003701' , '41051003702' , '41051003801' , '41051003802' , '41051003803' , '41051003901' , '41051003902' ,
'41051002303' , '41051002203', '41051002201', '41051002202', '41051002301', '41051002302')
# This returns error "object 'acs_year' not found"
# When acs_year is hard-coded, the same error occurs
# When I specify within get_acs() `year = 2016` then it fails at the second parameter, which is acs_county
med_hh_race_16_icura <- get_median_hh_income_by_race_acs(icura_list)
### The code below is essentially the bones of the function above. The function above has been generalized, of course
# but the parameters should all be the same, more or less.
race_income_16 <- get_acs(geography = "tract", variables = race_income_vars_1016,
year = 2016, state = c("OR", "WA"),
county = c("051", "005", "009", "067", "071", "011", "059"),
geometry = T) %>%
mutate(stcnty = substr(x = GEOID, start = 1, stop = 5)) %>%
filter(stcnty %in% c('41005', '41009', '41051', '41067', '41071', '53011', '53059'))
race_income_16_working <- race_income_16 %>%
mutate(variable = str_replace(variable, 'B19001', ''),
year = 2016,
race_code = substr(x = variable, start = 1, stop = 1),
variable = substr(x = variable, start = 3, stop = 5)) %>%
filter(variable != "001") %>%
arrange(GEOID, race_code, variable)
incgroups <- c('Less than $10,000', '$10,000 to $14,999', '$15,000 to $19,999', '$20,000 to $24,999',
'$25,000 to $29,999', '$30,000 to $34,999', '$35,000 to $39,999', '$40,000 to $44,999',
'$45,000 to $49,999', '$50,000 to $59,999', '$60,000 to $74,999', '$75,000 to $99,999',
'$100,000 to $124,999', '$125,000 to $149,999', '$150,000 to $199,999', '$200,000 or more')
raceinc <- c(paste("White,", incgroups),
paste("Black,", incgroups),
paste("AIAN,", incgroups),
paste("Asian,", incgroups),
paste("NHPI,", incgroups),
paste("Other,", incgroups),
paste("Multi,", incgroups),
paste("White NH,", incgroups),
paste("Hispanic,", incgroups))
race_income_16_working$group <- rep(raceinc, length(unique(race_income_16_working$GEOID)))
race_income_16_working <- race_income_16_working %>%
separate(group, into = c("race", "income"), sep = ", ", remove = F) %>%
mutate(income = ordered(income, levels = c('Less than $10,000', '$10,000 to $14,999', '$15,000 to $19,999', '$20,000 to $24,999',
'$25,000 to $29,999', '$30,000 to $34,999', '$35,000 to $39,999', '$40,000 to $44,999',
'$45,000 to $49,999', '$50,000 to $59,999', '$60,000 to $74,999', '$75,000 to $99,999',
'$100,000 to $124,999', '$125,000 to $149,999', '$150,000 to $199,999', '$200,000 or more')),
race_rec = race,
race_rec = case_when(race == "Asian" | race == "NHPI" ~ "API",
race == "Other" | race == "Multi" ~ "Other",
TRUE ~ race)) # This line recodes `race_rec` to `race` for all other instances
race_income_16_summary <- race_income_16_working %>%
filter(GEOID %in% icura_list) %>%
group_by(year, race_rec, income) %>%
summarize(total_hh = sum(estimate),
total_hh_m = moe_sum(moe = moe, estimate = estimate))
View(race_income_16_summary)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment