Last active
August 20, 2018 19:36
-
-
Save plnnr/d6809b5d2c9ee42401da26fe1ecfb2ec to your computer and use it in GitHub Desktop.
Get household income by race for ACS data (at least 2010 onward) by passing a list of tract FIPS codes to the function.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## Get median HH income by race for 2010 and 2016 | |
# Returns an error that function parameter "acs_year" is not found. | |
get_median_hh_income_by_race_acs <- function(vector_of_tract_fips, compatibility_mode = "1990", acs_year = 2016, acs_state=c("OR", "WA"), acs_county = c("051", "005", "009", "067", "071", "011", "059")) { | |
if(!require(pacman)){install.packages(pacman);library(pacman)} | |
p_load(tidycensus, tidyverse) | |
race_income_vars_1016 <- c('B19001A_001', 'B19001A_002', 'B19001A_003', 'B19001A_004', 'B19001A_005', 'B19001A_006', 'B19001A_007', | |
'B19001A_008', 'B19001A_009', 'B19001A_010', 'B19001A_011', 'B19001A_012', 'B19001A_013', 'B19001A_014', | |
'B19001A_015', 'B19001A_016', 'B19001A_017', 'B19001B_001', 'B19001B_002', 'B19001B_003', 'B19001B_004', | |
'B19001B_005', 'B19001B_006', 'B19001B_007', 'B19001B_008', 'B19001B_009', 'B19001B_010', 'B19001B_011', | |
'B19001B_012', 'B19001B_013', 'B19001B_014', 'B19001B_015', 'B19001B_016', 'B19001B_017', 'B19001C_001', | |
'B19001C_002', 'B19001C_003', 'B19001C_004', 'B19001C_005', 'B19001C_006', 'B19001C_007', 'B19001C_008', | |
'B19001C_009', 'B19001C_010', 'B19001C_011', 'B19001C_012', 'B19001C_013', 'B19001C_014', 'B19001C_015', | |
'B19001C_016', 'B19001C_017', 'B19001D_001', 'B19001D_002', 'B19001D_003', 'B19001D_004', 'B19001D_005', | |
'B19001D_006', 'B19001D_007', 'B19001D_008', 'B19001D_009', 'B19001D_010', 'B19001D_011', 'B19001D_012', | |
'B19001D_013', 'B19001D_014', 'B19001D_015', 'B19001D_016', 'B19001D_017', 'B19001E_001', 'B19001E_002', | |
'B19001E_003', 'B19001E_004', 'B19001E_005', 'B19001E_006', 'B19001E_007', 'B19001E_008', 'B19001E_009', | |
'B19001E_010', 'B19001E_011', 'B19001E_012', 'B19001E_013', 'B19001E_014', 'B19001E_015', 'B19001E_016', | |
'B19001E_017', 'B19001F_001', 'B19001F_002', 'B19001F_003', 'B19001F_004', 'B19001F_005', 'B19001F_006', | |
'B19001F_007', 'B19001F_008', 'B19001F_009', 'B19001F_010', 'B19001F_011', 'B19001F_012', 'B19001F_013', | |
'B19001F_014', 'B19001F_015', 'B19001F_016', 'B19001F_017', 'B19001G_001', 'B19001G_002', 'B19001G_003', | |
'B19001G_004', 'B19001G_005', 'B19001G_006', 'B19001G_007', 'B19001G_008', 'B19001G_009', 'B19001G_010', | |
'B19001G_011', 'B19001G_012', 'B19001G_013', 'B19001G_014', 'B19001G_015', 'B19001G_016', 'B19001G_017', | |
'B19001H_001', 'B19001H_002', 'B19001H_003', 'B19001H_004', 'B19001H_005', 'B19001H_006', 'B19001H_007', | |
'B19001H_008', 'B19001H_009', 'B19001H_010', 'B19001H_011', 'B19001H_012', 'B19001H_013', 'B19001H_014', | |
'B19001H_015', 'B19001H_016', 'B19001H_017', 'B19001I_001', 'B19001I_002', 'B19001I_003', 'B19001I_004', | |
'B19001I_005', 'B19001I_006', 'B19001I_007', 'B19001I_008', 'B19001I_009', 'B19001I_010', 'B19001I_011', | |
'B19001I_012', 'B19001I_013', 'B19001I_014', 'B19001I_015', 'B19001I_016', 'B19001I_017') | |
incgroups <- c('Less than $10,000', '$10,000 to $14,999', '$15,000 to $19,999', '$20,000 to $24,999', | |
'$25,000 to $29,999', '$30,000 to $34,999', '$35,000 to $39,999', '$40,000 to $44,999', | |
'$45,000 to $49,999', '$50,000 to $59,999', '$60,000 to $74,999', '$75,000 to $99,999', | |
'$100,000 to $124,999', '$125,000 to $149,999', '$150,000 to $199,999', '$200,000 or more') | |
raceinc <- c(paste("White,", incgroups), | |
paste("Black,", incgroups), | |
paste("AIAN,", incgroups), | |
paste("Asian,", incgroups), | |
paste("NHPI,", incgroups), | |
paste("Other,", incgroups), | |
paste("Multi,", incgroups), | |
paste("White NH,", incgroups), | |
paste("Hispanic,", incgroups)) | |
get_income_value_keys <- function(dframe) { | |
return_list_of_lists <- list(c(dframe[[1]], 0), | |
c(dframe[[2]], 10000), | |
c(dframe[[3]], 15000), | |
c(dframe[[4]], 20000), | |
c(dframe[[5]], 25000), | |
c(dframe[[6]], 30000), | |
c(dframe[[7]], 35000), | |
c(dframe[[8]], 40000), | |
c(dframe[[9]], 45000), | |
c(dframe[[10]], 50000), | |
c(dframe[[11]], 60000), | |
c(dframe[[12]], 75000), | |
c(dframe[[13]], 100000), | |
c(dframe[[14]], 125000), | |
c(dframe[[15]], 150000), | |
c(dframe[[16]], 200000)) | |
return(return_list_of_lists) | |
} | |
get_pareto_median <- function(list_of_lists) { | |
# Make sure list is sorted in ascending order | |
data_list <- list_of_lists[order(sapply(list_of_lists, '[[', 2))] | |
# Pull out elements into separate lists to make things clearer later on. | |
counts <- lapply(data_list, '[[', 1) | |
bases <- lapply(data_list, '[[', 2) | |
# Calculate the "widths" of each group as the difference between the bases | |
widths = list() | |
for (i in seq_along(bases[1:length(names(bases))-1])){ | |
val = bases[[i + 1]] - bases[[i]] | |
widths <- append(widths, val) | |
} | |
# The last element returns an error because i + 1 is out of bounds. | |
# Just append 1, since there is no effective upper bound. | |
# Could be fixed with a tryCatch(). | |
widths <- append(widths, 1) | |
# Find the group containing the median, which will be the group at which | |
# the cumulative counts is greater than the sum of all of the counts divided by 2 | |
target = Reduce("+",counts) / 2 | |
cumulative_counts = 1 | |
index = 1 | |
while (cumulative_counts <= target) { | |
cumulative_counts = cumulative_counts + counts[[index]] | |
index = index + 1 | |
} | |
index = index - 1 | |
# Calculate the sum of all the groups prior to the one that contains the median | |
previous_groups_sum = cumulative_counts - counts[[index]] | |
# Last, estimate the median | |
return(bases[[index]] + ((Reduce("+",counts) / 2 - previous_groups_sum) / counts[[index]]) * widths[[index]]) | |
} | |
# Query data | |
hhincome <- get_acs(geography = "tract", variables = race_income_vars_1016, | |
year = acs_year, state = acs_state, survey = 'acs5', | |
county = acs_county) %>% | |
mutate(stcnty = substr(x = GEOID, start = 1, stop = 5)) %>% | |
#filter(stcnty %in% c('41005', '41009', '41051', '41067', '41071', '53011', '53059')) | |
mutate(variable = str_replace(variable, 'B19001', ''), | |
race_code = substr(x = variable, start = 1, stop = 1), | |
variable = substr(x = variable, start = 3, stop = 5)) %>% | |
filter(variable != "001") %>% | |
arrange(GEOID, race_code, variable) | |
hhincome$group <- rep(raceinc, length(unique(hhincome$GEOID))) | |
hhincome2 <- hhincome %>% | |
separate(group, into = c("race", "income"), sep = ", ", remove = F) %>% | |
mutate(income = ordered(income, levels = c('Less than $10,000', '$10,000 to $14,999', '$15,000 to $19,999', '$20,000 to $24,999', | |
'$25,000 to $29,999', '$30,000 to $34,999', '$35,000 to $39,999', '$40,000 to $44,999', | |
'$45,000 to $49,999', '$50,000 to $59,999', '$60,000 to $74,999', '$75,000 to $99,999', | |
'$100,000 to $124,999', '$125,000 to $149,999', '$150,000 to $199,999', '$200,000 or more')), | |
race_rec = race, | |
race_rec = case_when(race == "Asian" | race == "NHPI" ~ "API", | |
race == "Other" | race == "Multi" ~ "Other", | |
TRUE ~ race), | |
poc_rec = case_when(race == "White" ~ "White", | |
race != "White NH" ~ "POC", | |
TRUE ~ "White NH")) %>% | |
filter(GEOID %in% vector_of_tract_fips) | |
### Return income for race codes that match 2010 values | |
hhincome_2010_compatible <- hhincome2 %>% | |
select(race, income, estimate) %>% | |
group_by(race, income) %>% | |
summarize_all(sum) %>% | |
ungroup() %>% | |
spread(key = income, value = estimate) | |
median_inc_2010_compatible_list <- list(list('AIAN'), list('Asian'), list('Black'), list('Hispanic'), list('Multi'), list('NHPI'), list('Other'), list('White'), list('White NH')) | |
print("Median household income by race (2010 version)") | |
for (i in seq_along(median_inc_2010_compatible_list[1:9])){ | |
current_race <- as.character(median_inc_2010_compatible_list[[i]][[1]]) | |
current_race_income <- hhincome_2010_compatible %>% filter(race == current_race) %>% select(-race) | |
current_income_value_keys <- get_income_value_keys(current_race_income) | |
current_med_hh_income <- get_pareto_median(current_income_value_keys) | |
median_inc_2010_compatible_list[[i]][2] <- round(current_med_hh_income, 0) | |
print(paste( median_inc_2010_compatible_list[[i]][1], ":", median_inc_2010_compatible_list[[i]][2])) | |
} | |
df_2010_compatible <- data.frame(matrix(unlist(median_inc_2010_compatible_list), nrow = 9, byrow = T)) | |
names(df_2010_compatible)[1] <- "race" | |
names(df_2010_compatible)[2] <- "median_hh_income" | |
#return(df_2010_compatible) | |
### Return income for race codes that match 1990 values | |
hhincome_1990_compatible <- hhincome2 %>% | |
select(race_rec, income, estimate) %>% | |
group_by(race_rec, income) %>% | |
summarize_all(sum) %>% | |
ungroup() %>% | |
spread(key = income, value = estimate) | |
median_inc_1990_compatible_list <- list(list('AIAN'), list('API'), list('Black'), list('Hispanic'), list('Other'), list('White'), list('White NH')) | |
print("Median household income by race (1990 version)") | |
for (i in seq_along(median_inc_1990_compatible_list[1:7])){ | |
current_race <- as.character(median_inc_1990_compatible_list[[i]][[1]]) | |
current_race_income <- hhincome_1990_compatible %>% filter(race_rec == current_race) %>% select(-race_rec) | |
current_income_value_keys <- get_income_value_keys(current_race_income) | |
current_med_hh_income <- get_pareto_median(current_income_value_keys) | |
median_inc_1990_compatible_list[[i]][2] <- round(current_med_hh_income, 0) | |
print(paste(median_inc_1990_compatible_list[[i]][1], ":", median_inc_1990_compatible_list[[i]][2])) | |
} | |
df_1990_compatible <- data.frame(matrix(unlist(median_inc_1990_compatible_list), nrow = 7, byrow = T)) | |
names(df_1990_compatible)[1] <- "race" | |
names(df_1990_compatible)[2] <- "median_hh_income" | |
#return(df_1990_compatible) | |
### Return income for white and PoC race categories (2010) | |
hhincome_2010_poc <- hhincome2 %>% | |
select(poc_rec, income, estimate) %>% | |
group_by(poc_rec, income) %>% | |
summarize_all(sum) %>% | |
ungroup() %>% | |
spread(key = income, value = estimate) | |
median_inc_2010_poc_list <- list(list('White'), list('POC'), list('White NH')) | |
print("Median household income by race (aggregated to PoC/White)") | |
for (i in seq_along(median_inc_2010_poc_list[1:3])){ | |
current_race <- as.character(median_inc_2010_poc_list[[i]][[1]]) | |
current_race_income <- hhincome_2010_poc %>% filter(poc_rec == current_race) %>% select(-poc_rec) | |
current_income_value_keys <- get_income_value_keys(current_race_income) | |
current_med_hh_income <- get_pareto_median(current_income_value_keys) | |
median_inc_2010_poc_list[[i]][2] <- round(current_med_hh_income, 0) | |
print(paste( median_inc_2010_poc_list[[i]][1], ":", median_inc_2010_poc_list[[i]][2])) | |
} | |
df_2010_poc <- data.frame(matrix(unlist(median_inc_2010_poc_list), nrow = 3, byrow = T)) | |
names(df_2010_poc)[1] <- "race" | |
names(df_2010_poc)[2] <- "median_hh_income" | |
#return(df_2010_poc) | |
if (compatibility_mode == "1990"){ | |
return(df_1990_compatible) | |
} else if (compatibility_mode == "2010"){ | |
return(df_2010_compatible) | |
} else if (compatibility_mode == "POC"){ | |
return(df_2010_poc) | |
} else {return("ERROR")} | |
} | |
icura_list <- c('41051002401' , '41051002402' , '41051002501' , '41051002502' , '41051003100' , '41051003200' , '41051003301' , | |
'41051003302' , '41051003401' , '41051003402' , '41051003501' , '41051003502' , '41051003601' , '41051003602' , | |
'41051003701' , '41051003702' , '41051003801' , '41051003802' , '41051003803' , '41051003901' , '41051003902' , | |
'41051002303' , '41051002203', '41051002201', '41051002202', '41051002301', '41051002302') | |
# This returns error "object 'acs_year' not found" | |
# When acs_year is hard-coded, the same error occurs | |
# When I specify within get_acs() `year = 2016` then it fails at the second parameter, which is acs_county | |
med_hh_race_16_icura <- get_median_hh_income_by_race_acs(icura_list) | |
### The code below is essentially the bones of the function above. The function above has been generalized, of course | |
# but the parameters should all be the same, more or less. | |
race_income_16 <- get_acs(geography = "tract", variables = race_income_vars_1016, | |
year = 2016, state = c("OR", "WA"), | |
county = c("051", "005", "009", "067", "071", "011", "059"), | |
geometry = T) %>% | |
mutate(stcnty = substr(x = GEOID, start = 1, stop = 5)) %>% | |
filter(stcnty %in% c('41005', '41009', '41051', '41067', '41071', '53011', '53059')) | |
race_income_16_working <- race_income_16 %>% | |
mutate(variable = str_replace(variable, 'B19001', ''), | |
year = 2016, | |
race_code = substr(x = variable, start = 1, stop = 1), | |
variable = substr(x = variable, start = 3, stop = 5)) %>% | |
filter(variable != "001") %>% | |
arrange(GEOID, race_code, variable) | |
incgroups <- c('Less than $10,000', '$10,000 to $14,999', '$15,000 to $19,999', '$20,000 to $24,999', | |
'$25,000 to $29,999', '$30,000 to $34,999', '$35,000 to $39,999', '$40,000 to $44,999', | |
'$45,000 to $49,999', '$50,000 to $59,999', '$60,000 to $74,999', '$75,000 to $99,999', | |
'$100,000 to $124,999', '$125,000 to $149,999', '$150,000 to $199,999', '$200,000 or more') | |
raceinc <- c(paste("White,", incgroups), | |
paste("Black,", incgroups), | |
paste("AIAN,", incgroups), | |
paste("Asian,", incgroups), | |
paste("NHPI,", incgroups), | |
paste("Other,", incgroups), | |
paste("Multi,", incgroups), | |
paste("White NH,", incgroups), | |
paste("Hispanic,", incgroups)) | |
race_income_16_working$group <- rep(raceinc, length(unique(race_income_16_working$GEOID))) | |
race_income_16_working <- race_income_16_working %>% | |
separate(group, into = c("race", "income"), sep = ", ", remove = F) %>% | |
mutate(income = ordered(income, levels = c('Less than $10,000', '$10,000 to $14,999', '$15,000 to $19,999', '$20,000 to $24,999', | |
'$25,000 to $29,999', '$30,000 to $34,999', '$35,000 to $39,999', '$40,000 to $44,999', | |
'$45,000 to $49,999', '$50,000 to $59,999', '$60,000 to $74,999', '$75,000 to $99,999', | |
'$100,000 to $124,999', '$125,000 to $149,999', '$150,000 to $199,999', '$200,000 or more')), | |
race_rec = race, | |
race_rec = case_when(race == "Asian" | race == "NHPI" ~ "API", | |
race == "Other" | race == "Multi" ~ "Other", | |
TRUE ~ race)) # This line recodes `race_rec` to `race` for all other instances | |
race_income_16_summary <- race_income_16_working %>% | |
filter(GEOID %in% icura_list) %>% | |
group_by(year, race_rec, income) %>% | |
summarize(total_hh = sum(estimate), | |
total_hh_m = moe_sum(moe = moe, estimate = estimate)) | |
View(race_income_16_summary) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment