Skip to content

Instantly share code, notes, and snippets.

@plnnr
Last active August 20, 2018 17:29
Show Gist options
  • Save plnnr/8a218345425c13a47078a83b331d0df1 to your computer and use it in GitHub Desktop.
Save plnnr/8a218345425c13a47078a83b331d0df1 to your computer and use it in GitHub Desktop.
Get median income by race for the year 2000.
if(!require(pacman)){install.packages(pacman);library(pacman)}
p_load(tidyverse, tidycensus, rio, viridis, sf, leaflet, ggthemes)
# This function does not work with passing vectors to function parameters (e.g., dec_state and dec_county)
get_median_hh_income_by_race_2000 <- function(vector_of_tract_fips, compatibility_mode = "1990", dec_state=c("OR", "WA"), dec_county = c("051", "005", "009", "067", "071", "011", "059")) {
if(!require(pacman)){install.packages(pacman);library(pacman)}
p_load(tidycensus, tidyverse)
race_income_vars_00 <- c('P151A001', 'P151A002', 'P151A003', 'P151A004', 'P151A005', 'P151A006', 'P151A007', 'P151A008',
'P151A009', 'P151A010', 'P151A011', 'P151A012', 'P151A013', 'P151A014', 'P151A015', 'P151A016',
'P151A017', 'P151B001', 'P151B002', 'P151B003', 'P151B004', 'P151B005', 'P151B006', 'P151B007',
'P151B008', 'P151B009', 'P151B010', 'P151B011', 'P151B012', 'P151B013', 'P151B014', 'P151B015',
'P151B016', 'P151B017', 'P151C001', 'P151C002', 'P151C003', 'P151C004', 'P151C005', 'P151C006',
'P151C007', 'P151C008', 'P151C009', 'P151C010', 'P151C011', 'P151C012', 'P151C013', 'P151C014',
'P151C015', 'P151C016', 'P151C017', 'P151D001', 'P151D002', 'P151D003', 'P151D004', 'P151D005',
'P151D006', 'P151D007', 'P151D008', 'P151D009', 'P151D010', 'P151D011', 'P151D012', 'P151D013',
'P151D014', 'P151D015', 'P151D016', 'P151D017', 'P151E001', 'P151E002', 'P151E003', 'P151E004',
'P151E005', 'P151E006', 'P151E007', 'P151E008', 'P151E009', 'P151E010', 'P151E011', 'P151E012',
'P151E013', 'P151E014', 'P151E015', 'P151E016', 'P151E017', 'P151F001', 'P151F002', 'P151F003',
'P151F004', 'P151F005', 'P151F006', 'P151F007', 'P151F008', 'P151F009', 'P151F010', 'P151F011',
'P151F012', 'P151F013', 'P151F014', 'P151F015', 'P151F016', 'P151F017', 'P151G001', 'P151G002',
'P151G003', 'P151G004', 'P151G005', 'P151G006', 'P151G007', 'P151G008', 'P151G009', 'P151G010',
'P151G011', 'P151G012', 'P151G013', 'P151G014', 'P151G015', 'P151G016', 'P151G017', 'P151H001',
'P151H002', 'P151H003', 'P151H004', 'P151H005', 'P151H006', 'P151H007', 'P151H008', 'P151H009',
'P151H010', 'P151H011', 'P151H012', 'P151H013', 'P151H014', 'P151H015', 'P151H016', 'P151H017',
'P151I001', 'P151I002', 'P151I003', 'P151I004', 'P151I005', 'P151I006', 'P151I007', 'P151I008',
'P151I009', 'P151I010', 'P151I011', 'P151I012', 'P151I013', 'P151I014', 'P151I015', 'P151I016',
'P151I017')
incgroups <- c('Less than $10,000', '$10,000 to $14,999', '$15,000 to $19,999', '$20,000 to $24,999',
'$25,000 to $29,999', '$30,000 to $34,999', '$35,000 to $39,999', '$40,000 to $44,999',
'$45,000 to $49,999', '$50,000 to $59,999', '$60,000 to $74,999', '$75,000 to $99,999',
'$100,000 to $124,999', '$125,000 to $149,999', '$150,000 to $199,999', '$200,000 or more')
raceinc <- c(paste("White,", incgroups),
paste("Black,", incgroups),
paste("AIAN,", incgroups),
paste("Asian,", incgroups),
paste("NHPI,", incgroups),
paste("Other,", incgroups),
paste("Multi,", incgroups),
paste("Hispanic,", incgroups),
paste("White NH,", incgroups))
get_income_value_keys <- function(dframe) {
return_list_of_lists <- list(c(dframe[[1]], 0),
c(dframe[[2]], 10000),
c(dframe[[3]], 15000),
c(dframe[[4]], 20000),
c(dframe[[5]], 25000),
c(dframe[[6]], 30000),
c(dframe[[7]], 35000),
c(dframe[[8]], 40000),
c(dframe[[9]], 45000),
c(dframe[[10]], 50000),
c(dframe[[11]], 60000),
c(dframe[[12]], 75000),
c(dframe[[13]], 100000),
c(dframe[[14]], 125000),
c(dframe[[15]], 150000),
c(dframe[[16]], 200000))
return(return_list_of_lists)
}
get_pareto_median <- function(list_of_lists) {
# Make sure list is sorted in ascending order
data_list <- list_of_lists[order(sapply(list_of_lists, '[[', 2))]
# Pull out elements into separate lists to make things clearer later on.
counts <- lapply(data_list, '[[', 1)
bases <- lapply(data_list, '[[', 2)
# Calculate the "widths" of each group as the difference between the bases
widths = list()
for (i in seq_along(bases[1:length(names(bases))-1])){
val = bases[[i + 1]] - bases[[i]]
widths <- append(widths, val)
}
# The last element returns an error because i + 1 is out of bounds.
# Just append 1, since there is no effective upper bound.
# Could be fixed with a tryCatch().
widths <- append(widths, 1)
# Find the group containing the median, which will be the group at which
# the cumulative counts is greater than the sum of all of the counts divided by 2
target = Reduce("+",counts) / 2
cumulative_counts = 1
index = 1
while (cumulative_counts <= target) {
cumulative_counts = cumulative_counts + counts[[index]]
index = index + 1
}
index = index - 1
# Calculate the sum of all the groups prior to the one that contains the median
previous_groups_sum = cumulative_counts - counts[[index]]
# Last, estimate the median
return(bases[[index]] + ((Reduce("+",counts) / 2 - previous_groups_sum) / counts[[index]]) * widths[[index]])
}
# Query the data
hhincome <- get_decennial(geography = "tract", variables = race_income_vars_00,
year = 2000, sumfile = "sf3", state = dec_state,
county = dec_county,
geometry = F) %>%
mutate(stcnty = substr(x = GEOID, start = 1, stop = 5)) %>%
# filter(stcnty %in% c('41005', '41009', '41051', '41067', '41071', '53011', '53059')) %>%
mutate(variable = str_replace(variable, 'P151', ''),
race_code = substr(x = variable, start = 1, stop = 1),
variable = substr(x = variable, start = 2, stop = 4)) %>%
filter(variable != "001") %>%
arrange(GEOID, race_code, variable)
hhincome$group <- rep(raceinc, length(unique(hhincome$GEOID)))
hhincome2 <- hhincome %>%
separate(group, into = c("race", "income"), sep = ", ", remove = F) %>%
mutate(income = ordered(income, levels = c('Less than $10,000', '$10,000 to $14,999', '$15,000 to $19,999', '$20,000 to $24,999',
'$25,000 to $29,999', '$30,000 to $34,999', '$35,000 to $39,999', '$40,000 to $44,999',
'$45,000 to $49,999', '$50,000 to $59,999', '$60,000 to $74,999', '$75,000 to $99,999',
'$100,000 to $124,999', '$125,000 to $149,999', '$150,000 to $199,999', '$200,000 or more')),
race_rec = race,
race_rec = case_when(race == "Asian" | race == "NHPI" ~ "API",
race == "Other" | race == "Multi" ~ "Other",
TRUE ~ race),
poc_rec = case_when(race == "White" ~ "White",
race != "White NH" ~ "POC",
TRUE ~ "White NH")) %>% # This line recodes `race_rec` to `race` for all other instances
filter(GEOID %in% vector_of_tract_fips)
### Return income for race codes that match 2010 values
hhincome_2010_compatible <- hhincome2 %>%
select(race, income, value) %>%
group_by(race, income) %>%
summarize_all(sum) %>%
ungroup() %>%
spread(key = income, value = value)
median_inc_2010_compatible_list <- list(list('AIAN'), list('Asian'), list('Black'), list('Hispanic'), list('Multi'), list('NHPI'), list('Other'), list('White'), list('White NH'))
print("Median household income by race (2010 version)")
for (i in seq_along(median_inc_2010_compatible_list[1:9])){
current_race <- as.character(median_inc_2010_compatible_list[[i]][[1]])
current_race_income <- hhincome_2010_compatible %>% filter(race == current_race) %>% select(-race)
current_income_value_keys <- get_income_value_keys(current_race_income)
current_med_hh_income <- get_pareto_median(current_income_value_keys)
median_inc_2010_compatible_list[[i]][2] <- round(current_med_hh_income, 0)
print(paste( median_inc_2010_compatible_list[[i]][1], ":", median_inc_2010_compatible_list[[i]][2]))
}
df_2010_compatible <- data.frame(matrix(unlist(median_inc_2010_compatible_list), nrow = 9, byrow = T))
names(df_2010_compatible)[1] <- "race"
names(df_2010_compatible)[2] <- "median_hh_income"
#return(df_2010_compatible)
### Return income for race codes that match 1990 values
hhincome_1990_compatible <- hhincome2 %>%
select(race_rec, income, value) %>%
group_by(race_rec, income) %>%
summarize_all(sum) %>%
ungroup() %>%
spread(key = income, value = value)
median_inc_1990_compatible_list <- list(list('AIAN'), list('API'), list('Black'), list('Hispanic'), list('Other'), list('White'), list('White NH'))
print("Median household income by race (1990 version)")
for (i in seq_along(median_inc_1990_compatible_list[1:7])){
current_race <- as.character(median_inc_1990_compatible_list[[i]][[1]])
current_race_income <- hhincome_1990_compatible %>% filter(race_rec == current_race) %>% select(-race_rec)
current_income_value_keys <- get_income_value_keys(current_race_income)
current_med_hh_income <- get_pareto_median(current_income_value_keys)
median_inc_1990_compatible_list[[i]][2] <- round(current_med_hh_income, 0)
print(paste(median_inc_1990_compatible_list[[i]][1], ":", median_inc_1990_compatible_list[[i]][2]))
}
df_1990_compatible <- data.frame(matrix(unlist(median_inc_1990_compatible_list), nrow = 7, byrow = T))
names(df_1990_compatible)[1] <- "race"
names(df_1990_compatible)[2] <- "median_hh_income"
#return(df_1990_compatible)
### Return income for white and PoC race categories (2000)
hhincome_2000_poc <- hhincome2 %>%
select(poc_rec, income, value) %>%
group_by(poc_rec, income) %>%
summarize_all(sum) %>%
ungroup() %>%
spread(key = income, value = value)
median_inc_2000_poc_list <- list(list('White'), list('POC'), list('White NH'))
print("Median household income by race (aggregated to PoC/White)")
for (i in seq_along(median_inc_2000_poc_list[1:3])){
current_race <- as.character(median_inc_2000_poc_list[[i]][[1]])
current_race_income <- hhincome_2000_poc %>% filter(poc_rec == current_race) %>% select(-poc_rec)
current_income_value_keys <- get_income_value_keys(current_race_income)
current_med_hh_income <- get_pareto_median(current_income_value_keys)
median_inc_2000_poc_list[[i]][2] <- round(current_med_hh_income, 0)
print(paste( median_inc_2000_poc_list[[i]][1], ":", median_inc_2000_poc_list[[i]][2]))
}
df_2000_poc <- data.frame(matrix(unlist(median_inc_2000_poc_list), nrow = 3, byrow = T))
names(df_2000_poc)[1] <- "race"
names(df_2000_poc)[2] <- "median_hh_income"
#return(df_2000_poc)
if (compatibility_mode == "1990"){
return(df_1990_compatible)
} else if (compatibility_mode == "2010"){
return(df_2010_compatible)
} else if (compatibility_mode == "POC"){
return(df_2000_poc)
} else {return("ERROR")}
}
icura_list <- c('41051002401' , '41051002402' , '41051002501' , '41051002502' , '41051003100' , '41051003200' , '41051003301' ,
'41051003302' , '41051003401' , '41051003402' , '41051003501' , '41051003502' , '41051003601' , '41051003602' ,
'41051003701' , '41051003702' , '41051003801' , '41051003802' , '41051003803' , '41051003901' , '41051003902' ,
'41051002303' , '41051002203', '41051002201', '41051002202', '41051002301', '41051002302')
# This throws an error that "dec_county" not found
med_hh_race_00_icura <- get_median_hh_income_by_race_2000(icura_list, compatibility_mode = "1990", dec_state = c("OR", "WA"), dec_county = c("051", "005", "009", "067", "071", "011", "059"))
# This works as expected
med_hh_race_00_icura <- get_median_hh_income_by_race_2000(icura_list, compatibility_mode = "1990", dec_state = "OR", dec_county = "051")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment