Last active
August 20, 2018 17:29
-
-
Save plnnr/8a218345425c13a47078a83b331d0df1 to your computer and use it in GitHub Desktop.
Get median income by race for the year 2000.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
if(!require(pacman)){install.packages(pacman);library(pacman)} | |
p_load(tidyverse, tidycensus, rio, viridis, sf, leaflet, ggthemes) | |
# This function does not work with passing vectors to function parameters (e.g., dec_state and dec_county) | |
get_median_hh_income_by_race_2000 <- function(vector_of_tract_fips, compatibility_mode = "1990", dec_state=c("OR", "WA"), dec_county = c("051", "005", "009", "067", "071", "011", "059")) { | |
if(!require(pacman)){install.packages(pacman);library(pacman)} | |
p_load(tidycensus, tidyverse) | |
race_income_vars_00 <- c('P151A001', 'P151A002', 'P151A003', 'P151A004', 'P151A005', 'P151A006', 'P151A007', 'P151A008', | |
'P151A009', 'P151A010', 'P151A011', 'P151A012', 'P151A013', 'P151A014', 'P151A015', 'P151A016', | |
'P151A017', 'P151B001', 'P151B002', 'P151B003', 'P151B004', 'P151B005', 'P151B006', 'P151B007', | |
'P151B008', 'P151B009', 'P151B010', 'P151B011', 'P151B012', 'P151B013', 'P151B014', 'P151B015', | |
'P151B016', 'P151B017', 'P151C001', 'P151C002', 'P151C003', 'P151C004', 'P151C005', 'P151C006', | |
'P151C007', 'P151C008', 'P151C009', 'P151C010', 'P151C011', 'P151C012', 'P151C013', 'P151C014', | |
'P151C015', 'P151C016', 'P151C017', 'P151D001', 'P151D002', 'P151D003', 'P151D004', 'P151D005', | |
'P151D006', 'P151D007', 'P151D008', 'P151D009', 'P151D010', 'P151D011', 'P151D012', 'P151D013', | |
'P151D014', 'P151D015', 'P151D016', 'P151D017', 'P151E001', 'P151E002', 'P151E003', 'P151E004', | |
'P151E005', 'P151E006', 'P151E007', 'P151E008', 'P151E009', 'P151E010', 'P151E011', 'P151E012', | |
'P151E013', 'P151E014', 'P151E015', 'P151E016', 'P151E017', 'P151F001', 'P151F002', 'P151F003', | |
'P151F004', 'P151F005', 'P151F006', 'P151F007', 'P151F008', 'P151F009', 'P151F010', 'P151F011', | |
'P151F012', 'P151F013', 'P151F014', 'P151F015', 'P151F016', 'P151F017', 'P151G001', 'P151G002', | |
'P151G003', 'P151G004', 'P151G005', 'P151G006', 'P151G007', 'P151G008', 'P151G009', 'P151G010', | |
'P151G011', 'P151G012', 'P151G013', 'P151G014', 'P151G015', 'P151G016', 'P151G017', 'P151H001', | |
'P151H002', 'P151H003', 'P151H004', 'P151H005', 'P151H006', 'P151H007', 'P151H008', 'P151H009', | |
'P151H010', 'P151H011', 'P151H012', 'P151H013', 'P151H014', 'P151H015', 'P151H016', 'P151H017', | |
'P151I001', 'P151I002', 'P151I003', 'P151I004', 'P151I005', 'P151I006', 'P151I007', 'P151I008', | |
'P151I009', 'P151I010', 'P151I011', 'P151I012', 'P151I013', 'P151I014', 'P151I015', 'P151I016', | |
'P151I017') | |
incgroups <- c('Less than $10,000', '$10,000 to $14,999', '$15,000 to $19,999', '$20,000 to $24,999', | |
'$25,000 to $29,999', '$30,000 to $34,999', '$35,000 to $39,999', '$40,000 to $44,999', | |
'$45,000 to $49,999', '$50,000 to $59,999', '$60,000 to $74,999', '$75,000 to $99,999', | |
'$100,000 to $124,999', '$125,000 to $149,999', '$150,000 to $199,999', '$200,000 or more') | |
raceinc <- c(paste("White,", incgroups), | |
paste("Black,", incgroups), | |
paste("AIAN,", incgroups), | |
paste("Asian,", incgroups), | |
paste("NHPI,", incgroups), | |
paste("Other,", incgroups), | |
paste("Multi,", incgroups), | |
paste("Hispanic,", incgroups), | |
paste("White NH,", incgroups)) | |
get_income_value_keys <- function(dframe) { | |
return_list_of_lists <- list(c(dframe[[1]], 0), | |
c(dframe[[2]], 10000), | |
c(dframe[[3]], 15000), | |
c(dframe[[4]], 20000), | |
c(dframe[[5]], 25000), | |
c(dframe[[6]], 30000), | |
c(dframe[[7]], 35000), | |
c(dframe[[8]], 40000), | |
c(dframe[[9]], 45000), | |
c(dframe[[10]], 50000), | |
c(dframe[[11]], 60000), | |
c(dframe[[12]], 75000), | |
c(dframe[[13]], 100000), | |
c(dframe[[14]], 125000), | |
c(dframe[[15]], 150000), | |
c(dframe[[16]], 200000)) | |
return(return_list_of_lists) | |
} | |
get_pareto_median <- function(list_of_lists) { | |
# Make sure list is sorted in ascending order | |
data_list <- list_of_lists[order(sapply(list_of_lists, '[[', 2))] | |
# Pull out elements into separate lists to make things clearer later on. | |
counts <- lapply(data_list, '[[', 1) | |
bases <- lapply(data_list, '[[', 2) | |
# Calculate the "widths" of each group as the difference between the bases | |
widths = list() | |
for (i in seq_along(bases[1:length(names(bases))-1])){ | |
val = bases[[i + 1]] - bases[[i]] | |
widths <- append(widths, val) | |
} | |
# The last element returns an error because i + 1 is out of bounds. | |
# Just append 1, since there is no effective upper bound. | |
# Could be fixed with a tryCatch(). | |
widths <- append(widths, 1) | |
# Find the group containing the median, which will be the group at which | |
# the cumulative counts is greater than the sum of all of the counts divided by 2 | |
target = Reduce("+",counts) / 2 | |
cumulative_counts = 1 | |
index = 1 | |
while (cumulative_counts <= target) { | |
cumulative_counts = cumulative_counts + counts[[index]] | |
index = index + 1 | |
} | |
index = index - 1 | |
# Calculate the sum of all the groups prior to the one that contains the median | |
previous_groups_sum = cumulative_counts - counts[[index]] | |
# Last, estimate the median | |
return(bases[[index]] + ((Reduce("+",counts) / 2 - previous_groups_sum) / counts[[index]]) * widths[[index]]) | |
} | |
# Query the data | |
hhincome <- get_decennial(geography = "tract", variables = race_income_vars_00, | |
year = 2000, sumfile = "sf3", state = dec_state, | |
county = dec_county, | |
geometry = F) %>% | |
mutate(stcnty = substr(x = GEOID, start = 1, stop = 5)) %>% | |
# filter(stcnty %in% c('41005', '41009', '41051', '41067', '41071', '53011', '53059')) %>% | |
mutate(variable = str_replace(variable, 'P151', ''), | |
race_code = substr(x = variable, start = 1, stop = 1), | |
variable = substr(x = variable, start = 2, stop = 4)) %>% | |
filter(variable != "001") %>% | |
arrange(GEOID, race_code, variable) | |
hhincome$group <- rep(raceinc, length(unique(hhincome$GEOID))) | |
hhincome2 <- hhincome %>% | |
separate(group, into = c("race", "income"), sep = ", ", remove = F) %>% | |
mutate(income = ordered(income, levels = c('Less than $10,000', '$10,000 to $14,999', '$15,000 to $19,999', '$20,000 to $24,999', | |
'$25,000 to $29,999', '$30,000 to $34,999', '$35,000 to $39,999', '$40,000 to $44,999', | |
'$45,000 to $49,999', '$50,000 to $59,999', '$60,000 to $74,999', '$75,000 to $99,999', | |
'$100,000 to $124,999', '$125,000 to $149,999', '$150,000 to $199,999', '$200,000 or more')), | |
race_rec = race, | |
race_rec = case_when(race == "Asian" | race == "NHPI" ~ "API", | |
race == "Other" | race == "Multi" ~ "Other", | |
TRUE ~ race), | |
poc_rec = case_when(race == "White" ~ "White", | |
race != "White NH" ~ "POC", | |
TRUE ~ "White NH")) %>% # This line recodes `race_rec` to `race` for all other instances | |
filter(GEOID %in% vector_of_tract_fips) | |
### Return income for race codes that match 2010 values | |
hhincome_2010_compatible <- hhincome2 %>% | |
select(race, income, value) %>% | |
group_by(race, income) %>% | |
summarize_all(sum) %>% | |
ungroup() %>% | |
spread(key = income, value = value) | |
median_inc_2010_compatible_list <- list(list('AIAN'), list('Asian'), list('Black'), list('Hispanic'), list('Multi'), list('NHPI'), list('Other'), list('White'), list('White NH')) | |
print("Median household income by race (2010 version)") | |
for (i in seq_along(median_inc_2010_compatible_list[1:9])){ | |
current_race <- as.character(median_inc_2010_compatible_list[[i]][[1]]) | |
current_race_income <- hhincome_2010_compatible %>% filter(race == current_race) %>% select(-race) | |
current_income_value_keys <- get_income_value_keys(current_race_income) | |
current_med_hh_income <- get_pareto_median(current_income_value_keys) | |
median_inc_2010_compatible_list[[i]][2] <- round(current_med_hh_income, 0) | |
print(paste( median_inc_2010_compatible_list[[i]][1], ":", median_inc_2010_compatible_list[[i]][2])) | |
} | |
df_2010_compatible <- data.frame(matrix(unlist(median_inc_2010_compatible_list), nrow = 9, byrow = T)) | |
names(df_2010_compatible)[1] <- "race" | |
names(df_2010_compatible)[2] <- "median_hh_income" | |
#return(df_2010_compatible) | |
### Return income for race codes that match 1990 values | |
hhincome_1990_compatible <- hhincome2 %>% | |
select(race_rec, income, value) %>% | |
group_by(race_rec, income) %>% | |
summarize_all(sum) %>% | |
ungroup() %>% | |
spread(key = income, value = value) | |
median_inc_1990_compatible_list <- list(list('AIAN'), list('API'), list('Black'), list('Hispanic'), list('Other'), list('White'), list('White NH')) | |
print("Median household income by race (1990 version)") | |
for (i in seq_along(median_inc_1990_compatible_list[1:7])){ | |
current_race <- as.character(median_inc_1990_compatible_list[[i]][[1]]) | |
current_race_income <- hhincome_1990_compatible %>% filter(race_rec == current_race) %>% select(-race_rec) | |
current_income_value_keys <- get_income_value_keys(current_race_income) | |
current_med_hh_income <- get_pareto_median(current_income_value_keys) | |
median_inc_1990_compatible_list[[i]][2] <- round(current_med_hh_income, 0) | |
print(paste(median_inc_1990_compatible_list[[i]][1], ":", median_inc_1990_compatible_list[[i]][2])) | |
} | |
df_1990_compatible <- data.frame(matrix(unlist(median_inc_1990_compatible_list), nrow = 7, byrow = T)) | |
names(df_1990_compatible)[1] <- "race" | |
names(df_1990_compatible)[2] <- "median_hh_income" | |
#return(df_1990_compatible) | |
### Return income for white and PoC race categories (2000) | |
hhincome_2000_poc <- hhincome2 %>% | |
select(poc_rec, income, value) %>% | |
group_by(poc_rec, income) %>% | |
summarize_all(sum) %>% | |
ungroup() %>% | |
spread(key = income, value = value) | |
median_inc_2000_poc_list <- list(list('White'), list('POC'), list('White NH')) | |
print("Median household income by race (aggregated to PoC/White)") | |
for (i in seq_along(median_inc_2000_poc_list[1:3])){ | |
current_race <- as.character(median_inc_2000_poc_list[[i]][[1]]) | |
current_race_income <- hhincome_2000_poc %>% filter(poc_rec == current_race) %>% select(-poc_rec) | |
current_income_value_keys <- get_income_value_keys(current_race_income) | |
current_med_hh_income <- get_pareto_median(current_income_value_keys) | |
median_inc_2000_poc_list[[i]][2] <- round(current_med_hh_income, 0) | |
print(paste( median_inc_2000_poc_list[[i]][1], ":", median_inc_2000_poc_list[[i]][2])) | |
} | |
df_2000_poc <- data.frame(matrix(unlist(median_inc_2000_poc_list), nrow = 3, byrow = T)) | |
names(df_2000_poc)[1] <- "race" | |
names(df_2000_poc)[2] <- "median_hh_income" | |
#return(df_2000_poc) | |
if (compatibility_mode == "1990"){ | |
return(df_1990_compatible) | |
} else if (compatibility_mode == "2010"){ | |
return(df_2010_compatible) | |
} else if (compatibility_mode == "POC"){ | |
return(df_2000_poc) | |
} else {return("ERROR")} | |
} | |
icura_list <- c('41051002401' , '41051002402' , '41051002501' , '41051002502' , '41051003100' , '41051003200' , '41051003301' , | |
'41051003302' , '41051003401' , '41051003402' , '41051003501' , '41051003502' , '41051003601' , '41051003602' , | |
'41051003701' , '41051003702' , '41051003801' , '41051003802' , '41051003803' , '41051003901' , '41051003902' , | |
'41051002303' , '41051002203', '41051002201', '41051002202', '41051002301', '41051002302') | |
# This throws an error that "dec_county" not found | |
med_hh_race_00_icura <- get_median_hh_income_by_race_2000(icura_list, compatibility_mode = "1990", dec_state = c("OR", "WA"), dec_county = c("051", "005", "009", "067", "071", "011", "059")) | |
# This works as expected | |
med_hh_race_00_icura <- get_median_hh_income_by_race_2000(icura_list, compatibility_mode = "1990", dec_state = "OR", dec_county = "051") |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment