Last active
July 10, 2017 15:41
-
-
Save pschmied/6978e9bbff3698d58bf1e95b1355c689 to your computer and use it in GitHub Desktop.
Playing with predicting gender / race from names
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(tidyverse) | |
library(readr) | |
library(stringr) | |
library(scales) | |
library(wru) # Predict race | |
library(gender) # Predict gender | |
## Employees / salary info for Chicago, downloaded as a CSV export from the
## city's open-data portal
df_sal <- read_csv(
  file = "https://data.cityofchicago.org/api/views/xzkq-xp2w/rows.csv?accessType=DOWNLOAD"
)
## Clean up / recode / format for compatibility with wru / gender packages.
## `Name` is split around its comma: everything before the (last) comma is
## taken as the surname, the first run of letters after a comma as the first
## name; both are upper-cased.
df_munged <- df_sal %>%
  mutate(
    firstname = toupper(str_extract(Name, "(?<=,\\s{1,10})\\p{L}+")),
    surname = toupper(str_extract(Name, "^.+(?=,)")),
    county = "031", # Mimic voter file
    # Strip "$" and thousands separators before converting, so a value such
    # as "$1,234.50" becomes 1234.5 instead of NA
    `Annual Salary` = as.numeric(gsub("[$,]", "", `Annual Salary`)),
    # Row identifier used later to group predictions per person;
    # row_number() stays valid on a zero-row table, where 1:n() is c(1, 0)
    pid = row_number()
  )
## Predict gender once per distinct first name; `years` is the range
## 1947-1999, i.e. 70 to 18 years before 2017
gender_pred <- df_munged$firstname %>%
  unique() %>%
  gender(years = 2017 - c(70, 18)) %>%
  transmute(
    firstname = name,
    gender,
    sex = gender == "female" # Because voterfile is so 1950s
  )
## Attach the predicted gender / sex flag to every row by first name
df_munged <- df_munged %>%
  left_join(gender_pred, by = "firstname")
## Get census data and predict race.
## The five pred.* probability columns are stacked into long form, only the
## highest-probability category is kept per person (pid), and the category
## codes are replaced with readable labels.
df_pred_race <- df_munged %>%
  predict_race(surname.only = TRUE, sex = TRUE) %>%
  gather(key = race, value = prob_race, pred.whi:pred.oth) %>%
  group_by(pid) %>%
  slice(which.max(prob_race)) %>%
  ungroup() %>%
  mutate(
    race = recode(
      race,
      pred.whi = "white",
      pred.bla = "black",
      pred.his = "hispanic", # an ethnicity, not a race
      pred.asi = "asian",
      pred.oth = "other"
    )
  )
## race vs annual salary: boxplot per predicted race for salaried rows with
## `Full or Part-Time` == "F", leaving out the "other" category
df_pred_race %>%
  filter(
    `Salary or Hourly` == "Salary",
    `Full or Part-Time` == "F",
    race != "other"
  ) %>%
  ggplot(mapping = aes(x = race, y = `Annual Salary`)) +
  geom_boxplot() +
  scale_y_continuous(labels = dollar)
## Same subset as a set of overlaid salary densities, one per predicted race
df_pred_race %>%
  filter(
    `Salary or Hourly` == "Salary",
    `Full or Part-Time` == "F",
    race != "other"
  ) %>%
  ggplot(mapping = aes(x = `Annual Salary`, fill = race)) +
  geom_density(alpha = 0.4) +
  scale_x_continuous(labels = dollar)
## ditto gender: overlaid salary densities by predicted gender, dropping rows
## for which no gender was predicted
df_pred_race %>%
  filter(
    `Salary or Hourly` == "Salary",
    `Full or Part-Time` == "F",
    !is.na(gender)
  ) %>%
  ggplot(mapping = aes(x = `Annual Salary`, fill = gender)) +
  geom_density(alpha = 0.4) +
  scale_x_continuous(labels = dollar)
## Linear model of annual salary on predicted gender and predicted race,
## fitted on rows with `Full or Part-Time` == "F".
## NOTE(review): unlike the plots, this subset is not restricted to
## `Salary or Hourly` == "Salary" and keeps the "other" race category --
## confirm that is intended.
mod_lin <- lm(
  formula = `Annual Salary` ~ gender + race,
  data = filter(df_pred_race, `Full or Part-Time` == "F")
)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment