Skip to content

Instantly share code, notes, and snippets.

@pschmied
Last active July 10, 2017 15:41
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save pschmied/6978e9bbff3698d58bf1e95b1355c689 to your computer and use it in GitHub Desktop.
Save pschmied/6978e9bbff3698d58bf1e95b1355c689 to your computer and use it in GitHub Desktop.
Playing with predicting gender / race from names
library(tidyverse)
library(readr)
library(stringr)
library(scales)
library(wru) # Predict race
library(gender) # Predict gender
## Employees / salary info for Chicago
df_sal <- read_csv(
  "https://data.cityofchicago.org/api/views/xzkq-xp2w/rows.csv?accessType=DOWNLOAD"
)

## Clean up / recode / format for compatibility with wru / gender packages.
## Assumes `Name` is formatted like "SURNAME, FIRSTNAME ..." -- TODO confirm
## against the downloaded data.
df_munged <- df_sal %>%
  mutate(
    # First run of letters following the first comma (plus 1-10 whitespace
    # characters) that is directly followed by a letter.
    firstname = toupper(str_extract(Name, "(?<=,\\s{1,10})\\p{L}+")),
    # Everything before the FIRST comma. A greedy `.+` would run up to the
    # last comma and swallow the first name when a value holds several commas.
    surname = toupper(str_extract(Name, "^[^,]+(?=,)")),
    county = "031", # Mimic voter file
    # Drop "$" and thousands separators before converting; stripping only "$"
    # would turn a value such as "$1,234.00" into NA.
    `Annual Salary` = as.numeric(gsub("[$,]", "", `Annual Salary`)),
    # Per-row identifier; unlike 1:n(), row_number() is safe on zero rows.
    pid = row_number()
  )
## Infer a gender for each distinct first name, using birth years
## 2017 - 70 through 2017 - 18, then attach the result to every employee row.
gender_pred <- df_munged$firstname %>%
  unique() %>%
  gender(years = c(2017 - 70, 2017 - 18)) %>%
  transmute(
    firstname = name,
    gender,
    sex = gender == "female" # Because voterfile is so 1950s
  )

# Rows whose first name has no entry in gender_pred keep NA for gender / sex.
df_munged <- df_munged %>%
  left_join(gender_pred, by = "firstname")
## Get census data and predict race.
## The pred.whi ... pred.oth columns are stacked into long form, and for each
## person (pid) only the row with the largest probability is kept.
df_pred_race <- df_munged %>%
  predict_race(surname.only = TRUE, sex = TRUE) %>%
  gather(key = race, value = prob_race, pred.whi:pred.oth) %>%
  group_by(pid) %>%
  slice(which.max(prob_race)) %>%
  ungroup() %>%
  mutate(
    # Replace the pred.* column names with readable labels.
    race = recode(
      race,
      pred.whi = "white",
      pred.bla = "black",
      pred.his = "hispanic", # an ethnicity, not a race
      pred.asi = "asian",
      pred.oth = "other"
    )
  )
## race vs annual salary
## Boxplot for salaried, full-time rows only; the "other" category is left out.
ggplot(
  data = filter(
    df_pred_race,
    `Salary or Hourly` == "Salary",
    `Full or Part-Time` == "F",
    race != "other"
  ),
  mapping = aes(x = race, y = `Annual Salary`)
) +
  geom_boxplot() +
  scale_y_continuous(labels = dollar)
# Overlaid salary densities, one per predicted race, for the same subset:
# salaried, full-time rows, excluding "other".
ggplot(
  data = filter(
    df_pred_race,
    `Salary or Hourly` == "Salary",
    `Full or Part-Time` == "F",
    race != "other"
  ),
  mapping = aes(x = `Annual Salary`, fill = race)
) +
  geom_density(alpha = 0.4) +
  scale_x_continuous(labels = dollar)
## ditto gender
## Salary densities by predicted gender for salaried, full-time rows; rows
## with no gender prediction are dropped.
ggplot(
  data = filter(
    df_pred_race,
    `Salary or Hourly` == "Salary",
    `Full or Part-Time` == "F",
    !is.na(gender)
  ),
  mapping = aes(x = `Annual Salary`, fill = gender)
) +
  geom_density(alpha = .4) +
  scale_x_continuous(labels = dollar)
# Linear model of annual salary on predicted gender and race, fitted on
# full-time rows. Unlike the plots, no filter on `Salary or Hourly` or on the
# "other" race category is applied here.
mod_lin <- df_pred_race %>%
  filter(`Full or Part-Time` == "F") %>%
  lm(`Annual Salary` ~ gender + race, data = .)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment