Created
August 4, 2018 01:07
-
-
Save lukeholman/01b47df8fd72bb68454b713539e04db7 to your computer and use it in GitHub Desktop.
Quick model about name-gender associations
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(ggplot2) | |
# 'Kim' is shorthand for every name whose gender association differs between
# the locally-born and the immigrant population.
# For simplicity, Kim is assumed to be an equally common name in both countries.
#
# Build the parameter space: expand.grid() returns one row for every
# combination of the values supplied below.
parameters <- expand.grid(
  # True proportion of locally-born people named Kim
  p_kim_local = c(0.001, 0.01, 0.05, 0.1),
  # True proportion of males among foreign-born people named Kim
  p_male_kim_immigrants = 0.90,
  # True proportion of males among locally-born people named Kim
  p_male_kim_locals = 0.10,
  # Worldwide estimate for the gender of people named Kim from Genderize.io
  p_male_kim_GENDERIZE = seq(0.1, 0.9, length.out = 11),
  # True proportion of researchers who are immigrants
  p_immigrants = seq(0, 0.5, length.out = 11),
  # True proportion of males among immigrant researchers not named Kim
  p_immigrants_male = 0.8,
  # True proportion of males among non-immigrant researchers not named Kim
  p_residents_male = 0.6
)

# Encode the "equally common in both countries" assumption explicitly, so the
# formulas further down can refer to the foreign frequency by its own name.
parameters$p_kim_foreign <- parameters$p_kim_local
# Calculate the real proportion of males, across immigrants and non-immigrants
# in the focal country.
# Each term is (share of researchers in a group) * (proportion male in that
# group); the four groups are locals/immigrants crossed with Kim/not-Kim.
# The immigrant terms use p_kim_foreign throughout, so the formula stays
# correct if the local and foreign frequencies of 'Kim' are ever made to differ.
parameters$real_pMale <- with(
  parameters,
  # Non-immigrants named Kim
  (1 - p_immigrants) * p_kim_local * p_male_kim_locals +
    # Immigrants named Kim
    p_immigrants * p_kim_foreign * p_male_kim_immigrants +
    # Non-immigrants NOT named Kim
    (1 - p_immigrants) * (1 - p_kim_local) * p_residents_male +
    # Immigrants NOT named Kim
    p_immigrants * (1 - p_kim_foreign) * p_immigrants_male
)
# Estimate pMale if we ignore country and use the world-wide estimate from
# Genderize.io: everyone named Kim (local or immigrant) is assigned the
# worldwide proportion p_male_kim_GENDERIZE. The not-Kim terms use the true
# proportions, so the only error in this estimate comes from people named Kim.
# The immigrant terms use p_kim_foreign throughout, so the formula stays
# correct if the local and foreign frequencies of 'Kim' are ever made to differ.
parameters$estimated_pMale_worldwide <- with(
  parameters,
  # Non-immigrants named Kim
  (1 - p_immigrants) * p_kim_local * p_male_kim_GENDERIZE +
    # Immigrants named Kim
    p_immigrants * p_kim_foreign * p_male_kim_GENDERIZE +
    # Non-immigrants NOT named Kim
    (1 - p_immigrants) * (1 - p_kim_local) * p_residents_male +
    # Immigrants NOT named Kim
    p_immigrants * (1 - p_kim_foreign) * p_immigrants_male
)
# Estimate pMale if we DO NOT ignore country, and use the country-specific
# estimate from Genderize.io: everyone named Kim (local or immigrant) is
# assigned the local proportion p_male_kim_locals.
# I assume this ends up mis-classifying more immigrants, but improves
# classification of non-immigrants.
# The immigrant terms use p_kim_foreign throughout, so the formula stays
# correct if the local and foreign frequencies of 'Kim' are ever made to differ.
parameters$estimated_pMale_local <- with(
  parameters,
  # Non-immigrants named Kim
  (1 - p_immigrants) * p_kim_local * p_male_kim_locals +
    # Immigrants named Kim (classified with the local association)
    p_immigrants * p_kim_foreign * p_male_kim_locals +
    # Non-immigrants NOT named Kim
    (1 - p_immigrants) * (1 - p_kim_local) * p_residents_male +
    # Immigrants NOT named Kim
    p_immigrants * (1 - p_kim_foreign) * p_immigrants_male
)
# Compare the two estimators by their absolute error, expressed in percentage
# points (hence the factor of 100), and store worldwide error minus local
# error. A positive value means the worldwide estimate is further from the
# truth, i.e. it is better to use the local name-gender associations, as done
# by Holman et al.
truth <- parameters[["real_pMale"]]
error_worldwide <- 100 * abs(parameters[["estimated_pMale_worldwide"]] - truth)
error_local <- 100 * abs(parameters[["estimated_pMale_local"]] - truth)
parameters$difference_in_error <- error_worldwide - error_local
# Heat map of difference_in_error (worldwide error minus local error, in
# percentage points) over the immigrant share (x) and the worldwide Genderize
# estimate (y), with one facet per frequency of the name 'Kim'.
# With the default scale_fill_gradient2() palette, which diverges around 0:
#   Blue areas (positive) are where it is best to use the country-specific
#   associations.
#   Red areas (negative) are where it is best to ignore them and use the
#   world-wide estimate.
ggplot(
  parameters,
  aes(
    x = p_immigrants,
    y = p_male_kim_GENDERIZE,
    fill = difference_in_error
  )
) +
  geom_tile() +
  # The fill is a difference between two errors, not an error itself, so the
  # legend says so and gives the direction of the subtraction.
  scale_fill_gradient2(
    name = "Difference in error\n(worldwide - local,\n% points)"
  ) +
  facet_wrap(~p_kim_local) +
  xlab("Proportion of researchers who are immigrants") +
  ylab("Worldwide frequency of men\namong people named Kim") +
  labs(
    title = "Blue means it's best to not ignore the country information",
    subtitle = "Facets show frequency of people named Kim"
  )
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment