lukeholman/gist:01b47df8fd72bb68454b713539e04db7

## gistfile1.txt
library(ggplot2)

# Sensitivity analysis: compare the error in the estimated % men when using
# country-specific versus worldwide name-gender associations.
# 'Kim' is shorthand for all names that differ in gender association between
# the local and immigrant population.
# For simplicity, Kim is assumed to be an equally common name in both countries.
parameters <- expand.grid(
  # True proportion of locally-born people named Kim
  p_kim_local = c(0.001, 0.01, 0.05, 0.1),
  # True proportion of males among foreign-born people named Kim
  p_male_kim_immigrants = 0.90,
  # True proportion of males among locally-born people named Kim
  p_male_kim_locals = 0.10,
  # Worldwide estimate for the gender of people named Kim from Genderize.io
  p_male_kim_GENDERIZE = seq(0.1, 0.9, length.out = 11),
  # True proportion of researchers who are immigrants
  p_immigrants = seq(0, 0.5, length.out = 11),
  # True proportion of males among immigrant researchers not named Kim
  p_immigrants_male = 0.8,
  # True proportion of males among non-immigrant researchers not named Kim
  p_residents_male = 0.6
)
# Kim is equally common among foreign-born and locally-born people
parameters$p_kim_foreign <- parameters$p_kim_local

# Overall proportion of males for every row of `grid`, summed over four groups:
# non-immigrants named Kim, immigrants named Kim, non-immigrants not named Kim
# and immigrants not named Kim.
#   grid                - data frame holding the columns p_immigrants,
#                         p_kim_local, p_kim_foreign, p_residents_male and
#                         p_immigrants_male
#   kim_male_residents  - proportion of males applied to non-immigrants named Kim
#   kim_male_immigrants - proportion of males applied to immigrants named Kim
# Returns a numeric vector with one value per row of `grid`.
overall_p_male <- function(grid, kim_male_residents, kim_male_immigrants) {
  p_residents <- 1 - grid$p_immigrants
  p_residents * grid$p_kim_local * kim_male_residents +
    grid$p_immigrants * grid$p_kim_foreign * kim_male_immigrants +
    p_residents * (1 - grid$p_kim_local) * grid$p_residents_male +
    # Immigrants use the foreign name frequency (the original used
    # p_kim_local here, which only worked because the two are equal)
    grid$p_immigrants * (1 - grid$p_kim_foreign) * grid$p_immigrants_male
}

# Real proportion of males, across immigrants and non-immigrants in the focal
# country: each group named Kim gets its own true proportion of males
parameters$real_pMale <- overall_p_male(
  parameters,
  kim_male_residents = parameters$p_male_kim_locals,
  kim_male_immigrants = parameters$p_male_kim_immigrants
)

# Estimated pMale if we ignore country and use the worldwide estimate from
# Genderize.io for everybody named Kim
parameters$estimated_pMale_worldwide <- overall_p_male(
  parameters,
  kim_male_residents = parameters$p_male_kim_GENDERIZE,
  kim_male_immigrants = parameters$p_male_kim_GENDERIZE
)

# Estimated pMale if we DO NOT ignore country and use the country-specific
# estimate from Genderize.io for everybody named Kim.
# Assumption: this mis-classifies more immigrants, but improves the
# classification of non-immigrants
parameters$estimated_pMale_local <- overall_p_male(
  parameters,
  kim_male_residents = parameters$p_male_kim_locals,
  kim_male_immigrants = parameters$p_male_kim_locals
)

# Difference in the absolute error (in percentage points) when estimating the
# % men: worldwide error minus country-specific error.
# Positive numbers mean it is better to use the local name-gender associations,
# as done by Holman et al
parameters$difference_in_error <- with(
  parameters,
  100 * abs(estimated_pMale_worldwide - real_pMale) -
    100 * abs(estimated_pMale_local - real_pMale)
)

# Heat map of difference_in_error: immigrant share on the x axis, worldwide
# male frequency for Kim on the y axis, one facet per frequency of the name Kim.
# Blue areas are places where it is best to use the country-specific associations
# Red areas show where it is best to ignore them, and use the world-wide estimate
# NOTE(review): the legend title reads "Error in estimate" but the fill is the
# difference in absolute error (worldwide minus local) - consider relabelling.
ggplot(
  data = parameters,
  mapping = aes(
    x = p_immigrants,
    y = p_male_kim_GENDERIZE,
    fill = difference_in_error
  )
) +
  geom_tile() +
  facet_wrap(~p_kim_local) +
  scale_fill_gradient2(name = "Error in estimate\nof % males") +
  labs(
    x = "Proportion of researchers who are immigrants",
    y = "Worldwide frequency of men\namong people named Kim",
    title = "Blue means it's best to not ignore the country information",
    subtitle = "Facets show frequency of people named Kim"
  )
	library(ggplot2)

	# NOTE(review): from here on the file repeats the script above verbatim (only
	# the indentation differs) - looks like a paste duplication; confirm and drop.
	# 'Kim' is shorthand for all names that differ in gender association between
	# the local and immigrant population.
	# For simplicity, Kim is assumed to be an equally common name in both countries.
	parameters <- expand.grid(
	  # True proportion of locally-born people named Kim
	  p_kim_local = c(0.001, 0.01, 0.05, 0.1),
	  # True proportion of males among foreign-born people named Kim
	  p_male_kim_immigrants = 0.90,
	  # True proportion of males among locally-born people named Kim
	  p_male_kim_locals = 0.10,
	  # Worldwide estimate for the gender of people named Kim from Genderize.io
	  p_male_kim_GENDERIZE = seq(0.1, 0.9, length.out = 11),
	  # True proportion of researchers who are immigrants
	  p_immigrants = seq(0, 0.5, length.out = 11),
	  # True proportion of males among immigrant researchers not named Kim
	  p_immigrants_male = 0.8,
	  # True proportion of males among non-immigrant researchers not named Kim
	  p_residents_male = 0.6
	)
	# Kim is equally common among foreign-born and locally-born people
	parameters$p_kim_foreign <- parameters$p_kim_local

	# Real proportion of males, across immigrants and non-immigrants in the focal
	# country. Immigrant terms use p_kim_foreign throughout (the original used
	# p_kim_local for immigrants not named Kim, which only worked because the two
	# columns are equal).
	parameters$real_pMale <- with(
	  parameters,
	  # pMale among non-immigrants named Kim
	  (1 - p_immigrants) * p_kim_local * p_male_kim_locals +
	    # pMale among immigrants named Kim
	    p_immigrants * p_kim_foreign * p_male_kim_immigrants +
	    # pMale among non-immigrants NOT named Kim
	    (1 - p_immigrants) * (1 - p_kim_local) * p_residents_male +
	    # pMale among immigrants NOT named Kim
	    p_immigrants * (1 - p_kim_foreign) * p_immigrants_male
	)

	# Estimated pMale if we ignore country and use the worldwide estimate from
	# Genderize.io for everybody named Kim
	parameters$estimated_pMale_worldwide <- with(
	  parameters,
	  # pMale among non-immigrants named Kim
	  (1 - p_immigrants) * p_kim_local * p_male_kim_GENDERIZE +
	    # pMale among immigrants named Kim
	    p_immigrants * p_kim_foreign * p_male_kim_GENDERIZE +
	    # pMale among non-immigrants NOT named Kim
	    (1 - p_immigrants) * (1 - p_kim_local) * p_residents_male +
	    # pMale among immigrants NOT named Kim
	    p_immigrants * (1 - p_kim_foreign) * p_immigrants_male
	)

	# Estimated pMale if we DO NOT ignore country and use the country-specific
	# estimate from Genderize.io for everybody named Kim.
	# Assumption: this mis-classifies more immigrants, but improves the
	# classification of non-immigrants
	parameters$estimated_pMale_local <- with(
	  parameters,
	  # pMale among non-immigrants named Kim
	  (1 - p_immigrants) * p_kim_local * p_male_kim_locals +
	    # pMale among immigrants named Kim
	    p_immigrants * p_kim_foreign * p_male_kim_locals +
	    # pMale among non-immigrants NOT named Kim
	    (1 - p_immigrants) * (1 - p_kim_local) * p_residents_male +
	    # pMale among immigrants NOT named Kim
	    p_immigrants * (1 - p_kim_foreign) * p_immigrants_male
	)

	# Difference in the absolute error (in percentage points) when estimating the
	# % men: worldwide error minus country-specific error.
	# Positive numbers mean it is better to use the local name-gender
	# associations, as done by Holman et al
	parameters$difference_in_error <- with(
	  parameters,
	  100 * abs(estimated_pMale_worldwide - real_pMale) -
	    100 * abs(estimated_pMale_local - real_pMale)
	)

	# Heat map of difference_in_error: immigrant share on the x axis, worldwide
	# male frequency for Kim on the y axis, one facet per frequency of the name Kim.
	# Blue areas are places where it is best to use the country-specific associations
	# Red areas show where it is best to ignore them, and use the world-wide estimate
	# NOTE(review): the legend title reads "Error in estimate" but the fill is the
	# difference in absolute error (worldwide minus local) - consider relabelling.
	ggplot(
	  data = parameters,
	  mapping = aes(
	    x = p_immigrants,
	    y = p_male_kim_GENDERIZE,
	    fill = difference_in_error
	  )
	) +
	  geom_tile() +
	  facet_wrap(~p_kim_local) +
	  scale_fill_gradient2(name = "Error in estimate\nof % males") +
	  labs(
	    x = "Proportion of researchers who are immigrants",
	    y = "Worldwide frequency of men\namong people named Kim",
	    title = "Blue means it's best to not ignore the country information",
	    subtitle = "Facets show frequency of people named Kim"
	  )