Last active
August 29, 2015 14:27
-
-
Save andrewbtran/d3d8e04f5c86dcfa2bb0 to your computer and use it in GitHub Desktop.
Inferring gender from a column of first names in R
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Let's do some advanced stuff: infer a gender for every row of `payroll`
# from the first name embedded in its NAME column.
#
# Inputs : `payroll`, a data frame with a character column NAME.
#          NOTE(review): the patterns below assume NAME looks like
#          "LAST,FIRST MIDDLE" (optionally with a space after the comma) --
#          confirm against the data.
# Outputs: `payroll` gains the columns `first_name` and `gender`
#          (unmatched names get "unknown"); `payroll_gender` is the
#          name -> gender lookup table with one row per first name.

# Packages ----
# Only install gender when it is missing, so re-running the script does not
# reinstall it every time.
if (!requireNamespace("gender", quietly = TRUE)) {
  install.packages("gender")
}
library(stringr) # str_to_title() to normalize the case of the names
library(gender)  # gender inference from historical Census / SSA data
library(dplyr)   # left_join()

# First name ----
# Drop everything up to and including the last comma, then strip surrounding
# whitespace. Without the trim, a NAME such as "SMITH, JOHN" would keep a
# leading space and the next step would wipe out the whole first name.
payroll$first_name <- trimws(sub(".*,", "", payroll$NAME))
# Keep only the first word (drops middle names and initials)
payroll$first_name <- sub(" .*", "", payroll$first_name)
# Normalize the case, e.g. "JOHN" -> "John"
payroll$first_name <- str_to_title(payroll$first_name)

# Gender lookup ----
# Look up each distinct, non-missing name once instead of once per payroll
# row; the result is joined back onto every row further down.
first_names <- unique(payroll$first_name)
first_names <- first_names[!is.na(first_names) & nzchar(first_names)]

if (length(first_names) > 0) {
  payroll_gender <- gender(first_names)
  # Depending on the installed version of the gender package the result is
  # either a data frame already or a list with one record per name; only
  # the list form has to be stacked into a data frame.
  if (!is.data.frame(payroll_gender)) {
    payroll_gender <- do.call(
      rbind,
      lapply(payroll_gender, data.frame, stringsAsFactors = FALSE)
    )
  }
  payroll_gender <- as.data.frame(payroll_gender, stringsAsFactors = FALSE)
} else {
  # Nothing to look up: keep an empty table with the expected columns
  payroll_gender <- data.frame(
    name = character(0),
    gender = character(0),
    stringsAsFactors = FALSE
  )
}

# Isolating the new gender data frame to just name and gender
payroll_gender <- payroll_gender[c("name", "gender")]
# Renaming to match the original data frame to make joining easier
colnames(payroll_gender) <- c("first_name", "gender")
# Deleting duplicates so the join cannot multiply payroll rows
payroll_gender <- payroll_gender[!duplicated(payroll_gender$first_name), ]

# Join ----
# Drop a gender column left over from a previous run; otherwise the join
# would produce gender.x / gender.y instead of a single gender column.
payroll$gender <- NULL
# Bringing together the payroll data set with the inferred gender data set.
# The key is spelled out so no other shared column can sneak into the join.
payroll <- left_join(payroll, payroll_gender, by = "first_name")
# If no names were matched with a gender, set it to "unknown"
payroll$gender[is.na(payroll$gender)] <- "unknown"

# Let's take a look at the breakdown
table(payroll$gender)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment