Skip to content

Instantly share code, notes, and snippets.

@erikgregorywebb
Last active January 6, 2019 01:02
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save erikgregorywebb/54edba09f1456217295358cb962b7d2c to your computer and use it in GitHub Desktop.
Save erikgregorywebb/54edba09f1456217295358cb962b7d2c to your computer and use it in GitHub Desktop.
# https://www.ssa.gov/oact/babynames/limits.html
# Clear objects from the global environment so the run starts from a clean
# slate. This removes variables only; attached packages are left as they are.
rm(list = ls())
# Create the project directory on first use: setwd() raises an error when the
# target path does not exist. dir.create() performs tilde expansion itself and
# is a silent no-op (showWarnings = FALSE) when the directory is already there.
dir.create("~/Documents/Python/baby-names", recursive = TRUE, showWarnings = FALSE)
setwd("~/Documents/Python/baby-names")
# call libraries
library(dplyr)
library(ggplot2)
library(stringr)
library(scales)
library(gridExtra)
### IMPORT, CLEAN ###
# All paths are built from the project root instead of relying on a chain of
# setwd() calls, so every step reads/writes an explicit location.
project_dir <- path.expand("~/Documents/Python/baby-names")
years_dir <- file.path(project_dir, "years")

# download, unzip file
zip_url <- 'https://www.ssa.gov/oact/babynames/names.zip'
zip_path <- file.path(project_dir, 'names.zip')
download.file(zip_url, destfile = zip_path, quiet = TRUE)
unzip(zip_path, exdir = years_dir)

# determine list of file names
# Only files named yob<4-digit year>.txt are kept: the year is taken from
# characters 4-7 of the file name below, so any other .txt file would yield a
# meaningless year. The pattern is anchored at both ends for the same reason.
year_files <- list.files(years_dir, pattern = "^yob[0-9]{4}\\.txt$",
                         full.names = TRUE)
if (length(year_files) == 0) {
  stop("No yobYYYY.txt files found in ", years_dir, call. = FALSE)
}

# compile data
# Read one yearly file and tag every row with the year parsed from its name.
# Column classes are fixed so type guessing cannot change a column's type
# between files (e.g. a gender column holding only "F" being read as logical).
read_year_file <- function(path) {
  yearly <- read.csv(path, header = FALSE,
                     col.names = c('name', 'gender', 'count'),
                     colClasses = c('character', 'character', 'integer'))
  yearly$year <- as.numeric(substr(basename(path), 4, 7))
  yearly
}
# lapply() handles any number of files safely (no 1:length() pitfall) and the
# per-year frames are stacked once at the end.
# NOTE: the combined table is deliberately called `names`; the plotting
# functions later in this script read it under that name.
names <- do.call(rbind, lapply(year_files, read_year_file))

# save a copy
# The working directory is set to the project root here, as before, so that
# the dated CSV lands there and later code starts from the same directory.
setwd(project_dir)
filename <- paste0('baby-names-', Sys.Date(), '.csv')
write.csv(names, file = filename, row.names = FALSE, na = "")
### VISUALIZE ###
# declare functions
# Translate a gender code into a display label.
# 'M' becomes 'Male'; every other non-missing value becomes 'Female'.
# Works element-wise, so a vector of codes returns a vector of labels.
gender <- function(gender_input) {
  is_male <- gender_input == 'M'
  ifelse(is_male, 'Male', 'Female')
}
# Plot the yearly count of a single name/gender combination as a line chart.
#
# Args:
#   name_input:   name to plot; matched with == against the `name` column.
#   gender_input: gender code matched with == against the `gender` column;
#                 shown in the title as 'Male' for 'M', otherwise 'Female'.
#   birth_year:   x position of the vertical reference line.
#
# Returns: a ggplot object (nothing is drawn until it is printed).
# Reads the global `names` data frame (columns name, gender, count, year).
popularity <- function(name_input, gender_input, birth_year) {
  name_data <- names %>% filter(name == name_input & gender == gender_input)
  if (nrow(name_data) == 0) {
    # An unmatched name would otherwise silently give an empty chart.
    warning("No records found for ", name_input, " (", gender_input, ")",
            call. = FALSE)
  }
  # Aesthetics refer to bare column names; ggplot2 evaluates them inside the
  # data frame passed to ggplot(), which avoids `df$col` lookups in aes().
  ggplot(name_data, aes(x = year, y = count)) +
    geom_line(aes(color = name), size = 1) +
    geom_vline(xintercept = birth_year) +
    scale_color_manual('', values = c("#00AFBB")) +
    scale_y_continuous(labels = comma) +
    #guides(fill = FALSE, color = FALSE, linetype = FALSE, shape = FALSE) +
    labs(title = paste(name_input, gender(gender_input), sep = ', '),
         subtitle = 'Name Popularity Over Time (Count), 1880 - 2017') +
    ylab('') + xlab('') +
    theme_minimal() + theme(plot.title = element_text(face = 'bold'))
}
# Plot the yearly counts of two name/gender combinations on one line chart.
#
# Args:
#   name_input_1, gender_input_1: first name and its gender code.
#   name_input_2, gender_input_2: second name and its gender code.
#     Gender codes are shown in the title as 'Male' for 'M', otherwise 'Female'.
#
# Returns: a ggplot object (nothing is drawn until it is printed).
# Reads the global `names` data frame (columns name, gender, count, year).
compare_popularity <- function(name_input_1, gender_input_1, name_input_2, gender_input_2) {
  pair <- names %>%
    filter((name == name_input_1 & gender == gender_input_1) |
             (name == name_input_2 & gender == gender_input_2))
  if (nrow(pair) == 0) {
    # Unmatched inputs would otherwise silently give an empty chart.
    warning("No records found for either name/gender combination",
            call. = FALSE)
  }
  # Each line is keyed by `series`. Normally that is just the name, so the
  # legend is unchanged. When the same name is compared across genders, the
  # name alone would merge both series into a single zig-zag line, so the
  # gender label is appended to keep the two lines apart.
  pair$series <- as.character(pair$name)
  if (identical(name_input_1, name_input_2)) {
    pair$series <- paste(pair$series, gender(pair$gender), sep = ', ')
  }
  # Aesthetics refer to bare column names; ggplot2 evaluates them inside the
  # data frame passed to ggplot(), which avoids `df$col` lookups in aes().
  ggplot(pair, aes(x = year, y = count)) +
    geom_line(aes(color = series), size = 1) +
    scale_color_manual('', values = c('#00AFBB', 'gold2')) +
    scale_y_continuous(labels = comma) +
    labs(title = paste(paste(name_input_2, gender(gender_input_2), sep = ', '), 'vs.',
                       paste(name_input_1, gender(gender_input_1), sep = ', ')),
         subtitle = 'Name Comparison, Popularity Over Time (Count), 1880 - 2017') +
    ylab('') + xlab('') +
    theme_minimal() + theme(plot.title = element_text(face = 'bold'))
}
# blog plot examples
# Each call only builds a ggplot object; nothing is rendered at this point.
p1 <- compare_popularity('Erik', 'M', 'Eric', 'M')
p2 <- popularity('Erik', 'M', 1994)
p3 <- compare_popularity('Ashley', 'F', 'Erik', 'M')
p4 <- popularity('Olivia', 'F', 2000)
p5 <- popularity('Natalie', 'F', 2002)
p6 <- popularity('Grace', 'F', 2004)
p7 <- popularity('Sophia', 'F', 2010)
p8 <- popularity('Natalie', 'F', 1995)
p9 <- popularity('Tabitha', 'F', 2002)
p10 <- compare_popularity('Liam', 'M', 'Emma', 'F')
# export images
# Make sure the output directory exists before switching into it; setwd()
# errors on a missing path. dir.create() is silent when it already exists.
dir.create("~/Documents/Python/baby-names/images", recursive = TRUE, showWarnings = FALSE)
setwd("~/Documents/Python/baby-names/images")
png('p1.png', units = 'in', width = 8, height = 5, res = 500)
# print() is required: a bare `p1` is only auto-printed at the interactive top
# level, so running this file through source() would leave the PNG empty.
print(p1)
dev.off()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment