# erikgregorywebb/ssa-baby-names.R

## ssa-baby-names.R
    # https://www.ssa.gov/oact/babynames/limits.html

    # clear the workspace: remove every object that ls() reports in the current
    # environment (no files or directories are deleted)
    rm(list = ls())
    # work from the project folder; the relative 'names.zip' download below lands here
    setwd("~/Documents/Python/baby-names")

    # call libraries
    library(dplyr)
    library(ggplot2)
    library(stringr)
    library(scales)
    library(gridExtra)

    ### IMPORT, CLEAN ###

    # download, unzip file
    file <- 'https://www.ssa.gov/oact/babynames/names.zip'
    download.file(file, destfile = 'names.zip', quiet = TRUE)
    unzip('names.zip', exdir = '~/Documents/Python/baby-names/years')

    # determine list of file names
    # keep only the per-year files named yobYYYY.txt: the year is cut from
    # characters 4-7 of the file name below, so any other .txt file would
    # produce a bogus year
    setwd("~/Documents/Python/baby-names/years")
    files <- list.files(pattern = "^yob[0-9]{4}\\.txt$")
    if (length(files) == 0) {
      stop("no yobYYYY.txt files found in ", getwd(), call. = FALSE)
    }

    # compile data: one data frame per year file, stacked at the end
    datalist <- vector("list", length(files))
    for (i in seq_along(files)) {
      path <- file.path(getwd(), files[i])
      temp <- read.delim(file = path, sep = ',', header = FALSE)
      temp$year <- substr(files[i], 4, 7)
      datalist[[i]] <- temp
    }
    names <- do.call(rbind, datalist)

    # clean
    colnames(names) <- c('name', 'gender', 'count', 'year')
    names$year <- as.numeric(names$year)

    # save a copy (dated CSV in the project folder)
    setwd("~/Documents/Python/baby-names/")
    filename <- paste0('baby-names-', Sys.Date(), '.csv')
    write.csv(names, file = filename, row.names = FALSE, na = "")

    ### VISUALIZE ###

    # declare functions
    # Translate gender codes into display labels.
    # 'M' becomes 'Male'; every other non-missing value becomes 'Female'.
    # Works element-wise, so a vector of codes gives a vector of labels.
    gender <- function(gender_input) {
      is_male <- gender_input == 'M'
      ifelse(is_male, 'Male', 'Female')
    }

    # Line chart of the yearly count for one name and one gender.
    #
    # Args:
    #   name_input:   name to plot; matched with `==` against the `name` column.
    #   gender_input: gender code matched against the `gender` column; the title
    #                 shows 'Male' for 'M' and 'Female' for anything else.
    #   birth_year:   x position of the vertical reference line.
    #
    # Returns the ggplot object (nothing is drawn until it is printed).
    # Reads the global data frame `names` (columns name, gender, count, year).
    popularity <- function(name_input, gender_input, birth_year) {
      t <- names %>% filter(name == name_input & gender == gender_input)
      # The data is downloaded on every run, so report the span of years that
      # was actually loaded rather than a fixed "1880 - 2017".
      year_span <- paste(min(names$year, na.rm = TRUE),
                         max(names$year, na.rm = TRUE), sep = ' - ')
      # Columns are referenced by bare name inside aes() so ggplot2 resolves
      # them in `t`; `size` (rather than `linewidth`) is kept so older ggplot2
      # versions still work.
      p <- ggplot(t, aes(x = year, y = count)) +
        geom_line(aes(color = name), size = 1) +
        geom_vline(xintercept = birth_year) +
        scale_color_manual('', values = c("#00AFBB")) +
        scale_y_continuous(labels = comma) +
        #guides(fill = FALSE, color = FALSE, linetype = FALSE, shape = FALSE) +
        labs(title = paste(name_input, gender(gender_input), sep = ', '),
             subtitle = paste0('Name Popularity Over Time (Count), ', year_span)) +
        ylab('') + xlab('') +
        theme_minimal() + theme(plot.title = element_text(face = 'bold'))
      p
    }

    # Line chart comparing the yearly counts of two name/gender pairs.
    #
    # Args:
    #   name_input_1, gender_input_1: first name and its gender code.
    #   name_input_2, gender_input_2: second name and its gender code.
    #
    # The title reads "<name 2>, <gender 2> vs. <name 1>, <gender 1>".
    # Returns the ggplot object (nothing is drawn until it is printed).
    # Reads the global data frame `names` (columns name, gender, count, year).
    compare_popularity <- function(name_input_1, gender_input_1, name_input_2, gender_input_2) {
      t <- names %>%
        filter((name == name_input_1 & gender == gender_input_1) |
                 (name == name_input_2 & gender == gender_input_2))
      # Colour normally follows the name. When the same name is compared across
      # two genders, colouring by name alone would merge both series into one
      # zig-zag line, so label each series "name, Gender" in that case only.
      if (identical(name_input_1, name_input_2) &&
          !identical(gender_input_1, gender_input_2)) {
        t$series <- paste(t$name, gender(t$gender), sep = ', ')
      } else {
        t$series <- t$name
      }
      # The data is downloaded on every run, so report the span of years that
      # was actually loaded rather than a fixed "1880 - 2017".
      year_span <- paste(min(names$year, na.rm = TRUE),
                         max(names$year, na.rm = TRUE), sep = ' - ')
      # Columns are referenced by bare name inside aes() so ggplot2 resolves
      # them in `t`.
      p <- ggplot(t, aes(x = year, y = count)) +
        geom_line(aes(color = series), size = 1) +
        scale_color_manual('', values = c('#00AFBB', 'gold2')) +
        scale_y_continuous(labels = comma) +
        labs(title = paste(paste(name_input_2, gender(gender_input_2), sep = ', '), 'vs.',
                           paste(name_input_1, gender(gender_input_1), sep = ', ')),
             subtitle = paste0('Name Comparison, Popularity Over Time (Count), ', year_span)) +
        ylab('') + xlab('') +
        theme_minimal() + theme(plot.title = element_text(face = 'bold'))
      p
    }

    # blog plot examples
    setwd("~/Documents/Python/baby-names/images")

    p1 <- compare_popularity('Erik', 'M', 'Eric', 'M')
    p2 <- popularity('Erik', 'M', 1994)
    p3 <- compare_popularity('Ashley', 'F', 'Erik', 'M')

    p4 <- popularity('Olivia', 'F', 2000)
    p5 <- popularity('Natalie', 'F', 2002)
    p6 <- popularity('Grace', 'F', 2004)
    p7 <- popularity('Sophia', 'F', 2010)
    p8 <- popularity('Natalie', 'F', 1995)
    p9 <- popularity('Tabitha', 'F', 2002)

    p10 <- compare_popularity('Liam', 'M', 'Emma', 'F')

    # export images (only p1 is written here; p2-p10 are built but not saved)
    setwd("~/Documents/Python/baby-names/images")
    png('p1.png', units = 'in', width = 8, height = 5, res = 500)
    # print() explicitly: a bare `p1` is only drawn when the top-level value is
    # auto-printed, which does not happen when the script is run via source()
    print(p1)
    dev.off()
	# https://www.ssa.gov/oact/babynames/limits.html

	# clear the workspace: remove every object that ls() reports in the current
	# environment (no files or directories are deleted)
	rm(list = ls())
	# work from the project folder; the relative 'names.zip' download below lands here
	setwd("~/Documents/Python/baby-names")

	# call libraries
	library(dplyr)
	library(stringr)
	library(scales)
	library(gridExtra)

	### IMPORT, CLEAN ###

	# download, unzip file
	file <- 'https://www.ssa.gov/oact/babynames/names.zip'
	download.file(file, destfile = 'names.zip', quiet = TRUE)
	unzip('names.zip', exdir = '~/Documents/Python/baby-names/years')

	# determine list of file names
	# keep only the per-year files named yobYYYY.txt: the year is cut from
	# characters 4-7 of the file name below, so any other .txt file would
	# produce a bogus year
	setwd("~/Documents/Python/baby-names/years")
	files <- list.files(pattern = "^yob[0-9]{4}\\.txt$")
	if (length(files) == 0) {
	  stop("no yobYYYY.txt files found in ", getwd(), call. = FALSE)
	}

	# compile data: one data frame per year file, stacked at the end
	datalist <- vector("list", length(files))
	for (i in seq_along(files)) {
	  path <- file.path(getwd(), files[i])
	  temp <- read.delim(file = path, sep = ',', header = FALSE)
	  temp$year <- substr(files[i], 4, 7)
	  datalist[[i]] <- temp
	}
	names <- do.call(rbind, datalist)

	# clean
	colnames(names) <- c('name', 'gender', 'count', 'year')
	names$year <- as.numeric(names$year)

	# save a copy (dated CSV in the project folder)
	setwd("~/Documents/Python/baby-names/")
	filename <- paste0('baby-names-', Sys.Date(), '.csv')
	write.csv(names, file = filename, row.names = FALSE, na = "")

	### VISUALIZE ###

	# declare functions
	# Map gender codes to display labels, element by element:
	# 'M' gives 'Male', any other non-missing value gives 'Female'.
	gender <- function(gender_input) {
	  ifelse(gender_input == 'M', yes = 'Male', no = 'Female')
	}

	# Line chart of the yearly count for one name and one gender.
	#
	# Args:
	#   name_input:   name to plot; matched with `==` against the `name` column.
	#   gender_input: gender code matched against the `gender` column; the title
	#                 shows 'Male' for 'M' and 'Female' for anything else.
	#   birth_year:   x position of the vertical reference line.
	#
	# Returns the ggplot object (nothing is drawn until it is printed).
	# Reads the global data frame `names` (columns name, gender, count, year).
	popularity <- function(name_input, gender_input, birth_year) {
	  t <- names %>% filter(name == name_input & gender == gender_input)
	  # The data is downloaded on every run, so report the span of years that
	  # was actually loaded rather than a fixed "1880 - 2017".
	  year_span <- paste(min(names$year, na.rm = TRUE),
	                     max(names$year, na.rm = TRUE), sep = ' - ')
	  # Columns are referenced by bare name inside aes() so ggplot2 resolves
	  # them in `t`; `size` (rather than `linewidth`) is kept so older ggplot2
	  # versions still work.
	  p <- ggplot(t, aes(x = year, y = count)) +
	    geom_line(aes(color = name), size = 1) +
	    geom_vline(xintercept = birth_year) +
	    scale_color_manual('', values = c("#00AFBB")) +
	    scale_y_continuous(labels = comma) +
	    #guides(fill = FALSE, color = FALSE, linetype = FALSE, shape = FALSE) +
	    labs(title = paste(name_input, gender(gender_input), sep = ', '),
	         subtitle = paste0('Name Popularity Over Time (Count), ', year_span)) +
	    ylab('') + xlab('') +
	    theme_minimal() + theme(plot.title = element_text(face = 'bold'))
	  p
	}

	# Line chart comparing the yearly counts of two name/gender pairs.
	#
	# Args:
	#   name_input_1, gender_input_1: first name and its gender code.
	#   name_input_2, gender_input_2: second name and its gender code.
	#
	# The title reads "<name 2>, <gender 2> vs. <name 1>, <gender 1>".
	# Returns the ggplot object (nothing is drawn until it is printed).
	# Reads the global data frame `names` (columns name, gender, count, year).
	compare_popularity <- function(name_input_1, gender_input_1, name_input_2, gender_input_2) {
	  # Rows matching either pair (plain `|`; an escaped `\|` does not parse in R).
	  t <- names %>%
	    filter((name == name_input_1 & gender == gender_input_1) |
	             (name == name_input_2 & gender == gender_input_2))
	  # Colour normally follows the name. When the same name is compared across
	  # two genders, colouring by name alone would merge both series into one
	  # zig-zag line, so label each series "name, Gender" in that case only.
	  if (identical(name_input_1, name_input_2) &&
	      !identical(gender_input_1, gender_input_2)) {
	    t$series <- paste(t$name, gender(t$gender), sep = ', ')
	  } else {
	    t$series <- t$name
	  }
	  # The data is downloaded on every run, so report the span of years that
	  # was actually loaded rather than a fixed "1880 - 2017".
	  year_span <- paste(min(names$year, na.rm = TRUE),
	                     max(names$year, na.rm = TRUE), sep = ' - ')
	  # Columns are referenced by bare name inside aes() so ggplot2 resolves
	  # them in `t`.
	  p <- ggplot(t, aes(x = year, y = count)) +
	    geom_line(aes(color = series), size = 1) +
	    scale_color_manual('', values = c('#00AFBB', 'gold2')) +
	    scale_y_continuous(labels = comma) +
	    labs(title = paste(paste(name_input_2, gender(gender_input_2), sep = ', '), 'vs.',
	                       paste(name_input_1, gender(gender_input_1), sep = ', ')),
	         subtitle = paste0('Name Comparison, Popularity Over Time (Count), ', year_span)) +
	    ylab('') + xlab('') +
	    theme_minimal() + theme(plot.title = element_text(face = 'bold'))
	  p
	}

	# blog plot examples
	setwd("~/Documents/Python/baby-names/images")

	p1 <- compare_popularity('Erik', 'M', 'Eric', 'M')
	p2 <- popularity('Erik', 'M', 1994)
	p3 <- compare_popularity('Ashley', 'F', 'Erik', 'M')

	p4 <- popularity('Olivia', 'F', 2000)
	p5 <- popularity('Natalie', 'F', 2002)
	p6 <- popularity('Grace', 'F', 2004)
	p7 <- popularity('Sophia', 'F', 2010)
	p8 <- popularity('Natalie', 'F', 1995)
	p9 <- popularity('Tabitha', 'F', 2002)

	p10 <- compare_popularity('Liam', 'M', 'Emma', 'F')

	# export images (only p1 is written here; p2-p10 are built but not saved)
	setwd("~/Documents/Python/baby-names/images")
	png('p1.png', units = 'in', width = 8, height = 5, res = 500)
	# print() explicitly: a bare `p1` is only drawn when the top-level value is
	# auto-printed, which does not happen when the script is run via source()
	print(p1)
	dev.off()