nacnudus/all-black-baby-names.R

## all-black-baby-names.R
library(tidyxl)
library(tidyverse)
library(unpivotr)
library(rvest)
library(stringr)
library(ggrepel)

download.file("https://smartstart.services.govt.nz/assets/files/Top-baby-names-1954-2016.xlsx",
              destfile = "babynames.xlsx", mode = "wb")
babies <- tidy_xlsx("babynames.xlsx")
girls <- babies$data[["Girls\' Names"]]
boys <- babies$data[["Boys\' Names"]]

# Check that the formatting hasn't interfered with the data
girls %>%
  ggplot(aes(col, row, fill = style_format, alpha = is.na(content))) +
  geom_tile() +
  scale_y_reverse() +
  scale_alpha_manual(values = c(1, .5))

girls %>%
  ggplot(aes(col, row, fill = data_type, alpha = is.na(content))) +
  geom_tile() +
  scale_y_reverse() +
  scale_alpha_manual(values = c(1, .5))

# Import the data
get_data <- function(.sheet) {
  .ranks <- # 2nd column
    .sheet %>%
    filter(col == 2, !is.na(numeric)) %>%
    select(row, col, rank = numeric)
  .years <- # 5th row
    .sheet %>%
    filter(row == 5, is.na(character), !is.na(numeric)) %>%
    group_by(`row`, numeric) %>% # Eliminate a duplicate 1985 in the boys' sheet
    summarise(col = first(col)) %>%
    ungroup %>%
    select(row, col, year = numeric) %>%
    split(.$col)
  .names <- # Columns of cells with text in them
    .sheet %>%
    filter(row >= 8, row <= 107, col >= 3, !is.na(character)) %>%
    select(row, col, name = character) %>%
    split(.$col)
  .counts <- # Columns of cells with numbers in them
    .sheet %>%
    filter(row >= 8, row <= 107, col >= 3, !is.na(numeric)) %>%
    select(row, col, count = numeric) %>%
    split(.$col)
  # Treat as small multiples, and map over them
  .all <- list(.years, .names, .counts)
  pmap_df(.all,
          function(.year, .names, .counts) {
            .counts %>%
              E(.names) %>%
              W(.ranks) %>%
              ABOVE(.year)
          }) %>%
  select(-row, -col)
}

girls_names <- get_data(girls)
boys_names <- get_data(boys)

# Get the names of All Blacks
allblacks_html <- read_html("https://en.wikipedia.org/wiki/List_of_New_Zealand_national_rugby_union_players")

allblacks <-
  allblacks_html %>%
  html_node(xpath = "//table[@class='wikitable sortable']") %>%
  html_table() %>%
  mutate(Name = str_sub(Name, start = ceiling(str_length(Name)/2) + 1),
         firstname = str_extract_all(Name, boundary("word")),
         firstname = map_chr(firstname, ~ .[1]))

# write_csv(allblacks, "allblacks.csv")

# allblacks <- read_csv("allblacks.csv")

allblacks1954 <- filter(allblacks, `Year of\ndebut` >= 1954)

# Get the All Blacks with names that were No. 1 at some point
top_allblack_names <-
  allblacks1954 %>%
  inner_join(boys_names, by = c("firstname" = "name")) %>%
  group_by(firstname) %>%
  summarise(minrank = min(rank)) %>%
  arrange(minrank) %>%
  filter(minrank == 1) %>%
  inner_join(allblacks1954, by = c("firstname" = "firstname")) %>%
  select(firstname, Name, `Year of\ndebut`, `Test\ncaps`) %>%
  mutate(`Year of\ndebut`  = as.integer(`Year of\ndebut`),
         `Test\ncaps`  = as.integer(`Test\ncaps`)) %>%
  as.data.frame

# Get the whole history of those names
top_allblacks_names_history <-
  top_allblack_names %>%
  distinct(firstname) %>%
  inner_join(boys_names, by = c("firstname" = "name"))

# Plot
top_allblack_names %>%
  filter(`Test\ncaps` >= 10) %>%
  ggplot(aes(`Year of\ndebut`, 1500, label = Name, colour = firstname)) +
  geom_text_repel() +
  geom_point() +
  geom_line(data = top_allblacks_names_history,
            aes(year, count, label = NA)) +
  ggtitle("All Black Baby Name Popularity", subtitle = "Data: https://smartstart.services.govt.nz/assets/files/Top-baby-names-1954-2016.xlsx") +
  xlab("Year, Year of Debut") +
  ylab("Number of New Zealand babies") +
  scale_colour_discrete(name = "First Name") +
  theme_minimal()

ggsave("all-black-babies.png")
	library(tidyxl)
	library(tidyverse)
	library(unpivotr)
	library(rvest)
	library(stringr)
	library(ggrepel)

	download.file("https://smartstart.services.govt.nz/assets/files/Top-baby-names-1954-2016.xlsx",
	destfile = "babynames.xlsx", mode = "wb")
	babies <- tidy_xlsx("babynames.xlsx")
	girls <- babies$data[["Girls\' Names"]]
	boys <- babies$data[["Boys\' Names"]]

	# Check that the formatting hasn't interfered with the data
	girls %>%
	ggplot(aes(col, row, fill = style_format, alpha = is.na(content))) +
	geom_tile() +
	scale_y_reverse() +
	scale_alpha_manual(values = c(1, .5))

	girls %>%
	ggplot(aes(col, row, fill = data_type, alpha = is.na(content))) +
	geom_tile() +
	scale_y_reverse() +
	scale_alpha_manual(values = c(1, .5))

	# Import the data
	get_data <- function(.sheet) {
	.ranks <- # 2nd column
	.sheet %>%
	filter(col == 2, !is.na(numeric)) %>%
	select(row, col, rank = numeric)
	.years <- # 5th row
	.sheet %>%
	filter(row == 5, is.na(character), !is.na(numeric)) %>%
	group_by(`row`, numeric) %>% # Eliminate a duplicate 1985 in the boys' sheet
	summarise(col = first(col)) %>%
	ungroup %>%
	select(row, col, year = numeric) %>%
	split(.$col)
	.names <- # Columns of cells with text in them
	.sheet %>%
	filter(row >= 8, row <= 107, col >= 3, !is.na(character)) %>%
	select(row, col, name = character) %>%
	split(.$col)
	.counts <- # Columns of cells with numbers in them
	.sheet %>%
	filter(row >= 8, row <= 107, col >= 3, !is.na(numeric)) %>%
	select(row, col, count = numeric) %>%
	split(.$col)
	# Treat as small multiples, and map over them
	.all <- list(.years, .names, .counts)
	pmap_df(.all,
	function(.year, .names, .counts) {
	.counts %>%
	E(.names) %>%
	W(.ranks) %>%
	ABOVE(.year)
	}) %>%
	select(-row, -col)
	}

	girls_names <- get_data(girls)
	boys_names <- get_data(boys)

	# Get the names of All Blacks
	allblacks_html <- read_html("https://en.wikipedia.org/wiki/List_of_New_Zealand_national_rugby_union_players")

	allblacks <-
	allblacks_html %>%
	html_node(xpath = "//table[@class='wikitable sortable']") %>%
	html_table() %>%
	mutate(Name = str_sub(Name, start = ceiling(str_length(Name)/2) + 1),
	firstname = str_extract_all(Name, boundary("word")),
	firstname = map_chr(firstname, ~ .[1]))

	# write_csv(allblacks, "allblacks.csv")

	# allblacks <- read_csv("allblacks.csv")

	allblacks1954 <- filter(allblacks, `Year of\ndebut` >= 1954)

	# Get the All Blacks with names that were No. 1 at some point
	top_allblack_names <-
	allblacks1954 %>%
	inner_join(boys_names, by = c("firstname" = "name")) %>%
	group_by(firstname) %>%
	summarise(minrank = min(rank)) %>%
	arrange(minrank) %>%
	filter(minrank == 1) %>%
	inner_join(allblacks1954, by = c("firstname" = "firstname")) %>%
	select(firstname, Name, `Year of\ndebut`, `Test\ncaps`) %>%
	mutate(`Year of\ndebut` = as.integer(`Year of\ndebut`),
	`Test\ncaps` = as.integer(`Test\ncaps`)) %>%
	as.data.frame

	# Get the whole history of those names
	top_allblacks_names_history <-
	top_allblack_names %>%
	distinct(firstname) %>%
	inner_join(boys_names, by = c("firstname" = "name"))

	# Plot
	top_allblack_names %>%
	filter(`Test\ncaps` >= 10) %>%
	ggplot(aes(`Year of\ndebut`, 1500, label = Name, colour = firstname)) +
	geom_text_repel() +
	geom_point() +
	geom_line(data = top_allblacks_names_history,
	aes(year, count, label = NA)) +
	ggtitle("All Black Baby Name Popularity", subtitle = "Data: https://smartstart.services.govt.nz/assets/files/Top-baby-names-1954-2016.xlsx") +
	xlab("Year, Year of Debut") +
	ylab("Number of New Zealand babies") +
	scale_colour_discrete(name = "First Name") +
	theme_minimal()

	ggsave("all-black-babies.png")