Skip to content

Instantly share code, notes, and snippets.

@nacnudus
Created February 1, 2017 23:34
Show Gist options
  • Save nacnudus/116f56521fc2a746b1c873bc6addd374 to your computer and use it in GitHub Desktop.
Save nacnudus/116f56521fc2a746b1c873bc6addd374 to your computer and use it in GitHub Desktop.
library(tidyxl)
library(tidyverse)
library(unpivotr)
library(rvest)
library(stringr)
library(ggrepel)
download.file("https://smartstart.services.govt.nz/assets/files/Top-baby-names-1954-2016.xlsx",
destfile = "babynames.xlsx", mode = "wb")
babies <- tidy_xlsx("babynames.xlsx")
girls <- babies$data[["Girls\' Names"]]
boys <- babies$data[["Boys\' Names"]]
# Check that the formatting hasn't interfered with the data
girls %>%
ggplot(aes(col, row, fill = style_format, alpha = is.na(content))) +
geom_tile() +
scale_y_reverse() +
scale_alpha_manual(values = c(1, .5))
girls %>%
ggplot(aes(col, row, fill = data_type, alpha = is.na(content))) +
geom_tile() +
scale_y_reverse() +
scale_alpha_manual(values = c(1, .5))
# Import the data
get_data <- function(.sheet) {
.ranks <- # 2nd column
.sheet %>%
filter(col == 2, !is.na(numeric)) %>%
select(row, col, rank = numeric)
.years <- # 5th row
.sheet %>%
filter(row == 5, is.na(character), !is.na(numeric)) %>%
group_by(`row`, numeric) %>% # Eliminate a duplicate 1985 in the boys' sheet
summarise(col = first(col)) %>%
ungroup %>%
select(row, col, year = numeric) %>%
split(.$col)
.names <- # Columns of cells with text in them
.sheet %>%
filter(row >= 8, row <= 107, col >= 3, !is.na(character)) %>%
select(row, col, name = character) %>%
split(.$col)
.counts <- # Columns of cells with numbers in them
.sheet %>%
filter(row >= 8, row <= 107, col >= 3, !is.na(numeric)) %>%
select(row, col, count = numeric) %>%
split(.$col)
# Treat as small multiples, and map over them
.all <- list(.years, .names, .counts)
pmap_df(.all,
function(.year, .names, .counts) {
.counts %>%
E(.names) %>%
W(.ranks) %>%
ABOVE(.year)
}) %>%
select(-row, -col)
}
girls_names <- get_data(girls)
boys_names <- get_data(boys)
# Get the names of All Blacks
allblacks_html <- read_html("https://en.wikipedia.org/wiki/List_of_New_Zealand_national_rugby_union_players")
allblacks <-
allblacks_html %>%
html_node(xpath = "//table[@class='wikitable sortable']") %>%
html_table() %>%
mutate(Name = str_sub(Name, start = ceiling(str_length(Name)/2) + 1),
firstname = str_extract_all(Name, boundary("word")),
firstname = map_chr(firstname, ~ .[1]))
# write_csv(allblacks, "allblacks.csv")
# allblacks <- read_csv("allblacks.csv")
allblacks1954 <- filter(allblacks, `Year of\ndebut` >= 1954)
# Get the All Blacks with names that were No. 1 at some point
top_allblack_names <-
allblacks1954 %>%
inner_join(boys_names, by = c("firstname" = "name")) %>%
group_by(firstname) %>%
summarise(minrank = min(rank)) %>%
arrange(minrank) %>%
filter(minrank == 1) %>%
inner_join(allblacks1954, by = c("firstname" = "firstname")) %>%
select(firstname, Name, `Year of\ndebut`, `Test\ncaps`) %>%
mutate(`Year of\ndebut` = as.integer(`Year of\ndebut`),
`Test\ncaps` = as.integer(`Test\ncaps`)) %>%
as.data.frame
# Get the whole history of those names
top_allblacks_names_history <-
top_allblack_names %>%
distinct(firstname) %>%
inner_join(boys_names, by = c("firstname" = "name"))
# Plot
top_allblack_names %>%
filter(`Test\ncaps` >= 10) %>%
ggplot(aes(`Year of\ndebut`, 1500, label = Name, colour = firstname)) +
geom_text_repel() +
geom_point() +
geom_line(data = top_allblacks_names_history,
aes(year, count, label = NA)) +
ggtitle("All Black Baby Name Popularity", subtitle = "Data: https://smartstart.services.govt.nz/assets/files/Top-baby-names-1954-2016.xlsx") +
xlab("Year, Year of Debut") +
ylab("Number of New Zealand babies") +
scale_colour_discrete(name = "First Name") +
theme_minimal()
ggsave("all-black-babies.png")
@nacnudus
Copy link
Author

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment