Skip to content

Instantly share code, notes, and snippets.

@dodger487
Created June 22, 2023 17:49
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save dodger487/c28d9c6ce0bb5ac94edf853c98ad1bca to your computer and use it in GitHub Desktop.
Save dodger487/c28d9c6ce0bb5ac94edf853c98ad1bca to your computer and use it in GitHub Desktop.
library(babynames)
library(tidyverse)
library(scales)
theme_set(theme_bw())
################################################################################
## Tweets Day 1
# First thing I posted...
# Popular names, with name at year of highest popularity.
babynames %>%
group_by(sex, name) %>%
mutate(
max_n = max(n),
top_year = max(if_else(n == max_n, year, 0)),
sum_n = sum(n)
) %>%
filter(sum_n > 1e6) %>%
ungroup() %>%
ggplot(aes(x = year, y = n, group = name)) +
geom_line(alpha = 0.1) +
geom_text(aes(x = top_year, y = max_n, label = name)) +
# geom_text() +
facet_wrap(~sex, nrow=2) +
scale_y_continuous("Number births", labels = scales::label_number_si()) +
xlab("") +
ggtitle("Popular Names")
img_height <- 6
ggsave("fig/year_births_popular_names.png",
units = "in",
height = img_height, width = (16 / 9) * img_height)
# Show example of dataset
babynames %>%
head
# Overall births by sex
babynames %>%
group_by(year, sex) %>%
summarize(total = sum(n)) %>%
ggplot(aes(x = year, y = total, color = sex)) +
geom_line() +
scale_y_continuous("Number Births", labels = scales::label_number_si()) +
xlab("Year") +
ggtitle("Births per Year")
img_height <- 3
ggsave("fig/year_births_bysex.png",
units = "in",
height = img_height, width = (16 / 9) * img_height)
# Overall proportion of births by sex
babynames %>%
group_by(year, sex) %>%
summarize(total_prop = sum(prop)) %>%
ggplot(aes(x = year, y = total_prop, color = sex)) +
geom_line() +
scale_y_continuous("Proportion of all births", labels = scales::percent) +
xlab("Year") +
ggtitle("Percent of All Births in Dataset")
img_height <- 3
ggsave("fig/year_totalproportion_bysex.png",
units = "in",
height = img_height, width = (16 / 9) * img_height)
# Most popular names
babynames %>%
group_by(name, sex) %>%
summarize(total = sum(n)) %>%
arrange(-total)
# Ratio of male to female births
babynames %>%
group_by(year, sex) %>%
summarize(
total_prop = sum(prop),
total_n = sum(n),
estimated_total = total_n / total_prop
) %>%
group_by(year) %>%
arrange(sex) %>%
summarize(ratio = last(estimated_total) / first(estimated_total)) %>%
ggplot(aes(x = year, y = ratio)) +
geom_line() +
geom_hline(yintercept = 1, linetype=3) +
scale_y_continuous("Ratio of Male : Female Births") +
xlab("Year")
img_height <- 3
ggsave("fig/year_ratiomalefemale.png",
units = "in",
height = img_height, width = (16 / 9) * img_height)
# Look at estimated number of births
babynames %>%
group_by(year, sex) %>%
summarize(
total_prop = sum(prop),
total_n = sum(n),
estimated_total = total_n / total_prop
) %>%
ungroup() %>%
ggplot(aes(x = year, y = estimated_total, color = sex)) +
geom_line() +
scale_y_continuous("Estimated number of births", labels = scales::label_number_si()) +
xlab("Year")
# ggtitle("Percent of All Births in Dataset")
# Total estimated births part 2
# Combine babynames and births dataset into one df.
babynames %>%
group_by(year, sex) %>%
summarize(
total_prop = sum(prop),
total_n = sum(n),
estimated_total = total_n / total_prop
) %>%
ungroup() %>%
group_by(year) %>%
summarize(births = sum(estimated_total)) %>%
mutate(dataset = "babynames") %>%
rbind(., mutate(births, dataset = "births")) %>%
ggplot(aes(x = year, y = births, color = dataset)) +
geom_line() +
scale_y_continuous("Estimated number of births", labels = scales::label_number_si()) +
xlab("")
img_height <- 3
ggsave("fig/year_estimatedbirths_bydataset.png",
units = "in",
height = img_height, width = (16 / 9) * img_height)
################################################################################
## Tweets Day 2
groomsquad <- c('Christopher', 'Daniel', 'Adam', 'Benjamin', 'Alexander')
babynames %>%
filter(year >= 1920) %>%
filter(sex == 'M') %>%
filter(name %in% groomsquad) %>%
ggplot(aes(x = year, y = n, color = name)) +
geom_line() +
geom_vline(xintercept = 1988, linetype = 3) +
scale_y_continuous("Number of births", labels = scales::label_number_si()) +
xlab("")
img_height <- 3
ggsave("fig/year_births_groomnamees.png",
units = "in",
height = img_height, width = (16 / 9) * img_height)
# Most popular names
total_popularity <- babynames %>%
group_by(name, sex) %>%
summarize(total = sum(n)) %>%
arrange(-total) %>%
ungroup()
total_popularity %>% head(10)
TOP_N <- 10
babynames %>%
filter(year >= 1920) %>%
inner_join(.,
(total_popularity %>%
head(TOP_N) %>%
select(name, sex))) %>%
ggplot(aes(
x = year, y = n,
color = sex, group = interaction(name, sex))) +
geom_line(alpha = 0.8) +
scale_y_continuous("Number of births", labels = scales::label_number_si()) +
xlab("") +
ggtitle("Births / Year for 10 Most Popular Names")
img_height <- 3
ggsave("fig/year_births_top10names.png",
units = "in",
height = img_height, width = (16 / 9) * img_height)
babynames %>%
filter(year >= 1920) %>%
inner_join(.,
(total_popularity %>%
head(TOP_N) %>%
select(name, sex))) %>%
ggplot(aes(
x = year, y = prop,
color = sex, group = interaction(name, sex))) +
geom_line(alpha = 0.8) +
scale_y_continuous("Number of births", labels = scales::percent) +
xlab("") +
ggtitle("Births / Year for 10 Most Popular Names")
img_height <- 3
ggsave("fig/year_proportion_top10names.png",
units = "in",
height = img_height, width = (16 / 9) * img_height)
################################################################################
## Tweets Day 3
# Lazy CDF
# Isn't totally accurate because doesn't include names not in the dataset
total_popularity %>%
ggplot(aes(x = total, color = sex)) +
stat_ecdf() +
scale_x_log10("Number names, ranked by popularity", labels = scales::label_number_si()) +
scale_y_continuous("Cumulative % of births", labels = scales::percent) +
ggtitle("CDF of Name Popularity, All Time")
img_height <- 3
ggsave("fig/rankednames_cumulativepercent_gender.png",
units = "in",
height = img_height, width = (16 / 9) * img_height)
# CDFs
babynames %>%
filter(year >= 1920) %>%
filter(year %% 20 == 0) %>%
ggplot(aes(x = n, color = as.factor(year))) +
stat_ecdf() +
scale_x_log10(labels = scales::label_number_si()) +
scale_y_continuous(labels = scales::percent) +
facet_wrap(~ sex, nrow = 2)
babynames %>%
inner_join(., births) %>%
mutate(prop2 = n / if_else(sex == "M", births * (1.05 / 2.05), births / 2.05)) %>%
arrange(-year)
# CDF of all names, mixing genders
# Using overall births number
babynames %>%
inner_join(., births) %>%
mutate(prop = n / births) %>%
arrange(-prop) %>%
mutate(cumulative_prop = cumsum(prop), rank = row_number()) %>%
NULL
# CDF of name popularity by year, discrete decades
babynames %>%
group_by(year, sex) %>%
arrange(-prop) %>%
mutate(cumulative_prop = cumsum(prop), rank = row_number()) %>%
ungroup() %>%
arrange(year) %>%
filter(year >= 1920) %>%
filter(year %% 10 == 0) %>%
arrange(-year) %>%
mutate(year = as.factor(year)) %>%
ggplot(aes(x = rank, y = cumulative_prop, color = year)) +
# geom_line(size = 1.5, alpha = 0.5) +
geom_step(size = 0.75, alpha = 0.8) +
scale_x_log10("Number names, ranked by popularity", labels = scales::label_number_si()) +
scale_y_continuous("Cumulative % of births", labels = scales::percent) +
# scale_color_brewer(palette = "RdYlBu") +
scale_color_viridis_d(option = "plasma") +
facet_wrap(~ sex, nrow = 2) +
# facet_wrap(~ sex) + # Rows is better, easier to compare F to M
theme_dark() +
# theme_gray() +
NULL
img_height <- 5
ggsave("fig/rankednames_cumulativepercent_decadediscrete_gender.png",
units = "in",
height = img_height, width = (12 / 9) * img_height)
# CDF of name popularity by year, all years
babynames %>%
group_by(year, sex) %>%
arrange(-prop) %>%
mutate(cumulative_prop = cumsum(prop), rank = row_number()) %>%
ungroup() %>%
arrange(year) %>%
filter(year >= 1920) %>%
ggplot(aes(x = rank, y = cumulative_prop, color = year, group = year)) +
geom_line(size = 1.5, alpha = 0.3) +
# geom_step(size = 0.5) +
scale_x_log10("Number names, ranked by popularity", labels = scales::label_number_si()) +
scale_y_continuous("Cumulative % of births", labels = scales::percent) +
scale_color_distiller(palette = "RdYlBu") +
facet_wrap(~ sex, nrow = 2) +
# facet_wrap(~ sex) + # Rows is better, easier to compare F to M
theme_dark() +
# theme_gray() +
NULL
img_height <- 3
ggsave("fig/rankednames_cumulativepercent_yearcontinuous_gender.png",
units = "in",
height = img_height, width = (16 / 9) * img_height)
################################################################################
## Tweets Day 4
bucketed_weight <- babynames %>%
group_by(year, sex) %>%
arrange(-n) %>%
mutate(rank = row_number(),
bucket = floor(log10(rank + .01)) + 1) %>%
ungroup() %>%
group_by(year, sex, bucket) %>%
summarize(total_prop = sum(prop)) %>%
# mutate(bucket = if_else(bucket == 5, 4, bucket)) %>%
group_by(year, sex) %>%
mutate(total_prop = if_else(bucket == 4, 1 - sum(if_else(bucket <= 3, total_prop, 0)), total_prop)) %>%
ungroup() %>%
filter(bucket < 5) %>%
mutate(bucket = 10^bucket,
bucket = as.factor(bucket),
bucket = fct_rev(bucket))
bucketed_weight %>%
tail(10)
bucketed_weight %>%
filter(year >= 1920) %>%
ggplot(aes(x = year, y = total_prop, fill = bucket)) +
geom_area() +
scale_y_continuous("% of births", labels = scales::percent) +
xlab("") +
facet_wrap(~ sex, nrow = 2) +
guides(fill=guide_legend(title="Name Rank")) +
ggtitle("% of Births in Top N Ranked Names") +
theme_minimal()
img_height <- 6
ggsave("fig/year_totalpercent_fillbypopularity.png",
units = "in",
height = img_height, width = (3 / 4) * img_height)
bucketed_weight %>%
mutate(bucket = fct_rev(bucket)) %>%
filter(year >= 1920) %>%
ggplot(aes(x = year, y = total_prop, color = sex)) +
geom_line() +
scale_y_continuous("% of births", labels = scales::percent) +
xlab("") +
facet_wrap(~bucket)
img_height <- 3
ggsave("fig/year_totalpercent_linebygender_facetbybucket.png",
units = "in",
height = img_height, width = (16 / 9) * img_height)
babynames %>%
filter(year == 2017) %>%
group_by(year, sex) %>%
arrange(-prop) %>%
mutate(cumulative_prop = cumsum(prop), rank = row_number()) %>%
ungroup() %>%
filter(rank %in% c(1, 10, 100, 1000, 10000)) %>%
arrange(rank, sex)
median_names <- babynames %>%
group_by(year, sex) %>%
arrange(-prop) %>%
mutate(cumulative_prop = cumsum(prop), rank = row_number()) %>%
ungroup() %>%
arrange(year, sex, cumulative_prop) %>%
filter(cumulative_prop >= .5) %>%
group_by(year, sex) %>%
slice_min(cumulative_prop)
median_names %>%
ggplot(aes(x = year, y = rank, color = sex)) +
geom_line() +
geom_text(
aes(x = year, y = rank, label = name),
nudge_y = 5,
data = filter(median_names, (year %% 10 == 0) || year == 2017)
) +
scale_y_continuous("Median Name Rank") +
xlab("") +
NULL
img_height <- 6
ggsave("fig/year_mediannamerank_gender_withnames.png",
units = "in",
height = img_height, width = (16 / 9) * img_height)
babynames %>%
filter(name == "Eva", sex == "F") %>%
ggplot(aes(x=year, y = n)) + geom_line()
babynames %>%
filter(name == "James", sex == "M") %>%
ggplot(aes(x=year, y = n)) + geom_line()
babynames %>%
filter(year == 2017) %>%
group_by(year, sex) %>%
arrange(-prop) %>%
mutate(cumulative_prop = cumsum(prop), rank = row_number()) %>%
ungroup() %>%
filter(rank > 0, rank < 400) %>%
filter(sex == "M") %>%
arrange(rank, sex) %>%
View
babynames %>%
filter(year == 2017) %>%
group_by(year, sex) %>%
arrange(-prop) %>%
mutate(cumulative_prop = cumsum(prop), rank = row_number()) %>%
filter(name == "Bowen")
babynames %>%
filter(sex == "M") %>%
filter(year > 1990) %>%
group_by(name) %>%
summarize(total = sum(n)) %>%
arrange(-total) %>%
View
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment