Created
June 22, 2023 17:49
-
-
Save dodger487/c28d9c6ce0bb5ac94edf853c98ad1bca to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(babynames) | |
library(tidyverse) | |
library(scales) | |
theme_set(theme_bw()) | |
################################################################################ | |
## Tweets Day 1 | |
# First thing I posted... | |
# Popular names, with name at year of highest popularity. | |
babynames %>% | |
group_by(sex, name) %>% | |
mutate( | |
max_n = max(n), | |
top_year = max(if_else(n == max_n, year, 0)), | |
sum_n = sum(n) | |
) %>% | |
filter(sum_n > 1e6) %>% | |
ungroup() %>% | |
ggplot(aes(x = year, y = n, group = name)) + | |
geom_line(alpha = 0.1) + | |
geom_text(aes(x = top_year, y = max_n, label = name)) + | |
# geom_text() + | |
facet_wrap(~sex, nrow=2) + | |
scale_y_continuous("Number births", labels = scales::label_number_si()) + | |
xlab("") + | |
ggtitle("Popular Names") | |
img_height <- 6 | |
ggsave("fig/year_births_popular_names.png", | |
units = "in", | |
height = img_height, width = (16 / 9) * img_height) | |
# Show example of dataset | |
babynames %>% | |
head | |
# Overall births by sex | |
babynames %>% | |
group_by(year, sex) %>% | |
summarize(total = sum(n)) %>% | |
ggplot(aes(x = year, y = total, color = sex)) + | |
geom_line() + | |
scale_y_continuous("Number Births", labels = scales::label_number_si()) + | |
xlab("Year") + | |
ggtitle("Births per Year") | |
img_height <- 3 | |
ggsave("fig/year_births_bysex.png", | |
units = "in", | |
height = img_height, width = (16 / 9) * img_height) | |
# Overall proportion of births by sex | |
babynames %>% | |
group_by(year, sex) %>% | |
summarize(total_prop = sum(prop)) %>% | |
ggplot(aes(x = year, y = total_prop, color = sex)) + | |
geom_line() + | |
scale_y_continuous("Proportion of all births", labels = scales::percent) + | |
xlab("Year") + | |
ggtitle("Percent of All Births in Dataset") | |
img_height <- 3 | |
ggsave("fig/year_totalproportion_bysex.png", | |
units = "in", | |
height = img_height, width = (16 / 9) * img_height) | |
# Most popular names | |
babynames %>% | |
group_by(name, sex) %>% | |
summarize(total = sum(n)) %>% | |
arrange(-total) | |
# Ratio of male to female births | |
babynames %>% | |
group_by(year, sex) %>% | |
summarize( | |
total_prop = sum(prop), | |
total_n = sum(n), | |
estimated_total = total_n / total_prop | |
) %>% | |
group_by(year) %>% | |
arrange(sex) %>% | |
summarize(ratio = last(estimated_total) / first(estimated_total)) %>% | |
ggplot(aes(x = year, y = ratio)) + | |
geom_line() + | |
geom_hline(yintercept = 1, linetype=3) + | |
scale_y_continuous("Ratio of Male : Female Births") + | |
xlab("Year") | |
img_height <- 3 | |
ggsave("fig/year_ratiomalefemale.png", | |
units = "in", | |
height = img_height, width = (16 / 9) * img_height) | |
# Look at estimated number of births | |
babynames %>% | |
group_by(year, sex) %>% | |
summarize( | |
total_prop = sum(prop), | |
total_n = sum(n), | |
estimated_total = total_n / total_prop | |
) %>% | |
ungroup() %>% | |
ggplot(aes(x = year, y = estimated_total, color = sex)) + | |
geom_line() + | |
scale_y_continuous("Estimated number of births", labels = scales::label_number_si()) + | |
xlab("Year") | |
# ggtitle("Percent of All Births in Dataset") | |
# Total estimated births part 2 | |
# Combine babynames and births dataset into one df. | |
babynames %>% | |
group_by(year, sex) %>% | |
summarize( | |
total_prop = sum(prop), | |
total_n = sum(n), | |
estimated_total = total_n / total_prop | |
) %>% | |
ungroup() %>% | |
group_by(year) %>% | |
summarize(births = sum(estimated_total)) %>% | |
mutate(dataset = "babynames") %>% | |
rbind(., mutate(births, dataset = "births")) %>% | |
ggplot(aes(x = year, y = births, color = dataset)) + | |
geom_line() + | |
scale_y_continuous("Estimated number of births", labels = scales::label_number_si()) + | |
xlab("") | |
img_height <- 3 | |
ggsave("fig/year_estimatedbirths_bydataset.png", | |
units = "in", | |
height = img_height, width = (16 / 9) * img_height) | |
################################################################################ | |
## Tweets Day 2 | |
groomsquad <- c('Christopher', 'Daniel', 'Adam', 'Benjamin', 'Alexander') | |
babynames %>% | |
filter(year >= 1920) %>% | |
filter(sex == 'M') %>% | |
filter(name %in% groomsquad) %>% | |
ggplot(aes(x = year, y = n, color = name)) + | |
geom_line() + | |
geom_vline(xintercept = 1988, linetype = 3) + | |
scale_y_continuous("Number of births", labels = scales::label_number_si()) + | |
xlab("") | |
img_height <- 3 | |
ggsave("fig/year_births_groomnamees.png", | |
units = "in", | |
height = img_height, width = (16 / 9) * img_height) | |
# Most popular names | |
total_popularity <- babynames %>% | |
group_by(name, sex) %>% | |
summarize(total = sum(n)) %>% | |
arrange(-total) %>% | |
ungroup() | |
total_popularity %>% head(10) | |
TOP_N <- 10 | |
babynames %>% | |
filter(year >= 1920) %>% | |
inner_join(., | |
(total_popularity %>% | |
head(TOP_N) %>% | |
select(name, sex))) %>% | |
ggplot(aes( | |
x = year, y = n, | |
color = sex, group = interaction(name, sex))) + | |
geom_line(alpha = 0.8) + | |
scale_y_continuous("Number of births", labels = scales::label_number_si()) + | |
xlab("") + | |
ggtitle("Births / Year for 10 Most Popular Names") | |
img_height <- 3 | |
ggsave("fig/year_births_top10names.png", | |
units = "in", | |
height = img_height, width = (16 / 9) * img_height) | |
babynames %>% | |
filter(year >= 1920) %>% | |
inner_join(., | |
(total_popularity %>% | |
head(TOP_N) %>% | |
select(name, sex))) %>% | |
ggplot(aes( | |
x = year, y = prop, | |
color = sex, group = interaction(name, sex))) + | |
geom_line(alpha = 0.8) + | |
scale_y_continuous("Number of births", labels = scales::percent) + | |
xlab("") + | |
ggtitle("Births / Year for 10 Most Popular Names") | |
img_height <- 3 | |
ggsave("fig/year_proportion_top10names.png", | |
units = "in", | |
height = img_height, width = (16 / 9) * img_height) | |
################################################################################ | |
## Tweets Day 3 | |
# Lazy CDF | |
# Isn't totally accurate because doesn't include names not in the dataset | |
total_popularity %>% | |
ggplot(aes(x = total, color = sex)) + | |
stat_ecdf() + | |
scale_x_log10("Number names, ranked by popularity", labels = scales::label_number_si()) + | |
scale_y_continuous("Cumulative % of births", labels = scales::percent) + | |
ggtitle("CDF of Name Popularity, All Time") | |
img_height <- 3 | |
ggsave("fig/rankednames_cumulativepercent_gender.png", | |
units = "in", | |
height = img_height, width = (16 / 9) * img_height) | |
# CDFs | |
babynames %>% | |
filter(year >= 1920) %>% | |
filter(year %% 20 == 0) %>% | |
ggplot(aes(x = n, color = as.factor(year))) + | |
stat_ecdf() + | |
scale_x_log10(labels = scales::label_number_si()) + | |
scale_y_continuous(labels = scales::percent) + | |
facet_wrap(~ sex, nrow = 2) | |
babynames %>% | |
inner_join(., births) %>% | |
mutate(prop2 = n / if_else(sex == "M", births * (1.05 / 2.05), births / 2.05)) %>% | |
arrange(-year) | |
# CDF of all names, mixing genders | |
# Using overall births number | |
babynames %>% | |
inner_join(., births) %>% | |
mutate(prop = n / births) %>% | |
arrange(-prop) %>% | |
mutate(cumulative_prop = cumsum(prop), rank = row_number()) %>% | |
NULL | |
# CDF of name popularity by year, discrete decades | |
babynames %>% | |
group_by(year, sex) %>% | |
arrange(-prop) %>% | |
mutate(cumulative_prop = cumsum(prop), rank = row_number()) %>% | |
ungroup() %>% | |
arrange(year) %>% | |
filter(year >= 1920) %>% | |
filter(year %% 10 == 0) %>% | |
arrange(-year) %>% | |
mutate(year = as.factor(year)) %>% | |
ggplot(aes(x = rank, y = cumulative_prop, color = year)) + | |
# geom_line(size = 1.5, alpha = 0.5) + | |
geom_step(size = 0.75, alpha = 0.8) + | |
scale_x_log10("Number names, ranked by popularity", labels = scales::label_number_si()) + | |
scale_y_continuous("Cumulative % of births", labels = scales::percent) + | |
# scale_color_brewer(palette = "RdYlBu") + | |
scale_color_viridis_d(option = "plasma") + | |
facet_wrap(~ sex, nrow = 2) + | |
# facet_wrap(~ sex) + # Rows is better, easier to compare F to M | |
theme_dark() + | |
# theme_gray() + | |
NULL | |
img_height <- 5 | |
ggsave("fig/rankednames_cumulativepercent_decadediscrete_gender.png", | |
units = "in", | |
height = img_height, width = (12 / 9) * img_height) | |
# CDF of name popularity by year, all years | |
babynames %>% | |
group_by(year, sex) %>% | |
arrange(-prop) %>% | |
mutate(cumulative_prop = cumsum(prop), rank = row_number()) %>% | |
ungroup() %>% | |
arrange(year) %>% | |
filter(year >= 1920) %>% | |
ggplot(aes(x = rank, y = cumulative_prop, color = year, group = year)) + | |
geom_line(size = 1.5, alpha = 0.3) + | |
# geom_step(size = 0.5) + | |
scale_x_log10("Number names, ranked by popularity", labels = scales::label_number_si()) + | |
scale_y_continuous("Cumulative % of births", labels = scales::percent) + | |
scale_color_distiller(palette = "RdYlBu") + | |
facet_wrap(~ sex, nrow = 2) + | |
# facet_wrap(~ sex) + # Rows is better, easier to compare F to M | |
theme_dark() + | |
# theme_gray() + | |
NULL | |
img_height <- 3 | |
ggsave("fig/rankednames_cumulativepercent_yearcontinuous_gender.png", | |
units = "in", | |
height = img_height, width = (16 / 9) * img_height) | |
################################################################################ | |
## Tweets Day 4 | |
bucketed_weight <- babynames %>% | |
group_by(year, sex) %>% | |
arrange(-n) %>% | |
mutate(rank = row_number(), | |
bucket = floor(log10(rank + .01)) + 1) %>% | |
ungroup() %>% | |
group_by(year, sex, bucket) %>% | |
summarize(total_prop = sum(prop)) %>% | |
# mutate(bucket = if_else(bucket == 5, 4, bucket)) %>% | |
group_by(year, sex) %>% | |
mutate(total_prop = if_else(bucket == 4, 1 - sum(if_else(bucket <= 3, total_prop, 0)), total_prop)) %>% | |
ungroup() %>% | |
filter(bucket < 5) %>% | |
mutate(bucket = 10^bucket, | |
bucket = as.factor(bucket), | |
bucket = fct_rev(bucket)) | |
bucketed_weight %>% | |
tail(10) | |
bucketed_weight %>% | |
filter(year >= 1920) %>% | |
ggplot(aes(x = year, y = total_prop, fill = bucket)) + | |
geom_area() + | |
scale_y_continuous("% of births", labels = scales::percent) + | |
xlab("") + | |
facet_wrap(~ sex, nrow = 2) + | |
guides(fill=guide_legend(title="Name Rank")) + | |
ggtitle("% of Births in Top N Ranked Names") + | |
theme_minimal() | |
img_height <- 6 | |
ggsave("fig/year_totalpercent_fillbypopularity.png", | |
units = "in", | |
height = img_height, width = (3 / 4) * img_height) | |
bucketed_weight %>% | |
mutate(bucket = fct_rev(bucket)) %>% | |
filter(year >= 1920) %>% | |
ggplot(aes(x = year, y = total_prop, color = sex)) + | |
geom_line() + | |
scale_y_continuous("% of births", labels = scales::percent) + | |
xlab("") + | |
facet_wrap(~bucket) | |
img_height <- 3 | |
ggsave("fig/year_totalpercent_linebygender_facetbybucket.png", | |
units = "in", | |
height = img_height, width = (16 / 9) * img_height) | |
babynames %>% | |
filter(year == 2017) %>% | |
group_by(year, sex) %>% | |
arrange(-prop) %>% | |
mutate(cumulative_prop = cumsum(prop), rank = row_number()) %>% | |
ungroup() %>% | |
filter(rank %in% c(1, 10, 100, 1000, 10000)) %>% | |
arrange(rank, sex) | |
median_names <- babynames %>% | |
group_by(year, sex) %>% | |
arrange(-prop) %>% | |
mutate(cumulative_prop = cumsum(prop), rank = row_number()) %>% | |
ungroup() %>% | |
arrange(year, sex, cumulative_prop) %>% | |
filter(cumulative_prop >= .5) %>% | |
group_by(year, sex) %>% | |
slice_min(cumulative_prop) | |
median_names %>% | |
ggplot(aes(x = year, y = rank, color = sex)) + | |
geom_line() + | |
geom_text( | |
aes(x = year, y = rank, label = name), | |
nudge_y = 5, | |
data = filter(median_names, (year %% 10 == 0) || year == 2017) | |
) + | |
scale_y_continuous("Median Name Rank") + | |
xlab("") + | |
NULL | |
img_height <- 6 | |
ggsave("fig/year_mediannamerank_gender_withnames.png", | |
units = "in", | |
height = img_height, width = (16 / 9) * img_height) | |
babynames %>% | |
filter(name == "Eva", sex == "F") %>% | |
ggplot(aes(x=year, y = n)) + geom_line() | |
babynames %>% | |
filter(name == "James", sex == "M") %>% | |
ggplot(aes(x=year, y = n)) + geom_line() | |
babynames %>% | |
filter(year == 2017) %>% | |
group_by(year, sex) %>% | |
arrange(-prop) %>% | |
mutate(cumulative_prop = cumsum(prop), rank = row_number()) %>% | |
ungroup() %>% | |
filter(rank > 0, rank < 400) %>% | |
filter(sex == "M") %>% | |
arrange(rank, sex) %>% | |
View | |
babynames %>% | |
filter(year == 2017) %>% | |
group_by(year, sex) %>% | |
arrange(-prop) %>% | |
mutate(cumulative_prop = cumsum(prop), rank = row_number()) %>% | |
filter(name == "Bowen") | |
babynames %>% | |
filter(sex == "M") %>% | |
filter(year > 1990) %>% | |
group_by(name) %>% | |
summarize(total = sum(n)) %>% | |
arrange(-total) %>% | |
View |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment