Last active
December 26, 2021 19:35
-
-
Save THargreaves/eb2d0a50d4aeaca65ed26e74182358e6 to your computer and use it in GitHub Desktop.
Which letter pairs that are next to each other in the alphabet actually appear together in the English language?
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Packages: the Jane Austen corpus, tidy text tokenising, and the tidyverse
# for data wrangling and plotting.
library(janeaustenr)
library(tidytext)
library(tidyverse)

# Apply the black-and-white ggplot2 theme to every plot made in this script.
theme_set(theme_bw())
# Tokenise the novels returned by austen_books() into one word per row.
#
# Roman numerals are stripped from the raw text *before* tokenising.
# unnest_tokens() lower-cases its output by default, so a token-level filter
# on the upper-case class "[IVXLCDM]+" can never match and removes nothing;
# had it matched, the unanchored pattern would have discarded every word that
# merely contains one of those letters.
#
# NOTE(review): assumes roman numerals only occur as the number in
# "CHAPTER <n>" / "VOLUME <n>" heading lines -- confirm against the corpus.
cleaned_words <- austen_books() |>
  mutate(
    text = str_replace(
      text,
      # Whole-line heading: keep the heading word (group 1), drop the numeral
      regex(
        "^((?:chapter|volume)\\s+)[ivxlcdm]+\\.?\\s*$",
        ignore_case = TRUE
      ),
      "\\1"
    )
  ) |>
  unnest_tokens(word, text) |>
  select(-book)
# Break every word into single upper-case letters, tagging each letter with
# the id of the word (row of `cleaned_words`) it came from.
cleaned_letters <- cleaned_words |>
  mutate(word_id = row_number()) |>
  # The source `word` column is dropped by default once it has been split
  unnest_tokens(letter, word, token = "characters") |>
  mutate(letter = toupper(letter)) |>
  # Keep only the 26 letters A-Z (discards digits and other characters)
  filter(is.element(letter, LETTERS))
# Tally each letter (`n`) and express the tally as a share of all letters (`f`)
letter_counts <- cleaned_letters |>
  group_by(letter) |>
  summarise(n = n()) |>
  mutate(f = n / sum(n))
# Expected frequency of every ordered letter pair if the two letters were
# independent: the product of their individual frequencies in `letter_counts`.
#
# The cross product is built from an explicit grid of row indices rather than
# a self-join on a constant dummy key; the result (one row per ordered pair,
# first letter varying slowest, columns `pair` and `f`) is the same, without
# the many-to-many join warning newer dplyr versions raise for that workaround.
independence_frequencies <- expand_grid(
  i_1 = seq_len(nrow(letter_counts)),
  i_2 = seq_len(nrow(letter_counts))
) |>
  mutate(
    pair = str_c(letter_counts$letter[i_1], letter_counts$letter[i_2]),
    f = letter_counts$f[i_1] * letter_counts$f[i_2]
  ) |>
  select(pair, f)
# Observed counts and frequencies of every ordered pair of adjacent letters.
#
# Columns produced:
#   letter_1, letter_2, pair : the pair, as separate letters and as one string
#   n                        : number of times the pair occurs inside a word
#   exp_f                    : expected frequency under independence
#   f                        : observed share of all within-word pairs
#   rel_f                    : observed / expected frequency
pair_counts <- cleaned_letters |>
  # Pair each letter with its successor within the same word only
  group_by(word_id) |>
  mutate(letter_2 = lead(letter)) |>
  ungroup() |>
  # The last letter of a word has no successor
  filter(!is.na(letter_2)) |>
  count(letter_1 = letter, letter_2) |>
  # Add never-observed combinations with a true zero count. This must happen
  # after count(): completing the raw rows first would add one placeholder
  # row per missing pair, which count() would then report as n = 1.
  complete(letter_1, letter_2, fill = list(n = 0L)) |>
  mutate(pair = str_c(letter_1, letter_2), .after = letter_2) |>
  left_join(rename(independence_frequencies, exp_f = f), by = "pair") |>
  mutate(
    f = n / sum(n),
    rel_f = f / exp_f
  )
# The 25 alphabetically adjacent pairs: "AB", "BC", ..., "YZ"
letter_friends <- str_c(head(LETTERS, -1), tail(LETTERS, -1))
# Plot absolute and relative frequencies of the alphabetically adjacent pairs,
# one facet per measure, and save the figure to disk.
friends_plot <- pair_counts |>
  filter(pair %in% letter_friends) |>
  # Long format: one row per (pair, measure) so each measure gets its own facet
  pivot_longer(c(f, rel_f), names_to = "type", values_to = "frequency") |>
  mutate(type = case_when(
    type == "f" ~ "Absolute Frequencies",
    type == "rel_f" ~ "Relative Frequencies*"
  )) |>
  ggplot(aes(pair, frequency)) +
  geom_col(aes(fill = type), show.legend = FALSE) +
  # The two measures are on different scales, so free the y axis per facet
  facet_wrap(~type, ncol = 1, scales = "free_y") +
  labs(
    x = "Letter Pair ('friends' only)",
    y = "Frequency (out of all pairs)",
    title = "Fake Letter Friends",
    subtitle = str_c(
      "Which letter pairs that are next to each other in the",
      "alphabet actually appear together in the English language?",
      sep = " "
    ),
    caption = str_c(
      "Data Source: `janeaustenr`",
      "*Relative to expected frequencies if letters were independent",
      sep = "\n"
    )
  ) +
  scale_x_discrete(expand = c(0, 1)) +
  # Bars start at zero; leave 10% headroom above the tallest bar
  scale_y_continuous(expand = c(0, 0, 0.1, 0)) +
  theme(
    plot.title = element_text(face = "bold"),
    plot.subtitle = element_text(size = 9)
  )

# Display the plot, as the unassigned pipeline did when run at top level
friends_plot

# Pass the plot explicitly rather than relying on ggplot2's last_plot()
ggsave("fake_letter_friends.jpg", plot = friends_plot)
Author
THargreaves
commented
Dec 26, 2021
•
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment