Skip to content

Instantly share code, notes, and snippets.

@THargreaves
Last active December 26, 2021 19:35
Show Gist options
  • Save THargreaves/eb2d0a50d4aeaca65ed26e74182358e6 to your computer and use it in GitHub Desktop.
Save THargreaves/eb2d0a50d4aeaca65ed26e74182358e6 to your computer and use it in GitHub Desktop.
Which letter pairs that are next to each other in the alphabet actually appear together in the English language?
library(janeaustenr)
library(tidytext)
library(tidyverse)
# Use the black-and-white theme for every plot produced by this script
theme_set(theme_bw())

# Tokenise the novels into one word per row
cleaned_words <- austen_books() |>
  # Keep the original case: unnest_tokens() lower-cases tokens by default,
  # which would make the upper-case roman-numeral pattern below never match
  unnest_tokens(word, text, to_lower = FALSE) |>
  # Remove roman numerals (e.g. chapter numbers): tokens consisting solely of
  # upper-case numeral characters. The pattern is anchored so that ordinary
  # capitalised words ("Mr", "Lady", ...) survive, and the pronoun "I" is kept
  # because it is far more common than the numeral
  filter(word == "I" | !str_detect(word, "^[IVXLCDM]+$")) |>
  select(-book)
# Split every word into its letters, one letter per row. Each letter carries
# the id of the word it came from, which is used later to keep letter pairs
# within a single word
cleaned_letters <- cleaned_words |>
  mutate(word_id = seq_len(n())) |>
  unnest_tokens(letter, word, token = "characters", drop = FALSE) |>
  mutate(letter = toupper(letter)) |>
  # Keep only the 26 unaccented letters (drops digits and anything else)
  filter(is.element(letter, LETTERS)) |>
  select(!word)
# Tally each letter and express the tallies as a share of all letters
letter_counts <- cleaned_letters |>
  count(letter) |>
  mutate(f = prop.table(n))
# Expected frequency of every ordered letter pair if the two letters were
# drawn independently: the product of the individual letter frequencies.
# The grid of pairs is built with expand_grid() rather than a self-join on a
# constant key, which newer dplyr versions flag as an unexpected many-to-many
# join. Row order (first letter varies slowest) matches the join's output.
independence_frequencies <- expand_grid(
  letter_1 = letter_counts$letter,
  letter_2 = letter_counts$letter
) |>
  mutate(
    pair = str_c(letter_1, letter_2),
    # Look up each letter's frequency by its position in letter_counts
    f = letter_counts$f[match(letter_1, letter_counts$letter)] *
      letter_counts$f[match(letter_2, letter_counts$letter)]
  ) |>
  select(pair, f)
# Observed counts and frequencies of consecutive letter pairs within words,
# alongside the frequency expected under independence (exp_f) and the ratio
# of observed to expected frequency (rel_f)
pair_counts <- cleaned_letters |>
  # Pair each letter with the next letter of the same word
  group_by(word_id) |>
  mutate(letter_2 = lead(letter)) |>
  ungroup() |>
  # The last letter of each word has no successor
  filter(!is.na(letter_2)) |>
  rename(letter_1 = letter) |>
  # Count BEFORE completing: completing first would add one placeholder row
  # per unseen pair, which count() would then report as n = 1 instead of 0
  count(letter_1, letter_2) |>
  complete(letter_1 = LETTERS, letter_2 = LETTERS, fill = list(n = 0L)) |>
  mutate(pair = str_c(letter_1, letter_2), .after = letter_2) |>
  left_join(rename(independence_frequencies, exp_f = f), by = "pair") |>
  mutate(
    f = n / sum(n),
    rel_f = f / exp_f
  )
# The 25 "friend" pairs of alphabet neighbours: "AB", "BC", ..., "YZ"
letter_friends <- str_c(head(LETTERS, -1), tail(LETTERS, -1))
# Plot absolute and relative frequencies of the "friend" pairs, one facet per
# measure, and save the figure
pair_counts |>
  filter(pair %in% letter_friends) |>
  # Stack the two measures into long form so each can get its own facet
  pivot_longer(c(f, rel_f), names_to = "type", values_to = "value") |>
  mutate(type = case_when(
    type == "f" ~ "Absolute Frequencies",
    type == "rel_f" ~ "Relative Frequencies*"
  )) |>
  ggplot(aes(pair, value)) +
  geom_col(aes(fill = type), show.legend = FALSE) +
  # The two measures are on different scales, so free the y axis per facet
  facet_wrap(~type, ncol = 1, scales = "free_y") +
  labs(
    x = "Letter Pair ('friends' only)",
    y = "Frequency (out of all pairs)",
    title = "Fake Letter Friends",
    subtitle = str_c(
      "Which letter pairs that are next to each other in the",
      "alphabet actually appear together in the English language?",
      sep = " "
    ),
    caption = str_c(
      "Data Source: `janeaustenr`",
      "*Relative to expected frequencies if letters were independent",
      sep = "\n"
    )
  ) +
  scale_x_discrete(expand = c(0, 1)) +
  # Bars start exactly at zero; leave 10% headroom above the tallest bar
  scale_y_continuous(expand = c(0, 0, 0.1, 0)) +
  theme(
    plot.title = element_text(face = "bold"),
    plot.subtitle = element_text(size = 9)
  )

# With no plot argument, ggsave() writes the most recently created plot
ggsave("fake_letter_friends.jpg")
@THargreaves
Copy link
Author

THargreaves commented Dec 26, 2021

fake_letter_friends

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment