Skip to content

Instantly share code, notes, and snippets.

Created July 7, 2021 09:02
Show Gist options
  • Save gongcastro/34b7131ed98a463da9d8d4c88a77e03b to your computer and use it in GitHub Desktop.
Save gongcastro/34b7131ed98a463da9d8d4c88a77e03b to your computer and use it in GitHub Desktop.
Custom function to extract lexical frequencies from the CHILDES corpora.
# extract lexical frequencies from CHILDES
# you may need to install the following packages:
# install.packages(c("dplyr", "stringr", "tidyr", "childesr"))
# Look up lexical frequencies of word forms in the CHILDES corpora.
#
# The body uses dplyr, stringr, tidyr and childesr functions (and the `%>%`
# pipe) unqualified, so those packages must be attached before calling it.
#
# Arguments:
#   token      word form(s) to look up, e.g. c("table", "mesa")
#   languages  languages in which to look up the word form(s)
#   ...        other arguments, forwarded to get_speaker_statistics()
#              (see ?childesr::get_speaker_statistics)
#
# Returns a table with one row per word and language and the columns
#   word, test_language, freq_counts, freq_per_million, freq_zipf
# where freq_per_million = freq_counts / total tokens in that language * 1e6
# and freq_zipf = log10(freq_per_million) + 3.
get_childes_frequency <- function(
  token, # word(s) form to look up, e.g. c("table", "mesa")
  languages = c("cat", "spa"), # languages in which to look up the word form
  ... # other arguments (see ?childesr::get_speaker_statistics)
) {
  # an empty `languages` would collapse to an empty regular expression below
  stopifnot(is.character(languages), length(languages) > 0)

  # one alternation pattern shared by both language filters
  language_pattern <- paste(languages, collapse = "|")

  # get total number of tokens in each language
  # The `language` column can hold several space-separated codes in one
  # string, so totals are first summed per string, then the string is split
  # and the totals are re-aggregated per individual code.
  # NOTE(review): after splitting, codes outside `languages` that co-occur
  # with a requested one are kept -- confirm whether they should be dropped.
  total_counts <- get_speaker_statistics(...) %>%
    filter(str_detect(language, language_pattern)) %>%
    group_by(language) %>%
    # na.rm = TRUE keeps a single missing count from wiping out the whole
    # group, consistent with the second aggregation below
    summarise(num_tokens = sum(num_tokens, na.rm = TRUE), .groups = "drop") %>%
    mutate(language = str_split(language, " ")) %>%
    unnest(cols = language) %>%
    group_by(language) %>%
    summarise(n = sum(num_tokens, na.rm = TRUE), .groups = "drop")

  # absolute frequency (raw counts)
  # glosses are lower-cased first so that case variants are counted together
  freq_counts <- get_tokens(
    role = "target_child",
    token = token,
    language = languages
  ) %>%
    mutate(gloss = str_to_lower(gloss)) %>%
    filter(str_detect(language, language_pattern)) %>%
    count(gloss, language) %>%
    mutate(language = str_split(language, " ")) %>%
    unnest(language) %>%
    group_by(language, gloss) %>%
    summarise(freq_counts = sum(n), .groups = "drop")

  # relative frequency (counts per million)
  freq_million <- freq_counts %>%
    left_join(total_counts, by = "language") %>%
    mutate(
      freq_per_million = freq_counts / n * 1e6,
      freq_zipf = log10(freq_per_million) + 3
    ) %>%
    rename(word = gloss, test_language = language) %>%
    select(word, test_language, starts_with("freq_"))

  freq_million
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment