Skip to content

Instantly share code, notes, and snippets.

@nruigrok
Created October 3, 2023 18:18
Show Gist options
  • Save nruigrok/5062228755c4330650e4302755fedfe7 to your computer and use it in GitHub Desktop.
Save nruigrok/5062228755c4330650e4302755fedfe7 to your computer and use it in GitHub Desktop.
library(tidytext)
library(tidyverse)
#' Compute the KDE smoothing of the occurrence of 'target' in the 'tokens'
#'
#' @param tokens a character vector of words in the corpus
#' @param target a word to look for in the corpus
#' @param n the number of points to sample
#' @param bw the bandwidth of the smoothing
kde <- function(tokens, target, n=1000, bw=5000) {
d = density(which(tokens == target), from=1, to=length(tokens),
n=1000, window = "gaussian", bw = 5000)
d$y / n * length(tokens)
}
t = tibble(text=read_file("book-war-and-peace.txt"))
tokens = unnest_tokens(t, word, text) |> mutate(offset=seq_along(word))
get_kde <- function(target, n=1000, bw=5000) {
tibble(x=1:n, word=target, y=kde(tokens$word, target, n=n, bw=bw))
}
options(scipen = 999)
map(c("napoleon", "war", "military", "order", "general"), get_kde) |>
list_rbind() |>
ggplot(aes(x=x, y=y, color=word)) + geom_line() +
scale_color_manual(values=c("napoleon"="blue", war="darkgreen", military="red", order="cyan", general="magenta"), name="") +
theme_classic() +
scale_y_continuous() + xlab("Word Offset") + ylab("Number of Occurrences") + ggtitle("War")
words <- tokens |> group_by(word) |> summarize(n=n()) |> arrange(-n) |> pull(word)
topwords = setdiff(words, stop_words$word) |> head(1000)
kdes <- set_names(topwords, topwords) |>
map(function(target) kde(tokens$word, target, n=1000, bw=2000), .progress=TRUE)
rbind(kdes)
distances = do.call(rbind, kdes) |>
dist(method='manhattan') |>
as.matrix() |>
as_tibble(rownames="word1") |>
pivot_longer(-word1, names_to = "word2")
distances |> filter(word1 == "napoleon") |> arrange(value)
library(abdiv)
combinations = combn(topwords, 2, simplify = FALSE)
head(combinations)
pivot_longer(as_tibble(d, rownames="word"), everything())
get_bc <- function(word1, word2) {
tibble(word1=word1, word2=word2,
dist=bray_curtis(kdes[[word1]], kdes[[word2]]))
}
distances <- map(combinations, function(words) get_bc(words[1], words[2]), .progress=TRUE) |> list_rbind()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment