Created
October 3, 2023 18:18
-
-
Save nruigrok/5062228755c4330650e4302755fedfe7 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(tidytext) | |
library(tidyverse) | |
#' Compute the KDE smoothing of the occurrence of 'target' in the 'tokens' | |
#' | |
#' @param tokens a character vector of words in the corpus | |
#' @param target a word to look for in the corpus | |
#' @param n the number of points to sample | |
#' @param bw the bandwidth of the smoothing | |
kde <- function(tokens, target, n=1000, bw=5000) { | |
d = density(which(tokens == target), from=1, to=length(tokens), | |
n=1000, window = "gaussian", bw = 5000) | |
d$y / n * length(tokens) | |
} | |
t = tibble(text=read_file("book-war-and-peace.txt")) | |
tokens = unnest_tokens(t, word, text) |> mutate(offset=seq_along(word)) | |
get_kde <- function(target, n=1000, bw=5000) { | |
tibble(x=1:n, word=target, y=kde(tokens$word, target, n=n, bw=bw)) | |
} | |
options(scipen = 999) | |
map(c("napoleon", "war", "military", "order", "general"), get_kde) |> | |
list_rbind() |> | |
ggplot(aes(x=x, y=y, color=word)) + geom_line() + | |
scale_color_manual(values=c("napoleon"="blue", war="darkgreen", military="red", order="cyan", general="magenta"), name="") + | |
theme_classic() + | |
scale_y_continuous() + xlab("Word Offset") + ylab("Number of Occurrences") + ggtitle("War") | |
words <- tokens |> group_by(word) |> summarize(n=n()) |> arrange(-n) |> pull(word) | |
topwords = setdiff(words, stop_words$word) |> head(1000) | |
kdes <- set_names(topwords, topwords) |> | |
map(function(target) kde(tokens$word, target, n=1000, bw=2000), .progress=TRUE) | |
rbind(kdes) | |
distances = do.call(rbind, kdes) |> | |
dist(method='manhattan') |> | |
as.matrix() |> | |
as_tibble(rownames="word1") |> | |
pivot_longer(-word1, names_to = "word2") | |
distances |> filter(word1 == "napoleon") |> arrange(value) | |
library(abdiv) | |
combinations = combn(topwords, 2, simplify = FALSE) | |
head(combinations) | |
pivot_longer(as_tibble(d, rownames="word"), everything()) | |
get_bc <- function(word1, word2) { | |
tibble(word1=word1, word2=word2, | |
dist=bray_curtis(kdes[[word1]], kdes[[word2]])) | |
} | |
distances <- map(combinations, function(words) get_bc(words[1], words[2]), .progress=TRUE) |> list_rbind() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment