Skip to content

Instantly share code, notes, and snippets.

@nruigrok
Created October 3, 2023 09:09
Show Gist options
  • Save nruigrok/c5071d3d71bea38708726dd30e0fa48f to your computer and use it in GitHub Desktop.
Save nruigrok/c5071d3d71bea38708726dd30e0fa48f to your computer and use it in GitHub Desktop.
library(tidytext)
library(tidyverse)
#' Compute the KDE smoothing of the occurrence of 'target' in the 'tokens'
#'
#' Runs a Gaussian kernel density estimate over the positions at which
#' `target` occurs in `tokens`, evaluated at `n` equally spaced points from
#' position 1 to position `length(tokens)`, and rescales the resulting
#' density by `length(tokens) / n`.
#'
#' @param tokens a character vector of words in the corpus
#' @param target a word to look for in the corpus
#' @param n the number of points to sample
#' @param bw the bandwidth of the smoothing (standard deviation of the
#'   Gaussian kernel, expressed in token positions)
#' @return a numeric vector of length `n`; all zeros when `target` never
#'   occurs in `tokens`
kde <- function(tokens, target, n = 1000, bw = 5000) {
  positions <- which(tokens == target)
  # A word that never occurs has nothing to smooth: return a flat zero curve
  # instead of asking density() to estimate from an empty sample.
  if (length(positions) == 0L) {
    return(numeric(n))
  }
  # Forward the caller's `n` and `bw` (they used to be hard-coded here), so
  # the result length always matches the `n` used in the rescaling below.
  d <- density(
    positions,
    from = 1,
    to = length(tokens),
    n = n,
    kernel = "gaussian",
    bw = bw
  )
  d$y / n * length(tokens)
}
# Load the whole book as a single string held in a one-row tibble.
t <- tibble(text = read_file("book-war-and-peace.txt"))
# Tokenize the text into a `word` column and number every token by its
# position in the book.
tokens <- t |>
  unnest_tokens(word, text) |>
  mutate(offset = seq_along(word))
#' Build the smoothed occurrence curve of one word as a plottable tibble
#'
#' Looks `target` up in the script-level `tokens` table and smooths its
#' occurrences with `kde()`.
#'
#' @param target the word whose occurrences are smoothed
#' @param n the number of points to sample
#' @param bw the bandwidth of the smoothing
#' @return a tibble with `n` rows and columns `x` (word offset of each sample
#'   point), `word` (the target) and `y` (the smoothed value from `kde()`)
get_kde <- function(target, n = 1000, bw = 5000) {
  words <- tokens$word
  tibble(
    # kde() evaluates the density at n equally spaced offsets running from 1
    # to length(words); report those offsets so that x really is a word
    # offset rather than the index of the sample point.
    x = seq(1, length(words), length.out = n),
    word = target,
    y = kde(words, target, n = n, bw = bw)
  )
}
# Strongly prefer fixed over scientific notation when numbers are formatted.
options(scipen = 999)
# Compute one smoothed curve per word, stack them into a single table and
# draw each word as its own coloured line.
c("napoleon", "war", "military", "order", "general") |>
  map(get_kde) |>
  list_rbind() |>
  ggplot(aes(x = x, y = y, color = word)) +
  geom_line() +
  scale_color_manual(
    name = "",
    values = c(
      napoleon = "blue",
      war = "darkgreen",
      military = "red",
      order = "cyan",
      general = "magenta"
    )
  ) +
  theme_classic() +
  scale_y_continuous() +
  xlab("Word Offset") +
  ylab("Number of Occurrences") +
  ggtitle("War")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment