Created
October 3, 2023 09:09
-
-
Save nruigrok/c5071d3d71bea38708726dd30e0fa48f to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(tidytext)
library(tidyverse)
#' Compute the KDE smoothing of the occurrence of 'target' in the 'tokens'
#'
#' Runs a Gaussian kernel density estimate over the positions at which
#' `target` occurs in `tokens`, evaluated on `n` equally spaced points
#' between 1 and `length(tokens)`.
#'
#' @param tokens a character vector of words in the corpus
#' @param target a word to look for in the corpus
#' @param n the number of points to sample
#' @param bw the bandwidth of the smoothing, passed to `density()` as `bw`
#' @return a numeric vector of length `n`: the estimated density at each
#'   sample point multiplied by `length(tokens) / n`; all zeros when
#'   `target` does not occur in `tokens`
kde <- function(tokens, target, n = 1000, bw = 5000) {
  positions <- which(tokens == target)

  # No occurrences: there is nothing to smooth, so the curve is flat zero.
  if (length(positions) == 0L) {
    return(numeric(n))
  }

  # Forward `n` and `bw` rather than hard-coding 1000 / 5000, so the result
  # always has the length and smoothing the caller asked for.
  d <- density(
    positions,
    from = 1, to = length(tokens),
    n = n, kernel = "gaussian", bw = bw
  )

  # Rescale the density by the number of tokens per sample point.
  # NOTE(review): the result is not multiplied by the number of matches, so
  # it looks like a per-interval share of the matches rather than a raw
  # count -- confirm this is the intended scale.
  d$y / n * length(tokens)
}
# Load the full text of the book into a one-row tibble (column `text`).
t <- tibble(text = read_file("book-war-and-peace.txt"))

# Split the text into one row per word (column `word`) using the defaults of
# unnest_tokens(), and number each word by its position (column `offset`).
tokens <- t |>
  unnest_tokens(word, text) |>
  mutate(offset = seq_along(word))
#' Build a plot-ready table with the smoothed occurrence curve of one word
#'
#' Reads the word column of the global `tokens` table defined earlier in
#' this script and smooths the occurrences of `target` with `kde()`.
#'
#' @param target the word to look for
#' @param n the number of points to sample
#' @param bw the bandwidth of the smoothing, passed on to `kde()`
#' @return a tibble with columns `x` (word offset of each sample point),
#'   `word` (the target, repeated) and `y` (the values returned by `kde()`)
get_kde <- function(target, n = 1000, bw = 5000) {
  words <- tokens$word
  tibble(
    # kde() evaluates the density on equally spaced points running from 1 to
    # length(tokens); use that same grid so `x` is a real word offset rather
    # than just the index of the sample point.
    x = seq(1, length(words), length.out = n),
    word = target,
    y = kde(words, target, n = n, bw = bw)
  )
}
# Strongly prefer fixed over scientific notation when numbers are printed.
options(scipen = 999)

# Compute one smoothed occurrence curve per word, stack them into a single
# table, and draw them as colored lines on a shared axis.
c("napoleon", "war", "military", "order", "general") |>
  map(get_kde) |>
  list_rbind() |>
  ggplot(aes(x = x, y = y, color = word)) +
  geom_line() +
  scale_color_manual(
    values = c(
      "napoleon" = "blue", war = "darkgreen", military = "red",
      order = "cyan", general = "magenta"
    ),
    name = ""
  ) +
  theme_classic() +
  scale_y_continuous() +
  xlab("Word Offset") +
  ylab("Number of Occurrences") +
  ggtitle("War")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment