# nruigrok/kde.R

## kde.R
library(tidytext)
library(tidyverse)

#' Compute the KDE smoothing of the occurrence of 'target' in the 'tokens'
#'
#' Runs a Gaussian kernel density estimate over the positions at which
#' `target` occurs in `tokens`, evaluated on `n` equally spaced points between
#' position 1 and `length(tokens)`.
#'
#' @param tokens a character vector of words in the corpus
#' @param target a word to look for in the corpus
#' @param n the number of points to sample
#' @param bw the bandwidth of the smoothing
#' @return a numeric vector of length `n`: the estimated density at each
#'   sample point, multiplied by `length(tokens) / n`
kde <- function(tokens, target, n = 1000, bw = 5000) {
  # Positions (1-based indices) at which the target word occurs.
  positions <- which(tokens == target)
  # n and bw are forwarded to density() so the caller's values take effect.
  d <- density(positions, from = 1, to = length(tokens),
               n = n, kernel = "gaussian", bw = bw)
  # Rescale the density by the approximate number of tokens per sample point.
  d$y / n * length(tokens)
}


# Read the whole book into a one-row tibble, then split the `text` column into
# one token per row (column `word`) and number the rows in `offset`.
t <- tibble(text = read_file("book-war-and-peace.txt"))
tokens <- t |>
  unnest_tokens(word, text) |>
  mutate(offset = seq_along(word))

#' Build a tibble holding the KDE curve of one word over the global `tokens`
#'
#' @param target the word to look for in `tokens$word`
#' @param n the number of sample points, forwarded to `kde()`
#' @param bw the smoothing bandwidth, forwarded to `kde()`
#' @return a tibble with columns `x` (sample index from 1 to `n`), `word`
#'   (the target) and `y` (the values returned by `kde()`)
get_kde <- function(target, n = 1000, bw = 5000) {
  # NOTE(review): x is the sample index (1..n), not a token position, although
  # the plot in this file labels the axis "Word Offset" -- confirm intent.
  tibble(
    x = seq_len(n),
    word = target,
    y = kde(tokens$word, target, n = n, bw = bw)
  )
}

# Strongly prefer fixed over scientific notation when numbers are formatted.
options(scipen = 999)

# Compute one KDE curve per word, stack the curves into a single tibble and
# draw one coloured line per word.
c("napoleon", "war", "military", "order", "general") |>
  map(get_kde) |>
  list_rbind() |>
  ggplot(aes(x = x, y = y, color = word)) +
  geom_line() +
  scale_color_manual(
    values = c(
      "napoleon" = "blue",
      war = "darkgreen",
      military = "red",
      order = "cyan",
      general = "magenta"
    ),
    name = ""
  ) +
  theme_classic() +
  scale_y_continuous() +
  xlab("Word Offset") +
  ylab("Number of Occurrences") +
  ggtitle("War")
	library(tidytext)
	library(tidyverse)

	#' Compute the KDE smoothing of the occurrence of 'target' in the 'tokens'
	#'
	#' Runs a Gaussian kernel density estimate over the positions at which
	#' `target` occurs in `tokens`, evaluated on `n` equally spaced points
	#' between position 1 and `length(tokens)`.
	#'
	#' @param tokens a character vector of words in the corpus
	#' @param target a word to look for in the corpus
	#' @param n the number of points to sample
	#' @param bw the bandwidth of the smoothing
	#' @return a numeric vector of length `n`: the estimated density at each
	#'   sample point, multiplied by `length(tokens) / n`
	kde <- function(tokens, target, n = 1000, bw = 5000) {
	  # Positions (1-based indices) at which the target word occurs.
	  positions <- which(tokens == target)
	  # n and bw are forwarded to density() so the caller's values take effect.
	  d <- density(positions, from = 1, to = length(tokens),
	               n = n, kernel = "gaussian", bw = bw)
	  # Rescale the density by the approximate number of tokens per sample point.
	  d$y / n * length(tokens)
	}


	# Read the whole book into a one-row tibble, then split the `text` column
	# into one token per row (column `word`) and number the rows in `offset`.
	t <- tibble(text = read_file("book-war-and-peace.txt"))
	tokens <- t |>
	  unnest_tokens(word, text) |>
	  mutate(offset = seq_along(word))

	#' Build a tibble holding the KDE curve of one word over the global `tokens`
	#'
	#' @param target the word to look for in `tokens$word`
	#' @param n the number of sample points, forwarded to `kde()`
	#' @param bw the smoothing bandwidth, forwarded to `kde()`
	#' @return a tibble with columns `x` (sample index from 1 to `n`), `word`
	#'   (the target) and `y` (the values returned by `kde()`)
	get_kde <- function(target, n = 1000, bw = 5000) {
	  # NOTE(review): x is the sample index (1..n), not a token position,
	  # although the plot in this file labels the axis "Word Offset" -- confirm.
	  tibble(
	    x = seq_len(n),
	    word = target,
	    y = kde(tokens$word, target, n = n, bw = bw)
	  )
	}

	# Strongly prefer fixed over scientific notation when numbers are formatted.
	options(scipen = 999)

	# Compute one KDE curve per word, stack the curves into a single tibble and
	# draw one coloured line per word.
	map(c("napoleon", "war", "military", "order", "general"), get_kde) |>
	  list_rbind() |>
	  ggplot(aes(x = x, y = y, color = word)) +
	  geom_line() +
	  scale_color_manual(
	    values = c(
	      "napoleon" = "blue",
	      war = "darkgreen",
	      military = "red",
	      order = "cyan",
	      general = "magenta"
	    ),
	    name = ""
	  ) +
	  theme_classic() +
	  scale_y_continuous() +
	  xlab("Word Offset") +
	  ylab("Number of Occurrences") +
	  ggtitle("War")