nruigrok/conceptmap.R

## conceptmap.R
library(tidytext)
library(tidyverse)

#' Compute the KDE smoothing of the occurrence of 'target' in the 'tokens'
#'
#' @param tokens a character vector of words in the corpus
#' @param target a word to look for in the corpus
#' @param n the number of points to sample
#' @param bw the bandwidth of the smoothing
kde <- function(tokens, target, n=1000, bw=5000) {
   d = density(which(tokens == target), from=1, to=length(tokens),
          n=1000, window = "gaussian", bw = 5000)
   d$y / n * length(tokens)
}


t = tibble(text=read_file("book-war-and-peace.txt"))
tokens = unnest_tokens(t, word, text) |> mutate(offset=seq_along(word))

get_kde <- function(target, n=1000, bw=5000) {
  tibble(x=1:n, word=target, y=kde(tokens$word, target, n=n, bw=bw))
}

options(scipen = 999)
map(c("napoleon", "war", "military", "order", "general"), get_kde) |>
  list_rbind() |>
  ggplot(aes(x=x, y=y, color=word)) + geom_line() +
  scale_color_manual(values=c("napoleon"="blue", war="darkgreen", military="red", order="cyan", general="magenta"), name="") +
  theme_classic() +
  scale_y_continuous() + xlab("Word Offset") + ylab("Number of Occurrences") + ggtitle("War")


words <- tokens |> group_by(word) |> summarize(n=n()) |> arrange(-n) |> pull(word)
topwords = setdiff(words, stop_words$word) |> head(1000)


kdes <- set_names(topwords, topwords) |>
  map(function(target) kde(tokens$word, target, n=1000, bw=2000), .progress=TRUE)

rbind(kdes)
distances = do.call(rbind, kdes) |>
  dist(method='manhattan') |>
  as.matrix() |>
  as_tibble(rownames="word1") |>
  pivot_longer(-word1, names_to = "word2")

distances |>  filter(word1 == "napoleon") |> arrange(value)


library(abdiv)
combinations = combn(topwords, 2, simplify = FALSE)
head(combinations)
pivot_longer(as_tibble(d, rownames="word"), everything())
get_bc <- function(word1, word2) {
  tibble(word1=word1, word2=word2,
         dist=bray_curtis(kdes[[word1]], kdes[[word2]]))
}

distances <- map(combinations, function(words) get_bc(words[1], words[2]), .progress=TRUE) |> list_rbind()
	library(tidytext)
	library(tidyverse)

	#' Compute the KDE smoothing of the occurrence of 'target' in the 'tokens'
	#'
	#' @param tokens a character vector of words in the corpus
	#' @param target a word to look for in the corpus
	#' @param n the number of points to sample
	#' @param bw the bandwidth of the smoothing
	kde <- function(tokens, target, n=1000, bw=5000) {
	d = density(which(tokens == target), from=1, to=length(tokens),
	n=1000, window = "gaussian", bw = 5000)
	d$y / n * length(tokens)
	}


	t = tibble(text=read_file("book-war-and-peace.txt"))
	tokens = unnest_tokens(t, word, text) \|> mutate(offset=seq_along(word))

	get_kde <- function(target, n=1000, bw=5000) {
	tibble(x=1:n, word=target, y=kde(tokens$word, target, n=n, bw=bw))
	}

	options(scipen = 999)
	map(c("napoleon", "war", "military", "order", "general"), get_kde) \|>
	list_rbind() \|>
	ggplot(aes(x=x, y=y, color=word)) + geom_line() +
	scale_color_manual(values=c("napoleon"="blue", war="darkgreen", military="red", order="cyan", general="magenta"), name="") +
	theme_classic() +
	scale_y_continuous() + xlab("Word Offset") + ylab("Number of Occurrences") + ggtitle("War")



	words <- tokens \|> group_by(word) \|> summarize(n=n()) \|> arrange(-n) \|> pull(word)
	topwords = setdiff(words, stop_words$word) \|> head(1000)


	kdes <- set_names(topwords, topwords) \|>
	map(function(target) kde(tokens$word, target, n=1000, bw=2000), .progress=TRUE)

	rbind(kdes)
	distances = do.call(rbind, kdes) \|>
	dist(method='manhattan') \|>
	as.matrix() \|>
	as_tibble(rownames="word1") \|>
	pivot_longer(-word1, names_to = "word2")

	distances \|> filter(word1 == "napoleon") \|> arrange(value)


	library(abdiv)
	combinations = combn(topwords, 2, simplify = FALSE)
	head(combinations)
	pivot_longer(as_tibble(d, rownames="word"), everything())
	get_bc <- function(word1, word2) {
	tibble(word1=word1, word2=word2,
	dist=bray_curtis(kdes[[word1]], kdes[[word2]]))
	}

	distances <- map(combinations, function(words) get_bc(words[1], words[2]), .progress=TRUE) \|> list_rbind()