Skip to content

Instantly share code, notes, and snippets.

@agricolamz
Created April 16, 2024 10:40
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save agricolamz/2b5102f7b89916fab21ba841fb1ceed5 to your computer and use it in GitHub Desktop.
Save agricolamz/2b5102f7b89916fab21ba841fb1ceed5 to your computer and use it in GitHub Desktop.
library(tidyverse)
# Read the play from "revizor.txt" into a tibble with one row per kept line.
# Every line has its whitespace normalised with str_squish(); lines starting
# with "ДЕЙСТВИЕ" or "Явление" (presumably act and scene headings) and lines
# that are empty after squishing are dropped. `id` numbers the surviving rows.
revizor <- read_lines("revizor.txt") |>
  str_squish() |>
  tibble(text = _) |>
  filter(
    !str_detect(text, "^ДЕЙСТВИЕ"),
    !str_detect(text, "^Явление"),
    text != ""
  ) |>
  # row_number() rather than 1:n(): when no rows survive the filter, 1:n()
  # evaluates to c(1, 0) and mutate() fails; row_number() yields integer(0).
  mutate(id = row_number())
# Width of the sliding window, in rows of `revizor`.
window <- 20

# Start row of every full window. The last full window starts at
# nrow(revizor) - window + 1, so that the final row of the text is included.
# seq_len() returns an empty vector when the text is shorter than one window,
# whereas 1:(n) would count downwards through 0 and negative indices.
indecies <- seq_len(max(nrow(revizor) - window + 1, 0))

# One document per window: the `window` consecutive rows starting at row i,
# joined with single spaces. `doc_id` is the window's start row.
result <- tibble(
  text = map_chr(
    indecies,
    \(i) str_c(revizor$text[i:(i + window - 1)], collapse = " ")
  ),
  doc_id = indecies
)
library(word2vec)
# Train a word2vec model on the lower-cased window texts using the CBOW
# architecture. To try skip-gram instead, pass type = "skip-gram".
model <- word2vec(tolower(result$text), type = "cbow")
# Embed every window with the trained model. The text is lower-cased here as
# well, so it matches the casing the model was trained on.
embeddings <- doc2vec(
  object = model,
  newdata = mutate(result, text = tolower(text)),
  type = "embedding"
)
library(uwot)
# Project the window embeddings with UMAP, using cosine distance and a PCA
# initialisation. The result is stored in a variable that is also called
# `umap`; the call is written as uwot::umap() so that it is clear which one is
# the function and which one is the data.
umap <- uwot::umap(X = embeddings, metric = "cosine", init = "pca")
# Plot the trajectory of the moving window through the UMAP plane and save it
# to "revizor.png". Points are connected in window order and coloured by that
# order (`id`).
#
# The two coordinates are taken from the matrix by position instead of calling
# as_tibble() on it: as_tibble() on a matrix without column names falls back
# to the deprecated compatibility names V1, V2 and emits a lifecycle warning.
revizor_plot <- tibble(
  umap_v1 = unname(umap[, 1]),
  umap_v2 = unname(umap[, 2]),
  # seq_len() instead of 1:n() so that an empty matrix gives an empty column.
  id = seq_len(nrow(umap))
) |>
  ggplot(aes(umap_v1, umap_v2, color = id)) +
  geom_path() +
  geom_point() +
  theme_minimal() +
  labs(
    title = "moving embedings: Ревизор",
    caption = str_glue("window: {window}")
  )

# Display the plot, as the original top-level expression did.
revizor_plot

# Pass the plot explicitly rather than relying on last_plot().
ggsave("revizor.png", plot = revizor_plot, bg = "white", height = 7, width = 7)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment