Created
January 23, 2018 21:59
-
-
Save EmilHvitfeldt/f69a65436915ff8b0406cdd27a194e28 to your computer and use it in GitHub Desktop.
Visualizing trigrams with the Tidyverse
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(tidyverse) | |
library(tidytext) | |
library(purrrlyr) | |
str_nth_word <- function(x, n, sep = " ") { | |
str_split(x, pattern = " ") %>% | |
map_chr(~ .x[n]) | |
} | |
sigmoid <- function(x_from, x_to, y_from, y_to, scale = 5, n = 100) { | |
x <- seq(-scale, scale, length = n) | |
y <- exp(x) / (exp(x) + 1) | |
tibble(x = (x + scale) / (scale * 2) * (x_to - x_from) + x_from, | |
y = y * (y_to - y_from) + y_from) | |
} | |
egde_lines <- function(trigram, from_word, to_word, scale = 5, n = 50, | |
x_space = 0) { | |
from_word <- from_word %>% | |
select(-n) %>% | |
set_names(c("from", "y_from", "x_from")) | |
to_word <- to_word %>% | |
select(-n) %>% | |
set_names(c("to", "y_to", "x_to")) | |
links <- crossing(from = from_word$from, | |
to = to_word$to) %>% | |
mutate(word_pair = paste(from, to), | |
number = map_dbl(word_pair, | |
~ sum(str_detect(trigram$trigram, .x)))) %>% | |
left_join(from_word, by = "from") %>% | |
left_join(to_word, by = "to") | |
links %>% | |
by_row(~ sigmoid(x_from = .x$x_from + 0.2 + x_space, | |
x_to = .x$x_to - 0.05, | |
y_from = .x$y_from, y_to = .x$y_to, | |
scale = scale, n = n) %>% | |
mutate(word_pair = .x$word_pair, | |
number = .x$number, | |
from = .x$from)) %>% | |
pull(.out) %>% | |
bind_rows() | |
} | |
n_word <- 20 | |
n_top <- 150 | |
n_gramming <- 3 | |
trigrams <- tibble(text = janeaustenr::emma) %>% | |
unnest_tokens(trigram, text, token = "ngrams", n = n_gramming) | |
start_words <- c("he", "she") | |
pattern <- str_c("^", start_words, " ", collapse = "|") | |
top_words <- trigrams %>% | |
filter(str_detect(trigram, pattern)) %>% | |
count(trigram, sort = TRUE) %>% | |
slice(seq_len(n_top)) %>% | |
pull(trigram) | |
trigrams <- trigrams %>% | |
filter(trigram %in% top_words) | |
nodes <- map_df(seq_len(n_gramming), | |
~ trigrams %>% | |
mutate(word = str_nth_word(trigram, .x)) %>% | |
count(word, sort = TRUE) %>% | |
slice(seq_len(n_word)) %>% | |
mutate(y = seq(from = n_word + 1, to = 0, | |
length.out = n() + 2)[seq_len(n()) + 1], | |
x = .x)) | |
# egdes between first and second column | |
egde1 <- egde_lines(trigram = trigrams, | |
from_word = filter(nodes, x == 1), | |
to_word = filter(nodes, x == 2), | |
n = 50) %>% | |
filter(number > 0) %>% | |
mutate(id = word_pair) | |
# Words in second colunm | |
## That start with he | |
second_word_he <- nodes %>% | |
filter(x == 2) %>% | |
select(-n) %>% | |
left_join( | |
trigrams %>% | |
filter(str_nth_word(trigram, 1) == start_words[1]) %>% | |
mutate(word = str_nth_word(trigram, 2)) %>% | |
count(word), | |
by = "word" | |
) %>% | |
replace_na(list(n = 0)) | |
## That start with she | |
second_word_she <- nodes %>% | |
filter(x == 2) %>% | |
select(-n) %>% | |
left_join( | |
trigrams %>% | |
filter(str_nth_word(trigram, 1) == start_words[2]) %>% | |
mutate(word = str_nth_word(trigram, 2)) %>% | |
count(word), | |
by = "word" | |
) %>% | |
replace_na(list(n = 0)) | |
# Words in third colunm | |
## That start with he | |
third_word_he <- nodes %>% | |
filter(x == 3) %>% | |
select(-n) %>% | |
left_join( | |
trigrams %>% | |
filter(str_nth_word(trigram, 1) == start_words[1]) %>% | |
mutate(word = str_nth_word(trigram, 3)) %>% | |
count(word), | |
by = "word" | |
) %>% | |
replace_na(list(n = 0)) | |
## That start with she | |
third_word_she <- nodes %>% | |
filter(x == 3) %>% | |
select(-n) %>% | |
left_join( | |
trigrams %>% | |
filter(str_nth_word(trigram, 1) == start_words[2]) %>% | |
mutate(word = str_nth_word(trigram, 3)) %>% | |
count(word), | |
by = "word" | |
) %>% | |
replace_na(list(n = 0)) | |
# egdes between second and third column that starts with he | |
egde2_he <- egde_lines(filter(trigrams, | |
str_detect(trigram, paste0("^", start_words[1], " "))), | |
second_word_he, third_word_he, n = 50) %>% | |
mutate(y = y + 0.05, | |
from = start_words[1], | |
id = str_c(from, word_pair, sep = " ")) %>% | |
filter(number > 0) | |
# egdes between second and third column that starts with she | |
egde2_she <- egde_lines(filter(trigrams, | |
str_detect(trigram, paste0("^", start_words[2], " "))), | |
second_word_she, third_word_she, n = 50) %>% | |
mutate(y = y - 0.05, | |
from = start_words[2], | |
id = str_c(from, word_pair, sep = " ")) %>% | |
filter(number > 0) | |
# All edges | |
edges <- bind_rows(egde1, egde2_he, egde2_she) | |
p <- nodes %>% | |
ggplot(aes(x, y, label = word, size = n)) + | |
geom_text(hjust = 0, color = "#DDDDDD") + | |
theme_void() + | |
geom_line(data = edges, | |
aes(x, y, group = id, color = from, alpha = sqrt(number)), | |
inherit.aes = FALSE) + | |
theme(plot.background = element_rect(fill = "#666666", colour = 'black'), | |
text = element_text(color = "#EEEEEE", size = 15)) + | |
guides(alpha = "none", color = "none", size = "none") + | |
xlim(c(0.9, 3.2)) + | |
scale_color_manual(values = c("#F7E720", "#7FD24E")) + | |
labs(title = " Vizualizing trigrams in Jane Austen's, Emma") + | |
scale_size(range = c(3, 8)) | |
p |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
I am getting an error
Error in crossing(from = from_word$from, to = to_word$to) :
unused arguments (from = from_word$from, to = to_word$to)_
Do you know why it may be happening?