Skip to content

Instantly share code, notes, and snippets.

@vanatteveldt
Created September 16, 2022 09:37
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save vanatteveldt/86ad9486bd0f25738379db97aadca981 to your computer and use it in GitHub Desktop.
Save vanatteveldt/86ad9486bd0f25738379db97aadca981 to your computer and use it in GitHub Desktop.
library(tidyverse)
library(readtext)
library(quanteda)
library(topicmodels)
# Read the LaTeX sources matched by the glob pattern.
d <- readtext("/home/wva/ccsbook/chapter*/*.tex")

# Build a paragraph-level document-feature matrix:
#   * the corpus is reshaped so that each paragraph is its own document,
#   * symbols and punctuation are removed while tokenising,
#   * any remaining token matching the regex \W (i.e. containing a non-word
#     character) is removed as well.
# TRUE/FALSE are spelled out because T and F are ordinary variables that can
# be reassigned.
dfm <- corpus(d) |>
  corpus_reshape(to = "paragraphs") |>
  tokens(split_tags = FALSE, remove_symbols = TRUE, remove_punct = TRUE) |>
  tokens_remove("\\W", valuetype = "regex") |>
  dfm()
# Seed R's own RNG for anything downstream that draws from it.
set.seed(123)

# Fit a 20-topic LDA model on a trimmed version of the DFM:
#   * keep terms whose total frequency is at or above the 0.8 quantile,
#   * drop terms that occur in more than 10% of the documents,
#   * drop stopwords and features shorter than 3 characters.
#
# NOTE(review): per the topicmodels control documentation, the default (VEM)
# fit takes its seed from the `control` object rather than from set.seed(),
# so the seed is passed explicitly here. Without it the topic numbering can
# change between runs, and the topic ids hard-coded further down (18, 6, 13)
# would point at arbitrary topics. Those ids were chosen by inspecting one
# particular fit; re-check them against the term table shown below.
m <- dfm |>
  dfm_trim(
    min_termfreq = 0.8, termfreq_type = "quantile",
    max_docfreq = 0.1, docfreq_type = "prop"
  ) |>
  dfm_remove(stopwords()) |>
  dfm_select(min_nchar = 3) |>
  convert(to = "topicmodels") |>
  LDA(k = 20, control = list(seed = 123L))

# Show the top 20 terms per topic in the data viewer. View() needs an
# interactive session, so it is skipped when the script runs in batch mode.
if (interactive()) {
  topicmodels::terms(m, 20) |> View()
}
# Per-term topic distribution, one row per (term, topic) pair:
#   n = number of word assignments of `term` to `topic`,
#   f = n divided by the term's total over all topics.
#
# m@wordassignments is read as a sparse triplet: $j is the term index and $v
# the assigned topic. Term names are looked up by index; the explicit
# `levels` keep the factor in vocabulary order. (Relabelling with
# factor(index, labels = ...) only works when every index value occurs at
# least once, and errors otherwise.) The document index ($i) is not needed
# for these counts, so no document column is built.
terms <- tibble(
  term = factor(m@terms[m@wordassignments$j], levels = m@terms),
  topic = m@wordassignments$v
) |>
  count(term, topic, name = "n") |>
  group_by(term) |>
  mutate(f = n / sum(n)) |>
  ungroup() |>
  arrange(term)
# Wide table with one row per term holding the term's share (f) in three
# hand-picked topics: topic 18 -> x, topic 6 -> y, topic 13 -> z.
# (term, topic) pairs missing from `terms` are filled with 0, and terms with
# no share in any of the three topics are dropped.
xyz <- terms |>
  select(!n) |>
  pivot_wider(names_from = topic, values_from = f, values_fill = 0) |>
  transmute(term, x = `18`, y = `6`, z = `13`) |>
  filter(x + y + z > 0)
# Cartesian plotting coordinates (tx, ty) for every term in `xyz`.
txy <- xyz |>
  # Map each share v to .005 + .99 * v, i.e. from [0, 1] into [.005, .995]
  # (presumably so that no term sits exactly on an edge or corner of the
  # triangle -- TODO confirm).
  mutate(across(c(x, y, z), \(share) .005 + .99 * share)) |>
  # The term is moved into the row names for the tlr2xy() call and restored
  # as a regular column afterwards.
  column_to_rownames("term") |>
  ggtern::tlr2xy(ggtern::coord_tern()) |>
  rownames_to_column("term") |>
  rename(tx = x, ty = y) |>
  as_tibble()
# One row per term: its most frequently assigned topic, its total count, its
# shares in the three selected topics (x, y, z) and its plot position
# (tx, ty).
# Sorting by descending n first makes topic[1] the topic with the most
# assignments for that term.
# Both joins name their key explicitly, and only tx/ty are taken from `txy`,
# so that a leftover ternary column in `txy` can neither become an
# accidental join key nor collide with the unscaled x/y/z from `xyz`.
t <- terms |>
  arrange(desc(n)) |>
  group_by(term) |>
  summarize(topic = topic[1], n = sum(n)) |>
  inner_join(xyz, by = "term") |>
  inner_join(select(txy, term, tx, ty), by = "term")

# Terms ranked by raw frequency, without the words "chapter" and "section".
maxfreq <- t |>
  filter(!term %in% c("chapter", "section")) |>
  arrange(desc(n))

# Terms ranked by frequency weighted with how "mixed" they are.
# max(min(x, z), min(x, y), min(y, z)) is the middle value of (x, y, z), so
# `mix` is only large when a term has a sizeable share in at least two of the
# three topics.
mix <- t |>
  filter(!term %in% c("chapter", "section")) |>
  mutate(
    mix = pmax(pmin(x, z), pmin(x, y), pmin(y, z)),
    n2 = n * mix
  ) |>
  arrange(desc(n2))
# Word cloud of the 150 top-ranked rows of `mix` plus the 25 top-ranked rows
# of `maxfreq`. The helper columns that only `mix` carries are dropped first
# so that terms present in both selections collapse into a single row.
# Each term is placed at (tx, ty), sized by n and coloured by its normalised
# topic shares (x -> red, y -> green, z -> blue).
#
# The plot object is built before the PDF device is opened, so an error in
# the data preparation cannot leave a device open.
wordcloud_plot <- bind_rows(head(mix, 150), head(maxfreq, 25)) |>
  select(-mix, -n2) |>
  distinct() |>
  arrange(desc(n)) |>
  mutate(t = x + y + z, col = rgb(x / t, y / t, z / t)) |>
  ggplot(aes(x = tx, y = ty, label = term, size = n, color = col)) +
  ggwordcloud::geom_text_wordcloud_area(shape = "triangle-upright") +
  scale_size_continuous(range = c(.5, 10)) +
  scale_color_identity() +
  theme_void() +
  theme(legend.position = "none")

pdf("/tmp/test.pdf", width = 8, height = 8)
# print() is called explicitly: ggplot objects are only drawn when printed,
# and top-level auto-printing does not happen when the script is run through
# source(), which would otherwise produce an empty PDF. The device is closed
# even if rendering fails.
tryCatch(
  print(wordcloud_plot),
  finally = invisible(dev.off())
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment