## Biterm Topic Modeling
if (!require("pacman")) install.packages("pacman")
pacman::p_load(udpipe, BTM)
## Attach the tidyverse pieces used unqualified below
## (%>%, mutate, left_join, group_by, near, n, gather, as_tibble,
## rownames_to_column)
pacman::p_load(dplyr, tidyr, tibble)
## Load the `brussels_reviews_anno` dataset from the udpipe package
data("brussels_reviews_anno", package = "udpipe")
## Get and load the udpipe model
engmod <- udpipe_download_model(
  language = "english",
  udpipe_model_repo = "bnosac/udpipe.models.ud"
)
## Stop early with the downloader's own message if the model could not be
## fetched, instead of failing later while loading a missing file.
## isTRUE() keeps this a no-op when the flag is absent.
if (isTRUE(engmod$download_failed)) {
  stop(
    "udpipe model download failed: ", engmod$download_message,
    call. = FALSE
  )
}
ud_engmod <- udpipe_load_model(engmod$file_model)
## Annotate the text data and merge back together
debates <- sentimentr::presidential_debates_2012
nr <- nrow(debates)
## One zero-padded id per input row ("doc_0001", ...); built once so the
## annotation and the join key are guaranteed to agree
doc_ids <- paste0(
  "doc_",
  stringi::stri_pad_left(seq_len(nr), nchar(nr), "0")
)
anno_dat <- udpipe_annotate(
  ud_engmod,
  x = debates$dialogue,
  doc_id = doc_ids
) %>%
  ## udpipe_annotate() returns a CoNLL-U result object; convert it to a
  ## token-level data frame before treating it as a tibble
  as.data.frame() %>%
  as_tibble() %>%
  ## Bring the original debate columns back onto every token row
  left_join(
    debates %>% mutate(doc_id = doc_ids),
    by = "doc_id"
  )
## Keep only tokens tagged as nouns (NN, NNP, NNS), then reduce to the
## document id and lemma columns passed to the model
x <- dplyr::filter(anno_dat, xpos %in% c("NN", "NNP", "NNS"))
x <- dplyr::select(x, doc_id, lemma)
## Fit the biterm topic model: 30 topics, 1000 iterations,
## with progress traced every 100 iterations
model <- BTM(
  x,
  k = 30,
  beta = 0.01,
  iter = 1000,
  trace = 100
)
## Inspect the model - topic frequency + conditional term probabilities
topicterms <- terms(model, top_n = 20)
## Score the documents against the fitted topics
scores <- predict(model, newdata = x)
## Reshape the scores to long format and keep, for each document, the
## topic(s) whose probability equals that document's maximum (ties are kept).
## NOTE(review): assumes predict() returns a matrix whose row names are the
## document ids - confirm against the BTM documentation.
topics <- scores %>%
  as.data.frame() %>%
  rownames_to_column("doc_id") %>%
  as_tibble() %>%
  gather(Topic, Prob, -doc_id) %>%
  group_by(doc_id) %>%
  filter(near(Prob, max(Prob))) %>%
  ungroup()
## Attach the original dialogue to each document's top topic and collect the
## dialogue text into one character vector per topic
tops <- topics %>%
  left_join(
    sentimentr::presidential_debates_2012 %>%
      mutate(
        doc_id = paste0(
          "doc_",
          stringi::stri_pad_left(seq_len(n()), nchar(n()), "0")
        )
      ),
    by = "doc_id"
  ) %>%
  {split(.$dialogue, .$Topic)}
