Skip to content

Instantly share code, notes, and snippets.

@trinker
Created February 13, 2019 17:21
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save trinker/caf59c729fb034cf318b12c090633aca to your computer and use it in GitHub Desktop.
Save trinker/caf59c729fb034cf318b12c090633aca to your computer and use it in GitHub Desktop.
Biterm Topic Modeling
## Install pacman on first use. It is only ever called as pacman::p_load(),
## so it needs to be installed but not attached.
if (!requireNamespace("pacman", quietly = TRUE)) install.packages("pacman")
## Attach (installing when missing) every package this script relies on:
##   udpipe, BTM          - annotation and the biterm topic model
##   dplyr, tidyr, tibble - %>%, mutate(), left_join(), filter(), near(),
##                          gather(), as_tibble(), rownames_to_column(), ...
##   sentimentr, stringi  - presidential_debates_2012 and stri_pad_left()
## Without the last five the pipelines below fail in a fresh R session.
pacman::p_load(udpipe, BTM, dplyr, tidyr, tibble, sentimentr, stringi)
## NOTE(review): this dataset is loaded but not referenced anywhere else in
## this script; kept so the global environment is unchanged for the caller.
data("brussels_reviews_anno", package = "udpipe")
## Get and load the udpipe model
engmod <- udpipe_download_model(
  language = "english",
  udpipe_model_repo = "bnosac/udpipe.models.ud"
)
ud_engmod <- udpipe_load_model(engmod$file_model)
## Run the udpipe model over every debate turn, then attach the columns of the
## debates table to each annotated row.
## Both sides build the same zero-padded key ("doc_" + row number padded to
## the width of the row count), which is what the join matches on.
nr <- nrow(sentimentr::presidential_debates_2012)
anno_dat <- ud_engmod %>%
  udpipe_annotate(
    x = sentimentr::presidential_debates_2012[["dialogue"]],
    doc_id = paste0("doc_", stringi::stri_pad_left(seq_len(nr), nchar(nr), "0"))
  ) %>%
  as.data.frame() %>%
  as_tibble() %>%
  left_join(
    mutate(
      sentimentr::presidential_debates_2012,
      doc_id = paste0("doc_", stringi::stri_pad_left(seq_len(n()), nchar(n()), "0"))
    ),
    by = "doc_id"
  )
## Keep only rows whose xpos tag is one of the noun tags (NN, NNS, NNP) and
## reduce them to the two columns handed to BTM(): doc_id and lemma.
x <- dplyr::select(
  dplyr::filter(anno_dat, xpos %in% c("NN", "NNS", "NNP")),
  doc_id, lemma
)
## Fix the RNG seed, then fit a 30-topic biterm topic model on the noun lemmas
set.seed(321)
model <- BTM(
  x,
  k = 30,
  beta = 0.01,
  iter = 1000,
  trace = 100
)
## Inspect the fit: topic frequency first, then the 20 highest-ranked terms
## (conditional term probabilities) for each topic
model[["theta"]]
topicterms <- terms(
  model,
  top_n = 20
)
topicterms
## Score the documents in x against the fitted model. The result is treated
## below as one row per document (row names = doc_id) and one column per topic.
scores <- predict(model, newdata = x)
## Reshape to long form (doc_id, Topic, Prob) and keep, for each document, the
## topic(s) whose probability equals that document's maximum. near() is used
## for the floating-point comparison, so an exact tie keeps more than one row
## for that document.
topics <- as.data.frame(scores) %>%
  rownames_to_column('doc_id') %>%
  as_tibble() %>%
  gather(Topic, Prob, -doc_id) %>%
  group_by(doc_id) %>%
  filter(near(Prob, max(Prob))) %>%
  ## Drop the grouping once the per-document filter is done so it does not
  ## leak into later verbs applied to `topics`.
  ungroup() %>%
  arrange(doc_id)
## Look up the debate row for every (doc_id, Topic) pair via the same
## zero-padded doc_id key, then collect the dialogue text into a list with one
## element per topic.
tops <- left_join(
  topics,
  mutate(
    sentimentr::presidential_debates_2012,
    doc_id = paste0("doc_", stringi::stri_pad_left(seq_len(n()), nchar(n()), "0"))
  ),
  by = "doc_id"
) %>%
  with(split(dialogue, Topic))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment