## trinker/topicmodeling_bit.R

## topicmodeling_bit.R
##
## Fits a biterm topic model (BTM) on the noun lemmas of
## sentimentr::presidential_debates_2012$dialogue and then groups the dialogue
## rows by the topic(s) with the highest predicted probability.
##
## Objects left in the calling environment: engmod, ud_engmod, nr, debates,
## anno_dat, x, model, topicterms, scores, topics, tops.

## Load (installing when missing) every package the script calls.
## dplyr supplies %>%, mutate(), left_join(), n(), near(), ...; tibble supplies
## rownames_to_column(); tidyr supplies pivot_longer(). sentimentr and stringi
## are only used through `::` but still have to be installed.
if (!requireNamespace("pacman", quietly = TRUE)) install.packages("pacman")
pacman::p_load(udpipe, BTM, dplyr, tidyr, tibble, sentimentr, stringi)

## Build zero-padded document ids ("doc_0001", ...) for `n` documents.
## The pad width is the number of digits in `n`, so ids sort lexicographically.
make_doc_ids <- function(n) {
    paste0("doc_", stringi::stri_pad_left(seq_len(n), nchar(n), "0"))
}

## Get and load the udpipe model
engmod <- udpipe_download_model(
    language = "english",
    udpipe_model_repo = "bnosac/udpipe.models.ud"
)
## Newer udpipe versions report a failed download in the returned data frame;
## on versions without that column the check is a no-op.
if (isTRUE(engmod$download_failed)) {
    stop("udpipe model download failed: ", engmod$download_message, call. = FALSE)
}
ud_engmod <- udpipe_load_model(engmod$file_model)

## Debate data keyed by doc_id. The same table feeds the annotation ids and
## both joins below, so the keys are generated exactly once.
nr <- nrow(sentimentr::presidential_debates_2012)
debates <- sentimentr::presidential_debates_2012 %>%
    mutate(doc_id = make_doc_ids(n()))

## Annotate the text data and merge back together
anno_dat <- udpipe_annotate(
        ud_engmod,
        x = debates$dialogue,
        doc_id = debates$doc_id
    ) %>%
    as.data.frame() %>%
    as_tibble() %>%
    left_join(debates, by = "doc_id")


## Taking only noun lemmas of english data
x <- anno_dat %>%
    dplyr::filter(xpos %in% c("NN", "NNP", "NNS")) %>%
    dplyr::select(doc_id, lemma)


## Building the model (seed fixed so the fit is reproducible)
set.seed(321)
model <- BTM(x, k = 30, beta = 0.01, iter = 1000, trace = 100)

## Inspect the model - topic frequency + conditional term probabilities
## (bare expressions: these auto-print only when run interactively)
model$theta


topicterms <- terms(model, top_n = 20)
topicterms


## Per-document topic scores; row names of `scores` carry the doc_id
scores <- predict(model, newdata = x)


## Long format (one row per doc_id/topic pair), keeping for every document the
## topic(s) whose probability equals that document's maximum. Ties are kept,
## so a document can appear under more than one topic.
topics <- as.data.frame(scores) %>%
    rownames_to_column("doc_id") %>%
    as_tibble() %>%
    pivot_longer(-doc_id, names_to = "Topic", values_to = "Prob") %>%
    group_by(doc_id) %>%
    dplyr::filter(near(Prob, max(Prob))) %>%
    ungroup() %>%
    arrange(doc_id)

## List of dialogue vectors, one element per topic
topic_dialogue <- left_join(topics, debates, by = "doc_id")
tops <- split(topic_dialogue$dialogue, topic_dialogue$Topic)