Skip to content

Instantly share code, notes, and snippets.

@achetverikov
Created February 2, 2022 11:43
Show Gist options
  • Save achetverikov/c63b307dcd1042db4fad7697ceffe01a to your computer and use it in GitHub Desktop.
Save achetverikov/c63b307dcd1042db4fad7697ceffe01a to your computer and use it in GitHub Desktop.
Topic modeling of ECVP 21 abstracts
library(data.table)
library(xml2)
library(stm)
library(quanteda)
library(gofastr)
library(stopwords)
library(spacyr)
library(quanteda)
library(quanteda.textplots)
library(Hmisc)
# Download the ECVP 2021 programme XML and convert it into a nested R list.
abstracts_xml <- read_xml(
  "https://ecvp2021.org/programme/all.xml",
  quiet = FALSE
)
abstracts_list <- as_list(abstracts_xml)

# For every entry under the `abstracts` node take the first child of its first
# `text` element, then flatten the result into a plain, unnamed vector.
abstract_texts <- abstracts_list$abstracts |>
  lapply(function(entry) entry$text[[1]][[1]]) |>
  unlist() |>
  unname()
# Stopword list: the English and SMART lists from the stopwords package plus
# hand-picked terms that are generic in this corpus (methods vocabulary,
# reporting verbs, units). The combined vector is passed through
# gofastr::prep_stopwords().
my_stopwords <- prep_stopwords(
  c(
    stopwords("en"),
    stopwords("SMART"),
    "group", "participant", "subject", "test", "result", "show", "use",
    "display", "found", "find", "demonstrate", "human", "differ", "study",
    "studied", "perform", "test", "experiment", "correlat", "one", "two",
    "row", "three", "csv", "can", "task", "participants", "results",
    "presented", "performance", "perception", "different", "response",
    "responses", "condition", "conditions", "observers", "suggest",
    "suggests", "informed", "compared", "measured", "experiment",
    "processed", "ms", "e.g.", "investigate", "increase", "decrease",
    "reveal", "vary", "discuss", "stimulus", "open", "involve", "trial",
    "hz"
  )
)
# Start the spaCy backend from a local virtualenv.
# NOTE(review): this path is machine-specific; adjust it before running the
# script on another computer.
spacy_initialize(virtualenv = "C:/Users/andche/spacy2")

# Parse the abstracts, requesting the detailed `tag` column as well.
txt_parsed <- spacy_parse(abstract_texts, tag = TRUE)

# Where the detailed tag is NNS, VBD or VBN (presumably plural nouns and
# past-tense / past-participle verbs -- confirm against the tag set of the
# spaCy model in use), replace the surface token with its lemma; all other
# tokens are kept as they are.
txt_parsed$token <- with(
  txt_parsed,
  ifelse(tag %in% c("NNS", "VBD", "VBN"), lemma, token)
)
# Keep only content words (spaCy coarse POS NOUN, ADJ or VERB) whose lemma is
# not in the custom stopword list. `%nin%` is the negated `%in%` from Hmisc.
is_content_word <- txt_parsed$pos %in% c("NOUN", "ADJ", "VERB") &
  txt_parsed$lemma %nin% my_stopwords

# Convert the filtered parse to quanteda tokens (using lemmas) and strip
# punctuation, symbols, separators and numbers.
content_tokens <- tokens(
  as.tokens(txt_parsed[is_content_word, ], use_lemma = TRUE),
  remove_punct = TRUE,
  remove_symbols = TRUE,
  remove_separators = TRUE,
  remove_numbers = TRUE
)

# Build 1- to 3-grams joined by "_", from adjacent tokens (skip = 0) and from
# tokens one position apart (skip = 1).
n_grams <- tokens_ngrams(
  content_tokens,
  n = 1:3,
  concatenator = "_",
  skip = 0:1
)

# The `remove` argument of dfm() is deprecated since quanteda 3.0; build the
# document-feature matrix first and drop stopword features with dfm_remove().
dfmat <- dfm_remove(dfm(n_grams), pattern = my_stopwords)

# Discard rare features: keep terms occurring at least 20 times overall and in
# at least 15 documents.
dfmat_trimmed <- dfm_trim(dfmat, min_termfreq = 20, min_docfreq = 15)

# Quick visual check of the remaining vocabulary.
textplot_wordcloud(
  dfmat_trimmed,
  min_size = 4,
  max_size = 16,
  random_color = TRUE
)
# Fit a structural topic model with 30 topics on the trimmed matrix.
lda_res <- stm(dfmat_trimmed, K = 30)

# NOTE(review): this par() call runs before the PNG device is opened, so it
# applies to whichever graphics device is current at this point -- presumably
# it affects the final on-screen plot rather than the PNG. Confirm the intent.
par(cex = 0.5)

# Write the FREX-labelled topic summary to a PNG of (10 * res) x (8 * res)
# pixels at resolution `res`.
res <- 240
ragg::agg_png(
  "ecvp_topics.png",
  width = 10 * res,
  height = 8 * res,
  res = res
)
plot(
  lda_res,
  labeltype = "frex",
  n = 5,
  main = "Specific words",
  frexw = 0.9
)
dev.off()

# Same summary labelled with the highest-probability words, drawn on the
# device that is current after the PNG device has been closed.
plot(
  lda_res,
  labeltype = "prob",
  n = 5,
  main = "Most probable words"
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment