# achetverikov/ecvp_21_abstracts.R

## ecvp_21_abstracts.R
library(data.table)
library(xml2)
library(stm)
library(quanteda)
library(gofastr)
library(stopwords)
library(spacyr)
library(quanteda)
library(quanteda.textplots)
library(Hmisc)

# Download the ECVP 2021 programme XML and extract the abstract texts.
# NOTE(review): `quiet` is not among read_xml()'s documented arguments (it is
# an argument of download_xml()); it is kept here -- confirm it has any effect.
abstracts_xml <- read_xml(
  "https://ecvp2021.org/programme/all.xml",
  quiet = FALSE
)

# Turn the parsed document into nested R lists.
abstracts_list <- as_list(abstracts_xml)

# For each entry under `abstracts`, take the first element of the first
# element of its `text` entry.
abstract_texts <- lapply(abstracts_list$abstracts, \(x) x$text[[1]][[1]])

# Flatten to a plain unnamed vector (entries that produced NULL are dropped
# by unlist()).
abstract_texts <- unname(unlist(abstract_texts))

# Stop-word list: the lists returned by stopwords("en") and
# stopwords("SMART") plus hand-picked terms, all passed through
# prep_stopwords().
# NOTE(review): "SMART" is given as the first positional argument of
# stopwords(); confirm it resolves to the SMART list as intended.
my_stopwords <- prep_stopwords(
  c(
    stopwords("en"),
    stopwords("SMART"),
    "group", "participant", "subject", "test", "result", "show", "use",
    "display", "found", "find", "demonstrate", "human", "differ", "study",
    "studied", "perform", "test", "experiment", "correlat", "one", "two",
    "row", "three", "csv", "can", "task", "participants", "results",
    "presented", "performance", "perception", "different", "response",
    "responses", "condition", "conditions", "observers", "suggest",
    "suggests", "informed", "compared", "measured", "experiment",
    "processed", "ms", "e.g.", "investigate", "increase", "decrease",
    "reveal", "vary", "discuss", "stimulus", "open", "involve", "trial",
    "hz"
  )
)

# Start spaCy from a Python virtual environment at a fixed local path.
spacy_initialize(virtualenv = "C:/Users/andche/spacy2")

# Parse all abstracts; tag = TRUE adds the `tag` column used below.
txt_parsed <- spacy_parse(abstract_texts, tag = TRUE)

# Where the tag is NNS, VBD or VBN (plural nouns, past-tense and
# past-participle verbs in the Penn Treebank tag set), replace the surface
# token with its lemma; every other token is left untouched.
txt_parsed$token <- local({
  swap <- txt_parsed$tag %in% c("NNS", "VBD", "VBN")
  out <- txt_parsed$token
  out[swap] <- txt_parsed$lemma[swap]
  out
})

# Build 1- to 3-grams (adjacent, or with one token skipped) from content
# words. Only rows tagged NOUN, ADJ or VERB whose lemma is not in
# `my_stopwords` are kept.
# NOTE(review): use_lemma = TRUE makes as.tokens() read the `lemma` column, so
# any edits made to the `token` column would not reach the n-grams -- confirm.
n_grams <- txt_parsed[
  txt_parsed$pos %in% c("NOUN", "ADJ", "VERB") &
    txt_parsed$lemma %nin% my_stopwords,
] |>
  as.tokens(use_lemma = TRUE) |>
  tokens(
    remove_punct = TRUE,
    remove_symbols = TRUE,
    remove_separators = TRUE,
    remove_numbers = TRUE
  ) |>
  tokens_ngrams(n = 1:3, concatenator = "_", skip = 0:1)

# Build the document-feature matrix, then drop stop-word features with
# dfm_remove(); the `remove` argument of dfm() is deprecated in quanteda 3.
dfmat <- dfm_remove(dfm(n_grams), pattern = my_stopwords)

# Keep features occurring at least 20 times overall and in at least 15
# documents, then draw a word cloud of what is left.
dfmat_trimmed <- dfm_trim(dfmat, min_termfreq = 20, min_docfreq = 15)
textplot_wordcloud(
  dfmat_trimmed,
  min_size = 4,
  max_size = 16,
  random_color = TRUE
)

# Fit a topic model with 30 topics on the trimmed document-feature matrix.
lda_res <- stm(dfmat_trimmed, K = 30)

# NOTE(review): par() acts on the graphics device that is current at this
# point, i.e. before agg_png() opens the PNG device -- confirm that is the
# intended target of the cex setting.
par(cex = 0.5)

# Resolution of the PNG; width and height are 10 and 8 times this value.
res <- 240
ragg::agg_png(
  "ecvp_topics.png",
  width = 10 * res,
  height = 8 * res,
  res = res
)
# Top 5 FREX-labelled words per topic, written to the PNG.
plot(
  lda_res,
  labeltype = "frex",
  n = 5,
  main = "Specific words",
  frexw = 0.9
)
dev.off()

# Top 5 highest-probability words per topic, drawn on the device that is
# current once the PNG device has been closed.
plot(
  lda_res,
  labeltype = "prob",
  n = 5,
  main = "Most probable words"
)
	library(data.table)
	library(xml2)
	library(stm)
	library(quanteda)
	library(gofastr)
	library(stopwords)
	library(spacyr)
	library(quanteda)
	library(quanteda.textplots)
	library(Hmisc)

	# Download the ECVP 2021 programme XML and extract the abstract texts.
	# NOTE(review): `quiet` is not among read_xml()'s documented arguments (it
	# is an argument of download_xml()); kept here -- confirm it has any effect.
	abstracts_xml <- read_xml(
	  "https://ecvp2021.org/programme/all.xml",
	  quiet = FALSE
	)

	# Turn the parsed document into nested R lists.
	abstracts_list <- as_list(abstracts_xml)

	# For each entry under `abstracts`, take the first element of the first
	# element of its `text` entry.
	abstract_texts <- lapply(abstracts_list$abstracts, \(x) x$text[[1]][[1]])

	# Flatten to a plain unnamed vector (entries that produced NULL are
	# dropped by unlist()).
	abstract_texts <- unname(unlist(abstract_texts))

	# Stop-word list: the lists returned by stopwords("en") and
	# stopwords("SMART") plus hand-picked terms, all passed through
	# prep_stopwords().
	# NOTE(review): "SMART" is given as the first positional argument of
	# stopwords(); confirm it resolves to the SMART list as intended.
	my_stopwords <- prep_stopwords(
	  c(
	    stopwords("en"),
	    stopwords("SMART"),
	    "group", "participant", "subject", "test", "result", "show", "use",
	    "display", "found", "find", "demonstrate", "human", "differ", "study",
	    "studied", "perform", "test", "experiment", "correlat", "one", "two",
	    "row", "three", "csv", "can", "task", "participants", "results",
	    "presented", "performance", "perception", "different", "response",
	    "responses", "condition", "conditions", "observers", "suggest",
	    "suggests", "informed", "compared", "measured", "experiment",
	    "processed", "ms", "e.g.", "investigate", "increase", "decrease",
	    "reveal", "vary", "discuss", "stimulus", "open", "involve", "trial",
	    "hz"
	  )
	)

	# Start spaCy from a Python virtual environment at a fixed local path.
	spacy_initialize(virtualenv = "C:/Users/andche/spacy2")

	# Parse all abstracts; tag = TRUE adds the `tag` column used below.
	txt_parsed <- spacy_parse(abstract_texts, tag = TRUE)

	# Where the tag is NNS, VBD or VBN (plural nouns, past-tense and
	# past-participle verbs in the Penn Treebank tag set), replace the surface
	# token with its lemma; every other token is left untouched.
	txt_parsed$token <- local({
	  swap <- txt_parsed$tag %in% c("NNS", "VBD", "VBN")
	  out <- txt_parsed$token
	  out[swap] <- txt_parsed$lemma[swap]
	  out
	})

	# Build 1- to 3-grams (adjacent, or with one token skipped) from content
	# words. Only rows tagged NOUN, ADJ or VERB whose lemma is not in
	# `my_stopwords` are kept.
	# NOTE(review): use_lemma = TRUE makes as.tokens() read the `lemma` column,
	# so any edits made to the `token` column would not reach the n-grams --
	# confirm.
	n_grams <- txt_parsed[
	  txt_parsed$pos %in% c("NOUN", "ADJ", "VERB") &
	    txt_parsed$lemma %nin% my_stopwords,
	] |>
	  as.tokens(use_lemma = TRUE) |>
	  tokens(
	    remove_punct = TRUE,
	    remove_symbols = TRUE,
	    remove_separators = TRUE,
	    remove_numbers = TRUE
	  ) |>
	  tokens_ngrams(n = 1:3, concatenator = "_", skip = 0:1)

	# Build the document-feature matrix, then drop stop-word features with
	# dfm_remove(); the `remove` argument of dfm() is deprecated in quanteda 3.
	dfmat <- dfm_remove(dfm(n_grams), pattern = my_stopwords)

	# Keep features occurring at least 20 times overall and in at least 15
	# documents, then draw a word cloud of what is left.
	dfmat_trimmed <- dfm_trim(dfmat, min_termfreq = 20, min_docfreq = 15)
	textplot_wordcloud(
	  dfmat_trimmed,
	  min_size = 4,
	  max_size = 16,
	  random_color = TRUE
	)

	# Fit a topic model with 30 topics on the trimmed document-feature matrix.
	lda_res <- stm(dfmat_trimmed, K = 30)

	# NOTE(review): par() acts on the graphics device that is current at this
	# point, i.e. before agg_png() opens the PNG device -- confirm that is the
	# intended target of the cex setting.
	par(cex = .5)

	# Resolution of the PNG; width and height are 10 and 8 times this value.
	# The multiplication operators were missing (`10res`, `8res`), which is a
	# parse error in R.
	res <- 240
	ragg::agg_png(
	  "ecvp_topics.png",
	  width = 10 * res,
	  height = 8 * res,
	  res = res
	)
	# Top 5 FREX-labelled words per topic, written to the PNG.
	plot(lda_res, labeltype = "frex", n = 5, main = "Specific words", frexw = 0.9)
	dev.off()

	# Top 5 highest-probability words per topic, drawn on the device that is
	# current once the PNG device has been closed.
	plot(lda_res, labeltype = "prob", n = 5, main = "Most probable words")