# kaddynator/DFM.R

## DFM.R
library(readr)
library(slam)
library(quanteda);
library(tidyverse);
library(RColorBrewer)

### -------------------- Creating the document-feature matrix ----
## TEXTINPUT is the input text (defined elsewhere) that the dfm is built from.
## Numbers, punctuation and symbols are dropped while tokenising; English
## stopwords are removed from the tokens before they are counted. Only
## unigrams are kept and no stemming is applied (both are the defaults, so
## they need no argument).
## The text is tokenised explicitly rather than passing remove/stem/ngrams to
## dfm(): those arguments are deprecated in later quanteda releases, while the
## tokens() -> tokens_remove() -> dfm() chain is accepted by old and new ones.
dfm <- dfm(
  tokens_remove(
    tokens(TEXTINPUT,
           remove_numbers = TRUE,
           remove_punct = TRUE,
           remove_symbols = TRUE),
    stopwords("english")
  )
)

# Drop rare features:
#   min_termfreq = remove words used fewer than 10 times overall
#   min_docfreq  = remove words used in fewer than 5 documents
vdfm <- dfm_trim(dfm, min_termfreq = 10, min_docfreq = 5)

# The 50 most frequent features that survive trimming.
topfeatures(vdfm, n = 50)

# Plot two word clouds: one from the raw term frequencies and one from the
# TF-IDF weighted matrix. Both use the 8-colour "Dark2" palette, show at most
# 250 words, place the heaviest words first (random_order = FALSE) and rotate
# 10% of the words.
# The arguments use textplot_wordcloud()'s own names; the wordcloud-package
# spellings (scale, colors, random.order, rot.per, max.words) are deprecated
# aliases. `main` is not an argument of textplot_wordcloud(), so each heading
# is added with title() once the cloud has been drawn.
# NOTE(review): from quanteda 3 on, textplot_wordcloud() is provided by the
# quanteda.textplots package -- confirm which quanteda version this runs on.
textplot_wordcloud(vdfm,
                   max_size = 6, min_size = 2,
                   color = brewer.pal(8, "Dark2"),
                   random_order = FALSE, rotation = 0.1, max_words = 250)
title(main = "Raw Counts")

# The earlier version of this call passed `colormin_size = 0.1`, which is not
# a parameter of textplot_wordcloud(); rotation = 0.1 mirrors the raw-count
# cloud above.
# NOTE(review): confirm that rotation was the intended setting.
textplot_wordcloud(dfm_tfidf(vdfm),
                   max_size = 3.5, min_size = 0.75,
                   color = brewer.pal(8, "Dark2"),
                   random_order = FALSE, rotation = 0.1, max_words = 250)
title(main = "TF-IDF")


### -------- Creating the dendrogram ----

# Upper bound on the number of words shown in the dendrogram.
numWords <- 50

# Sort the features of the TF-IDF weighted matrix (dfm_sort() orders features
# in decreasing order by default), then transpose so that each row is a word.
# NOTE(review): dfm_weight() is applied with its default scheme on top of the
# TF-IDF weights -- confirm that this extra step is needed.
wordDfm <- t(dfm_sort(dfm_weight(dfm_tfidf(vdfm))))

# Keep the top numWords words, or every word when fewer are available, so the
# row index can never run past the end of the matrix.
wordDfm <- wordDfm[seq_len(min(numWords, nrow(wordDfm))), ]

# Cluster the words hierarchically on the distances between their rows
# (dist() and hclust() are both used with their defaults).
wordDistMat <- dist(wordDfm)
wordCluster <- hclust(wordDistMat)

# The title reports the number of words actually plotted.
plot(wordCluster, xlab = "",
     main = sprintf("TF-IDF Frequency weighting (First %d Words)",
                    nrow(wordDfm)))


########### Topic modelling

library(topicmodels)

# Export the trimmed dfm to the format topicmodels works with. Documents that
# are left without a single feature after trimming are dropped first, because
# LDA() rejects input rows that contain no non-zero entry.
# NOTE: when documents are dropped, the rows of the fitted model no longer
# line up one-to-one with TEXTINPUT; match on document names, not position.
dtm <- convert(dfm_subset(vdfm, ntoken(vdfm) > 0), to = "topicmodels")

########### normal approach #########
# Number of topics to estimate.
K <- 20

# Time a single Gibbs-sampled LDA fit. The seed is fixed so repeated runs give
# the same model; verbose = 25L reports progress every 25 iterations.
start.time <- Sys.time()
lda <- LDA(dtm, k = K, method = "Gibbs",
           control = list(verbose = 25L, seed = 123, burnin = 100, iter = 100))
total.time <- Sys.time() - start.time

# Elapsed wall-clock time of the fit above, kept under its own name.
normal_approach <- total.time
	# NOTE: the statements that used to follow here were a second, tab-indented
	# copy of the whole pipeline above (package loading, dfm construction, word
	# clouds, dendrogram and LDA fit). Running them again only redrew the same
	# plots, refitted the same seeded model and overwrote normal_approach with a
	# second timing, so the pipeline now runs once.