Skip to content

Instantly share code, notes, and snippets.

@kaddynator
Last active March 4, 2019 21:16
Show Gist options
  • Save kaddynator/eb543793a1880ee4bd9d5e33bba319b9 to your computer and use it in GitHub Desktop.
Save kaddynator/eb543793a1880ee4bd9d5e33bba319b9 to your computer and use it in GitHub Desktop.
Document Term Matrix
library(readr)
library(slam)
library(quanteda);
library(tidyverse);
library(RColorBrewer)
### -------------------- creating document term matrix ----
# TEXTINPUT is the input text for the dfm; it must already exist when this
# section runs.
#
# Tokenise first, dropping numbers, punctuation and symbols. Tokens stay as
# unstemmed unigrams, which is what the earlier `ngrams = 1L, stem = FALSE`
# settings asked for.
# NOTE(review): this follows quanteda's tokens() -> tokens_remove() -> dfm()
# workflow instead of passing `remove`/`ngrams`/`stem` to dfm(), which newer
# quanteda releases deprecate -- confirm against the installed version.
toks <- tokens(TEXTINPUT,
               remove_numbers = TRUE,
               remove_punct = TRUE,
               remove_symbols = TRUE)
# Drop English stopwords (matching is case-insensitive by default).
toks <- tokens_remove(toks, stopwords("english"))
# Named `full_dfm` rather than `dfm` so the variable does not shadow dfm().
full_dfm <- dfm(toks)
# Trim rare terms:
#   min_termfreq = drop terms occurring fewer than 10 times overall
#   min_docfreq  = drop terms occurring in fewer than 5 documents
vdfm <- dfm_trim(full_dfm, min_termfreq = 10, min_docfreq = 5)
# Show the 50 most frequent remaining terms.
topfeatures(vdfm, n = 50)
# Plot two word clouds from the trimmed dfm: one with the raw term
# frequencies and one with TF-IDF weights. Both use the same 8-colour
# "Dark2" palette.
# The second call previously passed `colormin_size = 0.1`, which is not a
# word cloud argument; it now uses `rot.per = 0.1` like the first call.
# NOTE(review): scale / colors / random.order / rot.per / max.words are
# wordcloud-style argument names -- confirm the installed quanteda version
# still accepts them, and that `main` actually renders a title.
textplot_wordcloud(vdfm, scale = c(6, 2), colors = brewer.pal(8, "Dark2"),
                   random.order = FALSE, rot.per = 0.1, max.words = 250,
                   main = "Raw Counts")
textplot_wordcloud(dfm_tfidf(vdfm), scale = c(3.5, .75),
                   colors = brewer.pal(8, "Dark2"),
                   random.order = FALSE, rot.per = 0.1, max.words = 250,
                   main = "TF-IDF")
### -------- creating dendrogram ----
# Upper bound on the number of top-weighted terms to cluster.
numWords <- 50L
# TF-IDF weight the trimmed dfm and sort its features (dfm_sort's default
# ordering), so the leading features are the ones kept below.
# NOTE(review): dfm_weight() is applied on top of dfm_tfidf() with its default
# scheme -- confirm it is needed (or even accepted) by the installed quanteda.
wordDfm <- dfm_sort(dfm_weight(dfm_tfidf(vdfm)))
# Transpose so that each row is a term; dist() then measures distances
# between terms rather than between documents.
wordDfm <- t(wordDfm)
# Clamp to the number of terms actually available so the subset cannot run
# out of bounds when fewer than numWords features survived trimming.
numShown <- min(numWords, nrow(wordDfm))
wordDfm <- wordDfm[seq_len(numShown), ] # keep the top numShown words
wordDistMat <- dist(wordDfm)
wordCluster <- hclust(wordDistMat)
# The title reports the number of terms actually plotted.
plot(wordCluster, xlab = "",
     main = sprintf("TF-IDF Frequency weighting (First %d Words)", numShown))
########### topic modelling ----
library(topicmodels)
# Export the trimmed dfm to the document-term-matrix format that topicmodels
# works with.
dtm <- convert(vdfm, to = "topicmodels")
# Trimming rare terms can leave some documents with no terms at all. LDA()
# refuses a matrix containing all-zero rows, so drop those documents first.
dtm <- dtm[slam::row_sums(dtm) > 0, ]
########### normal approach #########
# Wall-clock timing of a single LDA fit.
start.time <- Sys.time()
# Estimate LDA with K topics via Gibbs sampling. The fixed seed makes the run
# reproducible; verbose = 25L controls how often progress is reported.
K <- 20
lda <- LDA(dtm, k = K, method = "Gibbs",
           control = list(verbose = 25L, seed = 123, burnin = 100, iter = 100))
# Elapsed time as a difftime (units are chosen automatically by R).
total.time <- Sys.time() - start.time
normal_approach <- total.time
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment