# Document Term Matrix
# Source: GitHub gist kaddynator/eb543793a1880ee4bd9d5e33bba319b9
# (last active March 4, 2019).
library(readr)
library(slam)
library(quanteda)
library(tidyverse)
library(RColorBrewer)

# Document-feature matrix ----
# Build a document-feature matrix from TEXTINPUT (the input text, which is
# defined outside this section). English stopwords, numbers, punctuation and
# symbols are removed; unigrams only, no stemming.
# NOTE(review): the result is stored in a variable named `dfm`, the same name
# as the quanteda function. R still resolves the call to the function; the
# variable name is kept in case other code refers to it.
dfm <- dfm(
  TEXTINPUT,
  remove = stopwords("english"),
  ngrams = 1L,
  stem = FALSE,
  remove_numbers = TRUE,
  remove_punct = TRUE,
  remove_symbols = TRUE
)

# Trim rare features:
#   min_termfreq = remove words used fewer than x times overall
#   min_docfreq  = remove words used in fewer than x documents
vdfm <- dfm_trim(dfm, min_termfreq = 10, min_docfreq = 5)

# Show the 50 most frequent features of the trimmed matrix.
topfeatures(vdfm, n = 50)
# Word clouds ----
# Plot two word clouds from the trimmed matrix `vdfm`: one with the raw term
# frequencies and one with TF-IDF weights. Both use the 8-colour "Dark2"
# palette, draw at most 250 words, place words in frequency order rather than
# at random, and rotate a proportion of 0.1 of the words.
# NOTE(review): `main` is forwarded through `...`; confirm that the installed
# quanteda version actually renders it as a plot title.
textplot_wordcloud(
  vdfm,
  max_size = 6,
  min_size = 2,
  color = brewer.pal(8, "Dark2"),
  random_order = FALSE,
  rotation = 0.1,
  max_words = 250,
  main = "Raw Counts"
)
textplot_wordcloud(
  dfm_tfidf(vdfm),
  max_size = 3.5,
  min_size = 0.75,
  color = brewer.pal(8, "Dark2"),
  random_order = FALSE,
  rotation = 0.1,
  max_words = 250,
  main = "TF-IDF"
)
# Dendrogram ----
# Hierarchically cluster the top `numWords` words by their TF-IDF weights
# across documents and plot the resulting tree.
numWords <- 50

# TF-IDF weight the trimmed matrix and sort it with dfm_sort() so the top
# words come first.
# NOTE(review): dfm_weight() is applied with its default scheme to a matrix
# that is already TF-IDF weighted; kept as in the original, confirm it is
# needed.
wordDfm <- dfm_sort(dfm_weight(dfm_tfidf(vdfm)))

# Never ask for more words than the matrix has features; indexing past the
# end would fail with a subscript-out-of-bounds error.
numShown <- min(numWords, ncol(wordDfm))

# Transpose so each row is a word, then keep the top `numShown` words.
wordDfm <- t(wordDfm)[seq_len(numShown), ]

wordDistMat <- dist(wordDfm)
wordCluster <- hclust(wordDistMat)
plot(
  wordCluster,
  xlab = "",
  main = sprintf("TF-IDF Frequency weighting (First %d Words)", numShown)
)
# Topic modelling ----
library(topicmodels)

# Trimming rare features can leave some documents with no features at all,
# and LDA() refuses input that contains an all-zero row. Drop such documents
# before exporting; `vdfm` itself is left untouched.
lda_input <- dfm_subset(vdfm, ntoken(vdfm) > 0)

# Export to the format the topic model runs on.
dtm <- convert(lda_input, to = "topicmodels")

# Normal approach (timed) ----
start.time <- Sys.time()

# Estimate LDA with K topics using Gibbs sampling. The seed is fixed; burn-in
# and sampling are 100 iterations each, and `verbose = 25L` controls how often
# progress is reported.
K <- 20
lda <- LDA(
  dtm,
  k = K,
  method = "Gibbs",
  control = list(verbose = 25L, seed = 123, burnin = 100, iter = 100)
)

# Elapsed wall-clock time in explicit seconds, so the value does not switch
# units (secs/mins/hours) depending on how long the run took.
total.time <- difftime(Sys.time(), start.time, units = "secs")
normal_approach <- total.time
# End of gist. (The GitHub page footer with sign-up / sign-in prompts was not
# part of the script.)