Skip to content

Instantly share code, notes, and snippets.

@shawngraham
Last active November 11, 2019 02:48
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save shawngraham/183482f9eda1ed66102d118fea1d0542 to your computer and use it in GitHub Desktop.
Save shawngraham/183482f9eda1ed66102d118fea1d0542 to your computer and use it in GitHub Desktop.
# Tidy the id column of scrape: drop its first three characters and keep
# everything from the fourth character onward, which should leave a date
# (or at least something that looks like one).
scrape$id <- substring(scrape$id, first = 4)
library(tm)
# Wrap the entry column in a tm corpus, one document per entry
docs <- Corpus(VectorSource(scrape$entry))
# Drop punctuation
docs <- tm_map(docs, removePunctuation)
# Lower-case everything (tolower is a base function, so it has to be wrapped
# in content_transformer() before tm_map can apply it)
docs <- tm_map(docs, content_transformer(tolower))
# Drop digits
docs <- tm_map(docs, removeNumbers)
# Drop the standard English stopwords plus a few extra words chosen for
# this corpus
docs <- tm_map(
  docs,
  removeWords,
  c(
    stopwords("english"),
    "page", "view", "illegible", "image", "img", "mr", "said", "will"
  )
)
# Collapse runs of whitespace (cosmetic?)
docs <- tm_map(docs, stripWhitespace)
# Stem the documents so that different forms of the same word are not
# counted as separate terms
docs <- tm_map(docs, stemDocument)
# Build the document-term matrix from the cleaned corpus and print its summary
dtm <- DocumentTermMatrix(docs)
dtm
# Drop empty documents:
# sum each row to get the number of words left in every document ...
rowTotals <- apply(dtm, MARGIN = 1, FUN = sum)
# ... then keep only the rows that still contain at least one word
dtm.new <- dtm[rowTotals > 0, ]
##topicmodels
# Load the topic-modelling, tidying and plotting packages
library(topicmodels)
library(tidytext)
library(ggplot2)
#https://www.tidytextmining.com/topicmodeling.html
# Fit a ten-topic LDA model. This line might take a while.
# The model is fitted on dtm.new (the matrix with the empty documents
# removed), not on dtm: LDA() stops with an error when any row of its input
# has no non-zero entry, and fitting on dtm would leave the filtering step
# above without any effect.
# The fixed seed keeps the fit reproducible from run to run.
JA_topic_model <- LDA(dtm.new, k = 10, control = list(seed = 321))
# Word-topic probabilities: one row per topic/term pair with its beta
JA_topics <- tidy(JA_topic_model, matrix = "beta")
JA_topics
# The pipe and the verbs below (group_by, top_n, ungroup, arrange, mutate)
# come from dplyr, which the script does not attach anywhere in this section.
# Attaching it again is harmless if an earlier part already did.
library(dplyr)
# Highest-beta terms within each topic, sorted by topic and then by
# descending beta. top_n() keeps ties, so a topic can contribute more than
# ten rows.
text_top_terms <- JA_topics %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)
# One bar chart per topic, terms on the (flipped) x axis.
# reorder_within() orders the terms by beta separately inside each topic; a
# plain reorder() builds a single ordering shared by all facets, which
# misplaces any term that appears in more than one topic.
# scale_x_reordered() strips the suffix reorder_within() appends to the labels.
text_top_terms %>%
  mutate(term = reorder_within(term, beta, topic)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip() +
  scale_x_reordered()
# Show the ten top terms of every topic
#https://tm4ss.github.io/docs/Tutorial_6_Topic_Models.html
terms(JA_topic_model, 10)
# Build a short label per topic: take its five top terms (one column per
# topic) and join each column into a single space-separated string
top5termsPerTopic <- terms(JA_topic_model, 5)
topicNames <- apply(
  top5termsPerTopic,
  MARGIN = 2,
  FUN = paste,
  collapse = " "
)
topicNames
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment