Last active
November 11, 2019 02:48
-
-
Save shawngraham/183482f9eda1ed66102d118fea1d0542 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Fix the first column in `scrape`: remove the first three characters of each
# id, leaving us with a date (or at least something that looks like a date).
# substring() with only a start position keeps everything from character 4 on.
scrape$id <- substring(scrape$id, 4)
library(tm)

# Turn the entries into a corpus object.
docs <- Corpus(VectorSource(scrape$entry))

# NOTE(review): punctuation is stripped before stopwords are removed, so
# contractions in the stopword list (e.g. "don't") presumably no longer match
# once they have become "dont" -- confirm whether this order is intended.
docs <- tm_map(docs, removePunctuation)

# Transform to lower case. tolower is a plain function, so it is wrapped in
# content_transformer() before being handed to tm_map().
docs <- tm_map(docs, content_transformer(tolower))

# Strip digits.
docs <- tm_map(docs, removeNumbers)

# Remove stopwords: the standard English list plus a few extra words chosen
# for this corpus.
docs <- tm_map(
  docs,
  removeWords,
  c(
    stopwords("english"),
    "page", "view", "illegible", "image", "img", "mr", "said", "will"
  )
)

# Strip whitespace (cosmetic?).
docs <- tm_map(docs, stripWhitespace)

# Stem the documents so that different forms of the same word are not counted
# as separate terms.
docs <- tm_map(docs, stemDocument)

# Create the document-term matrix and print its summary.
dtm <- DocumentTermMatrix(docs)
dtm

# Remove any empty rows (documents left with no terms after cleaning).
# NOTE(review): apply() on a DocumentTermMatrix presumably coerces it to a
# dense matrix first; this may be memory-hungry on a large corpus -- confirm.
rowTotals <- apply(dtm, 1, sum) # number of words in each document
dtm.new <- dtm[rowTotals > 0, ] # keep only documents that still have words
## topicmodels
# Load topic-modelling and plotting packages.
library(topicmodels)
library(tidytext)
library(ggplot2)
# dplyr supplies %>%, group_by(), top_n(), ungroup(), arrange() and mutate(),
# all of which are used below.
library(dplyr)

# https://www.tidytextmining.com/topicmodeling.html
# This line might take a while. We're looking for ten topics.
# Fit on dtm.new (the matrix with empty documents dropped) rather than dtm:
# LDA() refuses a matrix that contains rows with no non-zero entries.
# The fixed seed keeps the fit reproducible between runs.
JA_topic_model <- LDA(dtm.new, k = 10, control = list(seed = 321))

# Word-topic probabilities: one row per (topic, term) pair with its beta.
JA_topics <- tidy(JA_topic_model, matrix = "beta")
JA_topics

# Keep the ten highest-beta terms in each topic, ordered by topic and then by
# descending beta.
text_top_terms <- JA_topics %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)

# Bar chart of the top terms, one facet per topic; coord_flip() puts the terms
# on the vertical axis so they stay readable.
text_top_terms %>%
  mutate(term = reorder(term, beta)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip()

# View the terms per topic.
# https://tm4ss.github.io/docs/Tutorial_6_Topic_Models.html
terms(JA_topic_model, 10)

# Build a short label for each topic by pasting its top five terms together
# (one column of the terms matrix per topic).
top5termsPerTopic <- terms(JA_topic_model, 5)
topicNames <- apply(top5termsPerTopic, 2, paste, collapse = " ")
topicNames
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment