# shawngraham/topic-model-from-one-diary-scrape.r

## topic-model-from-one-diary-scrape.r
# Tidy up the first column of `scrape`: drop its first three characters so
# that what is left is a date (or at least something that looks like one).
# NOTE: this overwrites scrape$id in place, so running it a second time
# strips three more characters.


scrape$id <- substring(scrape$id, first = 4)

library(tm)


# Wrap the diary entries in a tm corpus object.
docs <- Corpus(VectorSource(scrape$entry))

# Cleaning passes, applied one after another in this order.
# NOTE(review): punctuation is stripped before stopwords are removed, so any
# stopword-list entry containing an apostrophe presumably can no longer match
# the cleaned text -- verify whether that matters for this corpus.

# 1. Strip punctuation.
docs <- tm_map(docs, removePunctuation)

# 2. Fold everything to lower case.
docs <- tm_map(docs, content_transformer(tolower))

# 3. Strip digits.
docs <- tm_map(docs, removeNumbers)

# 4. Remove the standard English stopwords plus a few extra words chosen
#    for this scrape.
docs <- tm_map(
  docs,
  removeWords,
  c(
    stopwords("english"),
    "page", "view", "illegible", "image", "img", "mr", "said", "will"
  )
)

# 5. Collapse runs of whitespace (possibly only cosmetic).
docs <- tm_map(docs, stripWhitespace)

# 6. Stem, so that different forms of the same word are not counted as
#    separate terms.
docs <- tm_map(docs, stemDocument)

# Build the document-term matrix and print its summary.
dtm <- DocumentTermMatrix(docs)
dtm

# Remove empty rows: a document that has no terms left after cleaning has a
# row total of zero and cannot be assigned to any topic.
rowTotals <- apply(dtm, 1, sum)  # sum of term counts in each document
dtm.new <- dtm[rowTotals > 0, ]  # keep only documents that still have terms

## topicmodels
# Load topic-modelling, tidying and plotting packages
library(topicmodels)
library(tidytext)
library(ggplot2)

# https://www.tidytextmining.com/topicmodeling.html
# This line might take a while. We're looking for ten topics; the seed is
# fixed so the fit is reproducible.
# Fit on the filtered matrix (dtm.new), not on dtm: LDA() refuses a matrix in
# which any document row is all zeros. Note that the documents in the model
# therefore correspond to the rows of dtm.new, which may be fewer than the
# rows of scrape.
JA_topic_model <- LDA(dtm.new, k = 10, control = list(seed = 321))

# Word-topic probabilities ("beta") as a tidy table with topic, term and beta
# columns; printed for inspection.
JA_topics <- tidy(JA_topic_model, matrix = "beta")
JA_topics

# The pipe and the verbs below (group_by, top_n, ungroup, arrange, mutate)
# come from dplyr, which none of the packages loaded so far attaches.
library(dplyr)

# Ten highest-beta terms for each topic (ties are kept), sorted by topic and
# then by descending probability.
text_top_terms <- JA_topics %>%
  group_by(topic) %>%
  top_n(n = 10, wt = beta) %>%
  ungroup() %>%
  arrange(topic, desc(beta))

# One horizontal bar chart per topic.
# reorder_within() orders the terms separately inside each topic; a plain
# reorder(term, beta) ranks each term once for the whole plot, so a term that
# appears in several topics ends up out of order in some facets.
# scale_x_reordered() strips the suffix reorder_within() adds to the labels
# (term is the x aesthetic; coord_flip() only changes how it is drawn).
text_top_terms %>%
  mutate(term = reorder_within(term, beta, topic)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  scale_x_reordered() +
  coord_flip()

# Inspect the terms per topic
# https://tm4ss.github.io/docs/Tutorial_6_Topic_Models.html

# Print the ten top terms of every topic.
terms(JA_topic_model, 10)

# Build a short label for each topic by joining its five top terms with
# single spaces, then print the labels.
top5termsPerTopic <- terms(JA_topic_model, 5)
topicNames <- apply(
  top5termsPerTopic,
  MARGIN = 2,
  FUN = function(topic_terms) paste(topic_terms, collapse = " ")
)
topicNames
	# NOTE: a tab-indented, verbatim second copy of the whole script above
	# (id clean-up, corpus preparation, LDA fit, plot, topic names) used to
	# follow here. It was removed because running it again was harmful:
	#   * scrape$id <- substring(scrape$id, 4) was applied a second time,
	#     cutting three more characters off the already-cleaned ids;
	#   * the corpus, document-term matrix and ten-topic model were rebuilt
	#     from scratch with the same seed, only repeating the slow fit.
	# Everything this script produces (scrape$id, docs, dtm, dtm.new,
	# JA_topic_model, JA_topics, text_top_terms, topicNames) is already created
	# by the first copy.