shawngraham/text-analysis-and-topic-model-from-scraping-one-set-of-diaries.r

## text-analysis-and-topic-model-from-scraping-one-set-of-diaries.r
#let's fix the first column in scrape
#i want to remove the first three characters, leaving us with a date
#or at least something that looks like a date

#this removes the diary metadata from the date
scrape$id <- substring(scrape$id, 4)

#this creates a new column with just the month extracted
month <- str_sub(scrape$id, 5, 6)
scrape['month'] <- month


library(tidytext)

text_df <- tibble(line = 1:179, text = scrape$entry, date = scrape$id, month = scrape$month)
text_df %>%
  unnest_tokens(word, text)

tidy_diary <- text_df %>%
  unnest_tokens(word, text)

data(stop_words)

tidy_diary <- tidy_diary %>%
  anti_join(stop_words)

library(ggplot2)

tidy_diary %>%
  count(word, sort = TRUE) %>%
  filter(n > 20) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n)) +
  geom_col() +
  xlab(NULL) +
  coord_flip()


diary_words <- tidy_diary %>%
  count(date, word, sort = TRUE)

total_words <- diary_words %>%
  group_by(date) %>%
  summarize(total = sum(n))

diary_words <- left_join(diary_words, total_words)

diary_words

ggplot(diary_words, aes(n/total, fill = date)) +
  geom_histogram(show.legend = FALSE) +
  facet_wrap(~date, scales = "free_y")

diary_words <- diary_words %>%
  bind_tf_idf(word, date, n)

diary_words

diary_words %>%
  arrange(desc(tf_idf)) %>%
  mutate(word = factor(word, levels = rev(unique(word)))) %>%
  group_by(date) %>%
  top_n(15) %>%
  ungroup() %>%
  ggplot(aes(word, tf_idf, fill = date)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "tf-idf") +
  facet_grid(~date, scales = "free") +
  coord_flip()


mystopwords <- tibble(word = c("page","view","illegible","image","img","mr","said","will"))

diary_words <- anti_join(diary_words, mystopwords,
                           by = "word")

dtm <- diary_words %>%
         cast_dtm(date, word, n)

dtm
d_lda <- LDA(dtm, k = 4, control = list(seed = 1234))
d_lda

d_topics <- tidy(d_lda, matrix = "beta")
d_topics

d_top_terms <- d_topics %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)

d_top_terms %>%
  #mutate(term = reorder(term, beta, topic)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip()

library(tidyr)
beta_spread <- d_topics %>%
  mutate(topic = paste0("topic", topic)) %>%
  spread(topic, beta) %>%
  filter(topic1 > .001 | topic2 > .001) %>%
  mutate(log_ratio = log2(topic2 / topic1))

beta_spread

beta_spread %>%
  group_by(direction = log_ratio > 0) %>%
  top_n(10, abs(log_ratio)) %>%
  ungroup() %>%
  mutate(term = reorder(term, log_ratio)) %>%
  ggplot(aes(term, log_ratio)) +
  geom_col() +
  labs(y = "Log2 ratio of beta in topic 2 / topic 1") +
  coord_flip()

d_documents <- tidy(d_lda, matrix = "gamma")
d_documents

#didn't need to do it above
#just needed to do it here
#so that I could add it to the
#plot
month <- str_sub(d_documents$document, 5, 6)
d_documents['month'] <- month

#plot the diary over time!
#and somewhere, back at the beginning,
#use slice to remove the one diary entry from the previous year
ggplot(d_documents, aes(x=month, y=gamma, fill=topic)) +
  geom_bar(stat = "identity") + ylab("proportion") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
	#let's fix the first column in scrape
	#i want to remove the first three characters, leaving us with a date
	#or at least something that looks like a date

	#this removes the diary metadata from the date
	scrape$id <- substring(scrape$id, 4)

	#this creates a new column with just the month extracted
	month <- str_sub(scrape$id, 5, 6)
	scrape['month'] <- month


	library(tidytext)

	text_df <- tibble(line = 1:179, text = scrape$entry, date = scrape$id, month = scrape$month)
	text_df %>%
	unnest_tokens(word, text)

	tidy_diary <- text_df %>%
	unnest_tokens(word, text)

	data(stop_words)

	tidy_diary <- tidy_diary %>%
	anti_join(stop_words)

	library(ggplot2)

	tidy_diary %>%
	count(word, sort = TRUE) %>%
	filter(n > 20) %>%
	mutate(word = reorder(word, n)) %>%
	ggplot(aes(word, n)) +
	geom_col() +
	xlab(NULL) +
	coord_flip()


	diary_words <- tidy_diary %>%
	count(date, word, sort = TRUE)

	total_words <- diary_words %>%
	group_by(date) %>%
	summarize(total = sum(n))

	diary_words <- left_join(diary_words, total_words)

	diary_words

	ggplot(diary_words, aes(n/total, fill = date)) +
	geom_histogram(show.legend = FALSE) +
	facet_wrap(~date, scales = "free_y")

	diary_words <- diary_words %>%
	bind_tf_idf(word, date, n)

	diary_words

	diary_words %>%
	arrange(desc(tf_idf)) %>%
	mutate(word = factor(word, levels = rev(unique(word)))) %>%
	group_by(date) %>%
	top_n(15) %>%
	ungroup() %>%
	ggplot(aes(word, tf_idf, fill = date)) +
	geom_col(show.legend = FALSE) +
	labs(x = NULL, y = "tf-idf") +
	facet_grid(~date, scales = "free") +
	coord_flip()


	mystopwords <- tibble(word = c("page","view","illegible","image","img","mr","said","will"))

	diary_words <- anti_join(diary_words, mystopwords,
	by = "word")

	dtm <- diary_words %>%
	cast_dtm(date, word, n)

	dtm
	d_lda <- LDA(dtm, k = 4, control = list(seed = 1234))
	d_lda

	d_topics <- tidy(d_lda, matrix = "beta")
	d_topics

	d_top_terms <- d_topics %>%
	group_by(topic) %>%
	top_n(10, beta) %>%
	ungroup() %>%
	arrange(topic, -beta)

	d_top_terms %>%
	#mutate(term = reorder(term, beta, topic)) %>%
	ggplot(aes(term, beta, fill = factor(topic))) +
	geom_col(show.legend = FALSE) +
	facet_wrap(~ topic, scales = "free") +
	coord_flip()

	library(tidyr)
	beta_spread <- d_topics %>%
	mutate(topic = paste0("topic", topic)) %>%
	spread(topic, beta) %>%
	filter(topic1 > .001 \| topic2 > .001) %>%
	mutate(log_ratio = log2(topic2 / topic1))

	beta_spread

	beta_spread %>%
	group_by(direction = log_ratio > 0) %>%
	top_n(10, abs(log_ratio)) %>%
	ungroup() %>%
	mutate(term = reorder(term, log_ratio)) %>%
	ggplot(aes(term, log_ratio)) +
	geom_col() +
	labs(y = "Log2 ratio of beta in topic 2 / topic 1") +
	coord_flip()

	d_documents <- tidy(d_lda, matrix = "gamma")
	d_documents

	#didn't need to do it above
	#just needed to do it here
	#so that I could add it to the
	#plot
	month <- str_sub(d_documents$document, 5, 6)
	d_documents['month'] <- month

	#plot the diary over time!
	#and somewhere, back at the beginning,
	#use slice to remove the one diary entry from the previous year
	ggplot(d_documents, aes(x=month, y=gamma, fill=topic)) +
	geom_bar(stat = "identity") + ylab("proportion") +
	theme(axis.text.x = element_text(angle = 90, hjust = 1))