Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
#let's fix the first column in scrape
#i want to remove the first three characters, leaving us with a date
#or at least something that looks like a date
#this removes the diary metadata from the date
scrape$id <- substring(scrape$id, 4)
#this creates a new column with just the month extracted
month <- str_sub(scrape$id, 5, 6)
scrape['month'] <- month
library(tidytext)
text_df <- tibble(line = 1:179, text = scrape$entry, date = scrape$id, month = scrape$month)
text_df %>%
unnest_tokens(word, text)
tidy_diary <- text_df %>%
unnest_tokens(word, text)
data(stop_words)
tidy_diary <- tidy_diary %>%
anti_join(stop_words)
library(ggplot2)
tidy_diary %>%
count(word, sort = TRUE) %>%
filter(n > 20) %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(word, n)) +
geom_col() +
xlab(NULL) +
coord_flip()
diary_words <- tidy_diary %>%
count(date, word, sort = TRUE)
total_words <- diary_words %>%
group_by(date) %>%
summarize(total = sum(n))
diary_words <- left_join(diary_words, total_words)
diary_words
ggplot(diary_words, aes(n/total, fill = date)) +
geom_histogram(show.legend = FALSE) +
facet_wrap(~date, scales = "free_y")
diary_words <- diary_words %>%
bind_tf_idf(word, date, n)
diary_words
diary_words %>%
arrange(desc(tf_idf)) %>%
mutate(word = factor(word, levels = rev(unique(word)))) %>%
group_by(date) %>%
top_n(15) %>%
ungroup() %>%
ggplot(aes(word, tf_idf, fill = date)) +
geom_col(show.legend = FALSE) +
labs(x = NULL, y = "tf-idf") +
facet_grid(~date, scales = "free") +
coord_flip()
mystopwords <- tibble(word = c("page","view","illegible","image","img","mr","said","will"))
diary_words <- anti_join(diary_words, mystopwords,
by = "word")
dtm <- diary_words %>%
cast_dtm(date, word, n)
dtm
d_lda <- LDA(dtm, k = 4, control = list(seed = 1234))
d_lda
d_topics <- tidy(d_lda, matrix = "beta")
d_topics
d_top_terms <- d_topics %>%
group_by(topic) %>%
top_n(10, beta) %>%
ungroup() %>%
arrange(topic, -beta)
d_top_terms %>%
#mutate(term = reorder(term, beta, topic)) %>%
ggplot(aes(term, beta, fill = factor(topic))) +
geom_col(show.legend = FALSE) +
facet_wrap(~ topic, scales = "free") +
coord_flip()
library(tidyr)
beta_spread <- d_topics %>%
mutate(topic = paste0("topic", topic)) %>%
spread(topic, beta) %>%
filter(topic1 > .001 | topic2 > .001) %>%
mutate(log_ratio = log2(topic2 / topic1))
beta_spread
beta_spread %>%
group_by(direction = log_ratio > 0) %>%
top_n(10, abs(log_ratio)) %>%
ungroup() %>%
mutate(term = reorder(term, log_ratio)) %>%
ggplot(aes(term, log_ratio)) +
geom_col() +
labs(y = "Log2 ratio of beta in topic 2 / topic 1") +
coord_flip()
d_documents <- tidy(d_lda, matrix = "gamma")
d_documents
#didn't need to do it above
#just needed to do it here
#so that I could add it to the
#plot
month <- str_sub(d_documents$document, 5, 6)
d_documents['month'] <- month
#plot the diary over time!
#and somewhere, back at the beginning,
#use slice to remove the one diary entry from the previous year
ggplot(d_documents, aes(x=month, y=gamma, fill=topic)) +
geom_bar(stat = "identity") + ylab("proportion") +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment