Last active
November 11, 2019 04:02
-
-
Save shawngraham/546fe82cdc6d520668475d6afe8d608f to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#let's fix the first column in scrape | |
#i want to remove the first three characters, leaving us with a date | |
#or at least something that looks like a date | |
#this removes the diary metadata from the date | |
scrape$id <- substring(scrape$id, 4) | |
#this creates a new column with just the month extracted | |
month <- str_sub(scrape$id, 5, 6) | |
scrape['month'] <- month | |
library(tidytext) | |
text_df <- tibble(line = 1:179, text = scrape$entry, date = scrape$id, month = scrape$month) | |
text_df %>% | |
unnest_tokens(word, text) | |
tidy_diary <- text_df %>% | |
unnest_tokens(word, text) | |
data(stop_words) | |
tidy_diary <- tidy_diary %>% | |
anti_join(stop_words) | |
library(ggplot2) | |
tidy_diary %>% | |
count(word, sort = TRUE) %>% | |
filter(n > 20) %>% | |
mutate(word = reorder(word, n)) %>% | |
ggplot(aes(word, n)) + | |
geom_col() + | |
xlab(NULL) + | |
coord_flip() | |
diary_words <- tidy_diary %>% | |
count(date, word, sort = TRUE) | |
total_words <- diary_words %>% | |
group_by(date) %>% | |
summarize(total = sum(n)) | |
diary_words <- left_join(diary_words, total_words) | |
diary_words | |
ggplot(diary_words, aes(n/total, fill = date)) + | |
geom_histogram(show.legend = FALSE) + | |
facet_wrap(~date, scales = "free_y") | |
diary_words <- diary_words %>% | |
bind_tf_idf(word, date, n) | |
diary_words | |
diary_words %>% | |
arrange(desc(tf_idf)) %>% | |
mutate(word = factor(word, levels = rev(unique(word)))) %>% | |
group_by(date) %>% | |
top_n(15) %>% | |
ungroup() %>% | |
ggplot(aes(word, tf_idf, fill = date)) + | |
geom_col(show.legend = FALSE) + | |
labs(x = NULL, y = "tf-idf") + | |
facet_grid(~date, scales = "free") + | |
coord_flip() | |
mystopwords <- tibble(word = c("page","view","illegible","image","img","mr","said","will")) | |
diary_words <- anti_join(diary_words, mystopwords, | |
by = "word") | |
dtm <- diary_words %>% | |
cast_dtm(date, word, n) | |
dtm | |
d_lda <- LDA(dtm, k = 4, control = list(seed = 1234)) | |
d_lda | |
d_topics <- tidy(d_lda, matrix = "beta") | |
d_topics | |
d_top_terms <- d_topics %>% | |
group_by(topic) %>% | |
top_n(10, beta) %>% | |
ungroup() %>% | |
arrange(topic, -beta) | |
d_top_terms %>% | |
#mutate(term = reorder(term, beta, topic)) %>% | |
ggplot(aes(term, beta, fill = factor(topic))) + | |
geom_col(show.legend = FALSE) + | |
facet_wrap(~ topic, scales = "free") + | |
coord_flip() | |
library(tidyr) | |
beta_spread <- d_topics %>% | |
mutate(topic = paste0("topic", topic)) %>% | |
spread(topic, beta) %>% | |
filter(topic1 > .001 | topic2 > .001) %>% | |
mutate(log_ratio = log2(topic2 / topic1)) | |
beta_spread | |
beta_spread %>% | |
group_by(direction = log_ratio > 0) %>% | |
top_n(10, abs(log_ratio)) %>% | |
ungroup() %>% | |
mutate(term = reorder(term, log_ratio)) %>% | |
ggplot(aes(term, log_ratio)) + | |
geom_col() + | |
labs(y = "Log2 ratio of beta in topic 2 / topic 1") + | |
coord_flip() | |
d_documents <- tidy(d_lda, matrix = "gamma") | |
d_documents | |
#didn't need to do it above | |
#just needed to do it here | |
#so that I could add it to the | |
#plot | |
month <- str_sub(d_documents$document, 5, 6) | |
d_documents['month'] <- month | |
#plot the diary over time! | |
#and somewhere, back at the beginning, | |
#use slice to remove the one diary entry from the previous year | |
ggplot(d_documents, aes(x=month, y=gamma, fill=topic)) + | |
geom_bar(stat = "identity") + ylab("proportion") + | |
theme(axis.text.x = element_text(angle = 90, hjust = 1)) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment