Skip to content

Instantly share code, notes, and snippets.

@therohk
Created October 21, 2019 11:46
Show Gist options
  • Save therohk/e88ea57861657c8b2eb4ac29120baffd to your computer and use it in GitHub Desktop.
Save therohk/e88ea57861657c8b2eb4ac29120baffd to your computer and use it in GitHub Desktop.
# ---- publish-date-bigram-tfidf
#' Plot the top tf-idf bigrams for each time slice
#'
#' Parses the first eight characters of `publish_date` as a `yyyymmdd` date,
#' keeps rows dated between `start` and `stend` (inclusive), floors every date
#' to `slice_unit`, counts bigrams per slice and scores them with
#' `bind_tf_idf()`, treating each time slice as one document. The best
#' `ntop` bigrams of every slice are drawn as horizontal bars, one facet per
#' slice.
#'
#' @param dbin Data frame with a `bigram` column and a `publish_date` column
#'   whose first eight characters form a `yyyymmdd` date.
#' @param start,stend Inclusive lower and upper date bounds; they are compared
#'   against the parsed `Date` values.
#' @param slice_unit Unit handed to `floor_date()` to define a time slice
#'   (default `"year"`).
#' @param nmin Smallest per-slice count a bigram needs to be kept.
#' @param ntop Number of highest-scoring bigrams shown per slice. Ties on
#'   `tf_idf` are kept, so a facet can show more than `ntop` bars.
#' @param ncol Number of facet columns.
#' @param nmax Largest per-slice count a bigram may have to be kept. Defaults
#'   to 10000, the bound that used to be hard-coded.
#' @return A ggplot object.
plot_bigram_tfidf <- function(dbin, start, stend, slice_unit = "year",
                              nmin = 5, ntop = 10, ncol = 3, nmax = 10000) {
  # Fail early with a readable message instead of deep inside the pipeline.
  stopifnot(is.data.frame(dbin))
  missing_cols <- setdiff(c("bigram", "publish_date"), names(dbin))
  if (length(missing_cols) > 0) {
    stop(
      "`dbin` is missing required column(s): ",
      paste(missing_cols, collapse = ", "),
      call. = FALSE
    )
  }
  if (nmin > nmax) {
    stop("`nmin` must not be greater than `nmax`", call. = FALSE)
  }

  bigram_tf_idf <- dbin %>%
    subset(select = c(bigram, publish_date)) %>%
    mutate(Date = as.Date(substr(publish_date, 1, 8), format = "%Y%m%d")) %>%
    filter(Date >= start & Date <= stend) %>%
    mutate(time_slice = floor_date(Date, unit = slice_unit)) %>%
    # Character labels keep the facet headers and fill scale discrete.
    mutate(time_slice = format(time_slice, "%Y-%m-%d")) %>%
    count(time_slice, bigram) %>%
    filter(between(n, nmin, nmax)) %>%
    bind_tf_idf(bigram, time_slice, n)

  top_bigrams <- bigram_tf_idf %>%
    group_by(time_slice) %>%
    # Rank explicitly by tf_idf rather than by whichever column is last.
    top_n(n = ntop, wt = tf_idf) %>%
    ungroup() %>%
    # Order bars inside each facet by that facet's own score. A single global
    # factor would place a bigram that occurs in several slices by its best
    # score anywhere, leaving bars unsorted within a facet.
    mutate(bigram = reorder_within(bigram, tf_idf, time_slice))

  plot_title <- paste(
    "Ntop", ntop, "bigram tf-idf score", start, "to", stend,
    "slby", slice_unit
  )

  ggplot(top_bigrams, aes(x = bigram, y = tf_idf, fill = time_slice)) +
    geom_col(show.legend = FALSE) +
    # Strips the suffix reorder_within() appended to make levels unique.
    scale_x_reordered() +
    labs(x = "bi-grm", y = "tf-idf") +
    facet_wrap(~time_slice, ncol = ncol, scales = "free") +
    theme(text = element_text(size = 10)) +
    ggtitle(plot_title) +
    coord_flip()
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment