Skip to content

Instantly share code, notes, and snippets.

@earino
Created March 20, 2021 17:47
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save earino/bfd8b6ea04ee67250bd6847c44a36d2f to your computer and use it in GitHub Desktop.
Save earino/bfd8b6ea04ee67250bd6847c44a36d2f to your computer and use it in GitHub Desktop.
a pipeline to process the corporate blog
library(tidyverse)
library(rvest)
library(tidytext)
library(topicmodels)
library(ggplot2)
library(dplyr)
library(tidyr)
# Directory that holds one cached text file per blog post.
CACHE_DIR <- "./cache3"
if (!dir.exists(CACHE_DIR)) {
  dir.create(CACHE_DIR)
}

# Number of topics the LDA model is fit with; change to whatever you want.
NUM_TOPICS <- 7

# Blog export; the `Title` column is used below to name the cache files.
raw_data <- read_csv("devblog-export.csv")

# Derive a filesystem-safe cache path per post from its title.
# `unique = TRUE` keeps posts whose titles sanitize to the same name from
# sharing (and silently reusing) a single cache file.
raw_data$cache_file <- str_c(
  file.path(CACHE_DIR, make.names(raw_data$Title, unique = TRUE)),
  ".txt"
)
# Load one blog post as a tibble of non-blank text lines.
#
# Reads row `i` of the global `raw_data` (its `URL` and `cache_file` columns
# are used). When the row's cache file does not exist yet, the page is
# downloaded, the text of its ".entry-content" nodes is extracted and written
# to the cache file. The cached text is then read back, cut off just before
# the first line containing "About the Authors" (when there is one), and
# stripped of blank / whitespace-only lines.
#
# Args:
#   i: Row index into `raw_data`.
#
# Returns:
#   A tibble with columns `cache_file` and `text`, one row per kept line
#   (zero rows when no text is left).
prepare_blog_post <- function(i) {
  row <- raw_data[i, ]
  cache_file <- row$cache_file

  if (!file.exists(cache_file)) {
    # NOTE(review): the argument name `option` is kept exactly as written;
    # read_html() documents `options` -- confirm which one was intended.
    article <- read_html(row$URL, option = "NOERROR")
    article_text <- article %>%
      html_nodes(".entry-content") %>%
      html_text()
    writeLines(article_text, cache_file)
  }

  content <- readLines(cache_file)

  # Remove the author section. Posts without the marker are kept whole
  # instead of failing on min() of an empty set, and a marker on the very
  # first line leaves no content at all rather than the marker line itself.
  author_lines <- which(str_detect(content, "About the Authors"))
  if (length(author_lines) > 0) {
    content <- head(content, author_lines[1] - 1)
  }

  # Kill blank lines.
  content <- content[!str_detect(content, "^\\s*$")]

  tibble(cache_file = cache_file, text = content)
}
# Collect the cleaned text lines of every post into one tibble.
# seq_len() yields an empty sequence for an empty export, where 1:nrow()
# would iterate over c(1, 0).
blog_content <- map_dfr(seq_len(nrow(raw_data)), prepare_blog_post)

# Attach the export's metadata to every line; `cache_file` is the column
# that prepare_blog_post() copies from `raw_data`, so it is the join key.
enhanced_data <- inner_join(blog_content, raw_data, by = "cache_file")

# One row per word occurrence.
tidy_blog <- enhanced_data %>% unnest_tokens("word", "text")

# Drop stop words and count the remaining words per post.
tidy_blog_count <- tidy_blog %>%
  anti_join(stop_words, by = "word") %>%
  count(cache_file, word, sort = TRUE)

# Document-term matrix: documents are cache files, terms are words, and the
# cell values are the per-post counts.
blog_dtm <- tidy_blog_count %>% cast_dtm(cache_file, word, n)
# Fit an LDA model with NUM_TOPICS topics; the seed is fixed in the control
# list.
blog_lda <- LDA(
  blog_dtm,
  k = NUM_TOPICS,
  control = list(seed = 1234)
)

# Tidy the model's "beta" matrix into a data frame.
blog_topics <- tidy(blog_lda, matrix = "beta")

# Keep the top 5 rows by beta within each topic, then sort by topic and
# descending beta.
top_terms <- arrange(
  ungroup(top_n(group_by(blog_topics, topic), 5, beta)),
  topic,
  desc(beta)
)
# Bar chart of the top terms, one facet per topic. Terms are reordered by
# beta within their own topic so every facet is sorted independently.
ggplot(
  mutate(top_terms, term = reorder_within(term, beta, topic)),
  aes(beta, term, fill = factor(topic))
) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  scale_y_reordered()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment