## earino/process_blog.R

## process_blog.R
library(tidyverse)
library(rvest)
library(tidytext)
library(topicmodels)
library(ggplot2)
library(dplyr)
library(tidyr)

# Directory where downloaded article text is cached between runs.
CACHE_DIR <- "./cache3"
if (!dir.exists(CACHE_DIR)) {
  dir.create(CACHE_DIR)
}

NUM_TOPICS <- 7 # number of LDA topics; change to whatever you want

# Blog export; the Title and URL columns are the ones this script reads.
raw_data <- read_csv("devblog-export.csv")

# Derive a syntactically safe cache path from each title. make.names() maps
# every invalid character to ".", so distinct titles can sanitise to the same
# string; unique = TRUE keeps such titles from sharing one cache file.
raw_data$cache_file <- str_c(
  file.path(CACHE_DIR, make.names(raw_data$Title, unique = TRUE)),
  ".txt"
)

# Load the body text of one blog post, downloading and caching it on first use.
#
# Reads row `i` of the file-level `raw_data` table (uses its `URL` and
# `cache_file` columns). If the cache file does not exist yet, the page is
# fetched, the text of its ".entry-content" nodes is extracted and written to
# the cache file.
#
# Args:
#   i: row index into `raw_data`.
#
# Returns:
#   A tibble with columns `cache_file` and `text`: one row per non-blank line
#   that appears before the first line containing "About the Authors" (or all
#   non-blank lines when that marker is absent).
prepare_blog_post <- function(i) {
  row <- raw_data[i, ]
  cache_file <- row$cache_file

  # Only hit the network when this post has not been cached yet.
  if (!file.exists(cache_file)) {
    # NOTE(review): xml2 documents this argument as `options`; the original
    # spelling `option` is kept unchanged here -- confirm it has the intended
    # effect.
    article <- read_html(row$URL, option = "NOERROR")
    article_text <- html_nodes(article, ".entry-content") %>% html_text()
    writeLines(article_text, cache_file)
  }

  content <- readLines(cache_file)

  # Remove the author section. match() yields NA when the marker is missing,
  # in which case the whole post is kept; head() with n = 0 correctly returns
  # nothing when the marker is on the very first line (1:0 would not).
  authors_start <- match(TRUE, str_detect(content, "About the Authors"))
  if (!is.na(authors_start)) {
    content <- head(content, authors_start - 1)
  }

  # Drop blank and whitespace-only lines.
  content <- content[!str_detect(content, "^\\s*$")]

  tibble(cache_file = cache_file, text = content)
}

# Load every post's text (one row per line) and attach the export metadata.
# seq_len() avoids iterating over c(1, 0) when the export has no rows.
blog_content <- map_dfr(seq_len(nrow(raw_data)), prepare_blog_post)
enhanced_data <- inner_join(blog_content, raw_data, by = "cache_file")

# Tokenise to one word per row, drop stop words, and count words per post.
tidy_blog <- enhanced_data %>% unnest_tokens("word", "text")
tidy_blog_count <- tidy_blog %>%
  anti_join(stop_words, by = "word") %>%
  count(cache_file, word, sort = TRUE)

# Fit the topic model on a document-term matrix keyed by cache file.
blog_dtm <- tidy_blog_count %>% cast_dtm(cache_file, word, n)
blog_lda <- LDA(blog_dtm, k = NUM_TOPICS, control = list(seed = 1234))
blog_topics <- tidy(blog_lda, matrix = "beta")

# Keep the five highest-beta terms of each topic.
top_terms <- blog_topics %>%
  group_by(topic) %>%
  top_n(5, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)

# One facet per topic, terms ordered by beta within each facet.
top_terms %>%
  mutate(term = reorder_within(term, beta, topic)) %>%
  ggplot(aes(beta, term, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  scale_y_reordered()
	library(tidyverse)
	library(rvest)
	library(tidytext)
	library(topicmodels)
	library(ggplot2)
	library(dplyr)
	library(tidyr)

	# NOTE(review): from the library() calls just above to the end of the file,
	# this section repeats the script at the top of the file verbatim (only the
	# indentation differs) -- likely a paste artifact; consider removing it.

	# Directory where downloaded article text is cached between runs.
	CACHE_DIR <- "./cache3"
	if (!dir.exists(CACHE_DIR)) {
	  dir.create(CACHE_DIR)
	}

	NUM_TOPICS <- 7 # number of LDA topics; change to whatever you want

	# Blog export; the Title and URL columns are the ones this script reads.
	raw_data <- read_csv("devblog-export.csv")

	# Derive a syntactically safe cache path from each title. make.names() maps
	# every invalid character to ".", so distinct titles can sanitise to the same
	# string; unique = TRUE keeps such titles from sharing one cache file.
	raw_data$cache_file <- str_c(
	  file.path(CACHE_DIR, make.names(raw_data$Title, unique = TRUE)),
	  ".txt"
	)

	# Load the body text of one blog post, downloading and caching it on first
	# use.
	#
	# NOTE(review): this redefines the identically named function that appears
	# earlier in this file.
	#
	# Reads row `i` of the file-level `raw_data` table (uses its `URL` and
	# `cache_file` columns). If the cache file does not exist yet, the page is
	# fetched, the text of its ".entry-content" nodes is extracted and written
	# to the cache file.
	#
	# Args:
	#   i: row index into `raw_data`.
	#
	# Returns:
	#   A tibble with columns `cache_file` and `text`: one row per non-blank
	#   line that appears before the first line containing "About the Authors"
	#   (or all non-blank lines when that marker is absent).
	prepare_blog_post <- function(i) {
	  row <- raw_data[i, ]
	  cache_file <- row$cache_file

	  # Only hit the network when this post has not been cached yet.
	  if (!file.exists(cache_file)) {
	    # NOTE(review): xml2 documents this argument as `options`; the original
	    # spelling `option` is kept unchanged here -- confirm it has the
	    # intended effect.
	    article <- read_html(row$URL, option = "NOERROR")
	    article_text <- html_nodes(article, ".entry-content") %>% html_text()
	    writeLines(article_text, cache_file)
	  }

	  content <- readLines(cache_file)

	  # Remove the author section. match() yields NA when the marker is
	  # missing, in which case the whole post is kept; head() with n = 0
	  # correctly returns nothing when the marker is on the very first line
	  # (1:0 would not).
	  authors_start <- match(TRUE, str_detect(content, "About the Authors"))
	  if (!is.na(authors_start)) {
	    content <- head(content, authors_start - 1)
	  }

	  # Drop blank and whitespace-only lines.
	  content <- content[!str_detect(content, "^\\s*$")]

	  tibble(cache_file = cache_file, text = content)
	}

	# NOTE(review): this repeats the analysis already run earlier in this file.

	# Load every post's text (one row per line) and attach the export metadata.
	# seq_len() avoids iterating over c(1, 0) when the export has no rows.
	blog_content <- map_dfr(seq_len(nrow(raw_data)), prepare_blog_post)
	enhanced_data <- inner_join(blog_content, raw_data, by = "cache_file")

	# Tokenise to one word per row, drop stop words, and count words per post.
	tidy_blog <- enhanced_data %>% unnest_tokens("word", "text")
	tidy_blog_count <- tidy_blog %>%
	  anti_join(stop_words, by = "word") %>%
	  count(cache_file, word, sort = TRUE)

	# Fit the topic model on a document-term matrix keyed by cache file.
	blog_dtm <- tidy_blog_count %>% cast_dtm(cache_file, word, n)
	blog_lda <- LDA(blog_dtm, k = NUM_TOPICS, control = list(seed = 1234))
	blog_topics <- tidy(blog_lda, matrix = "beta")

	# Keep the five highest-beta terms of each topic.
	top_terms <- blog_topics %>%
	  group_by(topic) %>%
	  top_n(5, beta) %>%
	  ungroup() %>%
	  arrange(topic, -beta)

	# One facet per topic, terms ordered by beta within each facet.
	top_terms %>%
	  mutate(term = reorder_within(term, beta, topic)) %>%
	  ggplot(aes(beta, term, fill = factor(topic))) +
	  geom_col(show.legend = FALSE) +
	  facet_wrap(~ topic, scales = "free") +
	  scale_y_reordered()