Skip to content

Instantly share code, notes, and snippets.

@whatalnk
Last active January 29, 2019 02:18
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save whatalnk/3e3b80b612b513afbb08b0f6d665ceb1 to your computer and use it in GitHub Desktop.
Save whatalnk/3e3b80b612b513afbb08b0f6d665ceb1 to your computer and use it in GitHub Desktop.
library("dplyr")
library("tidyr")
library("stringr")
library("lubridate")
library("purrr")
library("readr")
library("ggplot2")
library("rvest")
library("selectr")
library("httr")
library("xml2")
library("conflicted")
# Guest information from Guru wiki
# Guest and host
bs_people <- function(x){
html_text(x) %>%
str_match_all("\\([^()]+\\)") %>%
purrr::pluck(1) %>% last() %>%
str_replace_all(c("\\("="", "\\)"=""))
}
# episode Number
bs_epNum_html <- function(x){
x %>%
html_node("a") %>%
html_attr("href") %>%
parse_url() %>%
purrr::pluck("path") %>%
strsplit("/") %>%
purrr::pluck(1, 2)
}
# Guru wiki
html <- read_html("https://github.com/drikin/guruwiki/wiki/backspace.fm-%E3%82%A8%E3%83%94%E3%82%BD%E3%83%BC%E3%83%89%E4%B8%80%E8%A6%A7")
d_people <- html %>%
querySelector(".markdown-body") %>%
html_nodes("li") %>%
map(function(x){
people <- bs_people(x)
ep <- bs_epNum_html(x)
list("ep" = ep, "people" = people)
}) %>%
bind_rows()
# Episode information from rss feed
# Epsode number
bs_epNum <- function(x){
xml_find_first(x, ".//guid") %>%
xml_text() %>%
parse_url() %>%
purrr::pluck("path") %>%
strsplit("/") %>%
purrr::pluck(1, 2)
}
# Title
bs_title <- function(x){
xml_find_first(x, ".//title") %>%
xml_text()
}
# Publish date
bs_pubDate <- function(x){
xml_find_first(x, ".//pubDate") %>%
xml_text() %>%
parse_date_time(orders="a, d b Y H:M:S z", locale="C")
}
# Duration
bs_duration_text <- function(x){
xml_find_first(x, ".//itunes:duration") %>%
xml_text()
}
bs_duration <- function(x){
txt <- bs_duration_text(x)
suppressWarnings(ret <- hms(txt))
if (is.na(ret)) {
ret <- ms(txt)
}
ret %>%
as.duration() %>%
as.numeric("seconds")
}
#' x nodes
bs_shownotes_html <- function(x){
x %>%
xml_find_first(".//description") %>%
xml_text()
}
x <- xml2::read_xml("http://feeds.backspace.fm/backspacefm")
d <- xml_find_all(x, ".//item") %>%
map(function(x){
ep <- bs_epNum(x)
title <- bs_title(x)
dr <- bs_duration(x)
dr_text <-bs_duration_text(x)
pd <- bs_pubDate(x)
shownotes <- bs_shownotes_html(x)
list("ep" = ep,
"title" = title,
"publishedDate" = pd,
"duration_sec"=dr,
"shownotes" = shownotes)
}) %>%
bind_rows() %>%
# Fix ep
mutate(ep=case_when(
str_detect(title, "G-side") & !str_detect(ep, "g") ~ sprintf("g%03d", as.integer(ep)),
TRUE ~ ep
)) %>%
mutate(side=case_when(
str_detect(ep, "vidcon") ~ "Vidcon",
str_detect(ep, "mitaimon") ~ "Mitaimon",
str_detect(ep, "basuke") ~ "Basuke",
str_detect(ep, "d") ~ "Danbo",
str_detect(ep, "v") ~ "Vim",
str_detect(ep, "z") ~ "Zenji",
str_detect(ep, "t") ~ "T",
str_detect(ep, "g") ~ "G",
str_detect(ep, "b") ~ "B",
TRUE ~ "A"
))
joined <- full_join(d_people, d, by="ep")
by_people <- joined %>%
select(ep, people, duration_sec) %>%
transform(people = str_split(people, ",")) %>%
unnest(people) %>%
mutate(people = str_trim(people))
by_people_summ <- by_people %>%
dplyr::filter(!is.na(people)) %>%
group_by(people) %>%
summarise(n = n(), total_time = sum(duration_sec)) %>%
arrange(desc(n)) %>%
ungroup()
joined %>%
arrange(ep) %>%
mutate(publishedDate = format(publishedDate, tz="UTC")) %>%
write_csv("backspacefm.csv", na="")
by_people_summ %>%
write_csv("backspacefm_by_people.csv")
# shownotes
bs_nd_ul <- function(x){
x %>%
html_children() %>%
map_chr(function(x){
x %>% html_text()
})
}
bs_shownotes_neta <- function(nodes, i){
ret <- c()
repeat {
if (i > length(nodes)) {
break
}
nd <- nodes[i]
nd_type <- html_name(nd)
nd_id <- html_attr(nd, "id", default = "")
if (nd_id == "提供" | nd_id == "主な機材"){
break
}
if (nd_type == "ul" | nd_type == "ol") {
ret <- append(ret, bs_nd_ul(nd))
}
i <- i + 1
}
if (is.null(ret)){
ret <- NA_character_
}
list("ret" = ret, "i" = i)
}
kw_neta <- c("今日のネタ", "今回の内容", "今週のニュース", "関連リンク", "スポンサーリンク", "今週のスペシャル", "今週のネタリンク", "今週のお買い物", "今週のネタ", "今週のトピック", "一般ニュース", "参考リンク")
bs_shownotes_text <- function(x){
ret <- c()
nodes <- x %>%
read_html() %>%
html_node(xpath=".//body") %>%
html_children()
i <- 1
repeat {
if (i > length(nodes)) {
break
}
nd <- nodes[i]
nd_type <- html_name(nd)
nd_id <-html_attr(nd, "id", "")
if (nd_id %in% kw_neta | str_detect(nd_id, "^今週のゲスト") | str_detect(nd_id, "^今週のスペシャル")) {
i <- i + 1
neta <- bs_shownotes_neta(nodes, i)
ret <- neta$ret
i <- neta$i
}
i <- i + 1
}
if (is.null(ret)){
ret <- NA_character_
}
ret
}
# Shownotes
# A side only
# #165, #168, #169, #170, #172: Medium
# Ignore embedded YouTube
bs_shownotes_neta_medium <- function(x){
url <- shownotes_medium_url[x]
if (is.na(url)){
return(list(c(NA_character_)))
}
h <- read_html(url)
ret <- h %>%
html_node(xpath="//body//main//article//section") %>%
html_node(css=".section-content") %>%
html_node(css=".section-inner") %>%
html_nodes(xpath="ul") %>%
map(bs_nd_ul) %>%
flatten_chr()
list(ret)
}
shownotes_medium_url <- c(
"165" = "https://blog.backspace.fm/backspace-fm-165%E3%83%AA%E3%83%B3%E3%82%AF%E9%9B%86-%E6%9A%AB%E5%AE%9A%E7%94%A8-fa0846ff76ea",
"168" = "https://blog.backspace.fm/168-pixel-%E3%83%89%E3%83%AD%E3%83%BC%E3%83%B3-gopro%E5%A5%B3%E5%AD%90-%E3%83%AA%E3%83%B3%E3%82%AF%E9%9B%86-527426afc5b7",
"169" = "https://blog.backspace.fm/backspace-fm-169-%E5%A5%B3%E5%AD%90%E9%AB%98%E7%94%9F%E3%81%AE%E5%AE%B6%E5%BA%AD%E6%95%99%E5%B8%AB%E3%81%A8%E3%81%AA%E3%81%A3%E3%81%9F%E3%83%89%E3%83%AA%E3%82%AD%E3%83%B3-%E8%96%84%E3%81%8F%E9%80%8F%E3%81%91%E3%81%A6%E3%82%8B%E3%83%AA%E3%82%A2%E3%83%AB-%E6%81%8B%E3%81%A7%E3%81%8D%E3%82%8B%E3%81%A8%E6%80%9D%E3%81%86-%E3%83%AA%E3%83%B3%E3%82%AF%E9%9B%86-938032dec588",
"170" = "https://blog.backspace.fm/170-%E5%B0%8F%E5%B3%B6%E7%A0%94%E4%BA%BA%E3%81%95%E3%82%93%E3%81%8C%E8%AA%9E%E3%82%8B%E3%82%B2%E3%83%BC%E3%83%A0%E3%81%AE%E3%83%A2%E3%83%BC%E3%82%B7%E3%83%A7%E3%83%B3%E3%81%A8youtube%E3%83%AA%E3%83%B3%E3%82%AF%E9%9B%86-deed6b393b8c",
"172" = "https://blog.backspace.fm/backspace-fm-172-macbook-pro-surface-studio%E7%8F%BE%E7%89%A9%E3%82%92%E8%A7%A6%E3%81%A3%E3%81%A6%E3%81%8D%E3%81%9F%E3%83%89%E3%83%AA%E3%82%AD%E3%83%B3%E3%81%8C%E8%AA%9E%E3%82%8Badobe-max%E8%A3%8F%E8%A9%B1-%E3%83%AA%E3%83%B3%E3%82%AF%E9%9B%86-61c0b66cddb4"
)
# #075: Recap of 2014, ignore
# #051, #055: No shownotes
dd <- d %>%
dplyr::filter(side == "A") %>%
mutate(shownotes_text = map(shownotes, bs_shownotes_text)) %>%
rowwise() %>%
mutate(shownotes_text = if_else(
ep %in% c("165", "168", "169", "170", "172"), bs_shownotes_neta_medium(ep), list(shownotes_text)))
dd %>%
select(ep, shownotes_text) %>%
unnest() %>%
write_csv("backspace_shownotes.csv")
library("RMeCab")
words <- dd %>%
purrr::pluck("shownotes_text") %>%
map(function(x){
# Remove emoji
x %>% emo::ji_replace_all("") %>%
map_if(!is.na(.), function(y){
y %>%
enc2native() %>%
RMeCabC()
}) %>% flatten()
}) %>% flatten()
words_ <- unlist(words)
words_df <- data_frame(pos = names(words_), word = words_)
words_df %>%
dplyr::filter(pos == "名詞") %>%
group_by(word) %>%
summarise(n = n()) %>%
arrange(desc(n)) %>%
write_csv("words.csv")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment