Last active
January 29, 2019 02:18
-
-
Save whatalnk/3e3b80b612b513afbb08b0f6d665ceb1 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library("dplyr") | |
library("tidyr") | |
library("stringr") | |
library("lubridate") | |
library("purrr") | |
library("readr") | |
library("ggplot2") | |
library("rvest") | |
library("selectr") | |
library("httr") | |
library("xml2") | |
library("conflicted") | |
# Guest information from Guru wiki | |
# Guest and host | |
bs_people <- function(x){ | |
html_text(x) %>% | |
str_match_all("\\([^()]+\\)") %>% | |
purrr::pluck(1) %>% last() %>% | |
str_replace_all(c("\\("="", "\\)"="")) | |
} | |
# episode Number | |
bs_epNum_html <- function(x){ | |
x %>% | |
html_node("a") %>% | |
html_attr("href") %>% | |
parse_url() %>% | |
purrr::pluck("path") %>% | |
strsplit("/") %>% | |
purrr::pluck(1, 2) | |
} | |
# Guru wiki | |
html <- read_html("https://github.com/drikin/guruwiki/wiki/backspace.fm-%E3%82%A8%E3%83%94%E3%82%BD%E3%83%BC%E3%83%89%E4%B8%80%E8%A6%A7") | |
d_people <- html %>% | |
querySelector(".markdown-body") %>% | |
html_nodes("li") %>% | |
map(function(x){ | |
people <- bs_people(x) | |
ep <- bs_epNum_html(x) | |
list("ep" = ep, "people" = people) | |
}) %>% | |
bind_rows() | |
# Episode information from rss feed | |
# Epsode number | |
bs_epNum <- function(x){ | |
xml_find_first(x, ".//guid") %>% | |
xml_text() %>% | |
parse_url() %>% | |
purrr::pluck("path") %>% | |
strsplit("/") %>% | |
purrr::pluck(1, 2) | |
} | |
# Title | |
bs_title <- function(x){ | |
xml_find_first(x, ".//title") %>% | |
xml_text() | |
} | |
# Publish date | |
bs_pubDate <- function(x){ | |
xml_find_first(x, ".//pubDate") %>% | |
xml_text() %>% | |
parse_date_time(orders="a, d b Y H:M:S z", locale="C") | |
} | |
# Duration | |
bs_duration_text <- function(x){ | |
xml_find_first(x, ".//itunes:duration") %>% | |
xml_text() | |
} | |
bs_duration <- function(x){ | |
txt <- bs_duration_text(x) | |
suppressWarnings(ret <- hms(txt)) | |
if (is.na(ret)) { | |
ret <- ms(txt) | |
} | |
ret %>% | |
as.duration() %>% | |
as.numeric("seconds") | |
} | |
#' x nodes | |
bs_shownotes_html <- function(x){ | |
x %>% | |
xml_find_first(".//description") %>% | |
xml_text() | |
} | |
x <- xml2::read_xml("http://feeds.backspace.fm/backspacefm") | |
d <- xml_find_all(x, ".//item") %>% | |
map(function(x){ | |
ep <- bs_epNum(x) | |
title <- bs_title(x) | |
dr <- bs_duration(x) | |
dr_text <-bs_duration_text(x) | |
pd <- bs_pubDate(x) | |
shownotes <- bs_shownotes_html(x) | |
list("ep" = ep, | |
"title" = title, | |
"publishedDate" = pd, | |
"duration_sec"=dr, | |
"shownotes" = shownotes) | |
}) %>% | |
bind_rows() %>% | |
# Fix ep | |
mutate(ep=case_when( | |
str_detect(title, "G-side") & !str_detect(ep, "g") ~ sprintf("g%03d", as.integer(ep)), | |
TRUE ~ ep | |
)) %>% | |
mutate(side=case_when( | |
str_detect(ep, "vidcon") ~ "Vidcon", | |
str_detect(ep, "mitaimon") ~ "Mitaimon", | |
str_detect(ep, "basuke") ~ "Basuke", | |
str_detect(ep, "d") ~ "Danbo", | |
str_detect(ep, "v") ~ "Vim", | |
str_detect(ep, "z") ~ "Zenji", | |
str_detect(ep, "t") ~ "T", | |
str_detect(ep, "g") ~ "G", | |
str_detect(ep, "b") ~ "B", | |
TRUE ~ "A" | |
)) | |
joined <- full_join(d_people, d, by="ep") | |
by_people <- joined %>% | |
select(ep, people, duration_sec) %>% | |
transform(people = str_split(people, ",")) %>% | |
unnest(people) %>% | |
mutate(people = str_trim(people)) | |
by_people_summ <- by_people %>% | |
dplyr::filter(!is.na(people)) %>% | |
group_by(people) %>% | |
summarise(n = n(), total_time = sum(duration_sec)) %>% | |
arrange(desc(n)) %>% | |
ungroup() | |
joined %>% | |
arrange(ep) %>% | |
mutate(publishedDate = format(publishedDate, tz="UTC")) %>% | |
write_csv("backspacefm.csv", na="") | |
by_people_summ %>% | |
write_csv("backspacefm_by_people.csv") | |
# shownotes | |
bs_nd_ul <- function(x){ | |
x %>% | |
html_children() %>% | |
map_chr(function(x){ | |
x %>% html_text() | |
}) | |
} | |
bs_shownotes_neta <- function(nodes, i){ | |
ret <- c() | |
repeat { | |
if (i > length(nodes)) { | |
break | |
} | |
nd <- nodes[i] | |
nd_type <- html_name(nd) | |
nd_id <- html_attr(nd, "id", default = "") | |
if (nd_id == "提供" | nd_id == "主な機材"){ | |
break | |
} | |
if (nd_type == "ul" | nd_type == "ol") { | |
ret <- append(ret, bs_nd_ul(nd)) | |
} | |
i <- i + 1 | |
} | |
if (is.null(ret)){ | |
ret <- NA_character_ | |
} | |
list("ret" = ret, "i" = i) | |
} | |
kw_neta <- c("今日のネタ", "今回の内容", "今週のニュース", "関連リンク", "スポンサーリンク", "今週のスペシャル", "今週のネタリンク", "今週のお買い物", "今週のネタ", "今週のトピック", "一般ニュース", "参考リンク") | |
bs_shownotes_text <- function(x){ | |
ret <- c() | |
nodes <- x %>% | |
read_html() %>% | |
html_node(xpath=".//body") %>% | |
html_children() | |
i <- 1 | |
repeat { | |
if (i > length(nodes)) { | |
break | |
} | |
nd <- nodes[i] | |
nd_type <- html_name(nd) | |
nd_id <-html_attr(nd, "id", "") | |
if (nd_id %in% kw_neta | str_detect(nd_id, "^今週のゲスト") | str_detect(nd_id, "^今週のスペシャル")) { | |
i <- i + 1 | |
neta <- bs_shownotes_neta(nodes, i) | |
ret <- neta$ret | |
i <- neta$i | |
} | |
i <- i + 1 | |
} | |
if (is.null(ret)){ | |
ret <- NA_character_ | |
} | |
ret | |
} | |
# Shownotes | |
# A side only | |
# #165, #168, #169, #170, #172: Medium | |
# Ignore embedded YouTube | |
bs_shownotes_neta_medium <- function(x){ | |
url <- shownotes_medium_url[x] | |
if (is.na(url)){ | |
return(list(c(NA_character_))) | |
} | |
h <- read_html(url) | |
ret <- h %>% | |
html_node(xpath="//body//main//article//section") %>% | |
html_node(css=".section-content") %>% | |
html_node(css=".section-inner") %>% | |
html_nodes(xpath="ul") %>% | |
map(bs_nd_ul) %>% | |
flatten_chr() | |
list(ret) | |
} | |
shownotes_medium_url <- c( | |
"165" = "https://blog.backspace.fm/backspace-fm-165%E3%83%AA%E3%83%B3%E3%82%AF%E9%9B%86-%E6%9A%AB%E5%AE%9A%E7%94%A8-fa0846ff76ea", | |
"168" = "https://blog.backspace.fm/168-pixel-%E3%83%89%E3%83%AD%E3%83%BC%E3%83%B3-gopro%E5%A5%B3%E5%AD%90-%E3%83%AA%E3%83%B3%E3%82%AF%E9%9B%86-527426afc5b7", | |
"169" = "https://blog.backspace.fm/backspace-fm-169-%E5%A5%B3%E5%AD%90%E9%AB%98%E7%94%9F%E3%81%AE%E5%AE%B6%E5%BA%AD%E6%95%99%E5%B8%AB%E3%81%A8%E3%81%AA%E3%81%A3%E3%81%9F%E3%83%89%E3%83%AA%E3%82%AD%E3%83%B3-%E8%96%84%E3%81%8F%E9%80%8F%E3%81%91%E3%81%A6%E3%82%8B%E3%83%AA%E3%82%A2%E3%83%AB-%E6%81%8B%E3%81%A7%E3%81%8D%E3%82%8B%E3%81%A8%E6%80%9D%E3%81%86-%E3%83%AA%E3%83%B3%E3%82%AF%E9%9B%86-938032dec588", | |
"170" = "https://blog.backspace.fm/170-%E5%B0%8F%E5%B3%B6%E7%A0%94%E4%BA%BA%E3%81%95%E3%82%93%E3%81%8C%E8%AA%9E%E3%82%8B%E3%82%B2%E3%83%BC%E3%83%A0%E3%81%AE%E3%83%A2%E3%83%BC%E3%82%B7%E3%83%A7%E3%83%B3%E3%81%A8youtube%E3%83%AA%E3%83%B3%E3%82%AF%E9%9B%86-deed6b393b8c", | |
"172" = "https://blog.backspace.fm/backspace-fm-172-macbook-pro-surface-studio%E7%8F%BE%E7%89%A9%E3%82%92%E8%A7%A6%E3%81%A3%E3%81%A6%E3%81%8D%E3%81%9F%E3%83%89%E3%83%AA%E3%82%AD%E3%83%B3%E3%81%8C%E8%AA%9E%E3%82%8Badobe-max%E8%A3%8F%E8%A9%B1-%E3%83%AA%E3%83%B3%E3%82%AF%E9%9B%86-61c0b66cddb4" | |
) | |
# #075: Recap of 2014, ignore | |
# #051, #055: No shownotes | |
dd <- d %>% | |
dplyr::filter(side == "A") %>% | |
mutate(shownotes_text = map(shownotes, bs_shownotes_text)) %>% | |
rowwise() %>% | |
mutate(shownotes_text = if_else( | |
ep %in% c("165", "168", "169", "170", "172"), bs_shownotes_neta_medium(ep), list(shownotes_text))) | |
dd %>% | |
select(ep, shownotes_text) %>% | |
unnest() %>% | |
write_csv("backspace_shownotes.csv") | |
library("RMeCab") | |
words <- dd %>% | |
purrr::pluck("shownotes_text") %>% | |
map(function(x){ | |
# Remove emoji | |
x %>% emo::ji_replace_all("") %>% | |
map_if(!is.na(.), function(y){ | |
y %>% | |
enc2native() %>% | |
RMeCabC() | |
}) %>% flatten() | |
}) %>% flatten() | |
words_ <- unlist(words) | |
words_df <- data_frame(pos = names(words_), word = words_) | |
words_df %>% | |
dplyr::filter(pos == "名詞") %>% | |
group_by(word) %>% | |
summarise(n = n()) %>% | |
arrange(desc(n)) %>% | |
write_csv("words.csv") | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment