whatalnk/backspacefm_stat.R

## backspacefm_stat.R
library("dplyr")
library("tidyr")
library("stringr")
library("lubridate")
library("purrr")
library("readr")
library("ggplot2")
library("rvest")
library("selectr")
library("httr")
library("xml2")
library("conflicted")

# Guest information from Guru wiki
# Guest and host
bs_people <- function(x){
  html_text(x) %>%
    str_match_all("\\([^()]+\\)") %>%
    purrr::pluck(1) %>% last() %>%
    str_replace_all(c("\\("="", "\\)"=""))
}


# episode Number
bs_epNum_html <- function(x){
  x %>%
    html_node("a") %>%
    html_attr("href") %>%
    parse_url() %>%
    purrr::pluck("path") %>%
    strsplit("/") %>%
    purrr::pluck(1, 2)
}

# Guru wiki
html <- read_html("https://github.com/drikin/guruwiki/wiki/backspace.fm-%E3%82%A8%E3%83%94%E3%82%BD%E3%83%BC%E3%83%89%E4%B8%80%E8%A6%A7")

d_people <- html %>%
  querySelector(".markdown-body") %>%
  html_nodes("li") %>%
  map(function(x){
    people <- bs_people(x)
    ep <- bs_epNum_html(x)
    list("ep" = ep, "people" = people)
  }) %>%
  bind_rows()

# Episode information from rss feed
# Epsode number
bs_epNum <- function(x){
  xml_find_first(x, ".//guid") %>%
    xml_text() %>%
    parse_url() %>%
    purrr::pluck("path") %>%
    strsplit("/") %>%
    purrr::pluck(1, 2)
}

# Title
bs_title <- function(x){
  xml_find_first(x, ".//title") %>%
    xml_text()
}

# Publish date
bs_pubDate <- function(x){
  xml_find_first(x, ".//pubDate") %>%
    xml_text() %>%
    parse_date_time(orders="a, d b Y H:M:S z", locale="C")
}

# Duration
bs_duration_text <- function(x){
  xml_find_first(x, ".//itunes:duration") %>%
    xml_text()
}

bs_duration <- function(x){
  txt <- bs_duration_text(x)
  suppressWarnings(ret <- hms(txt))
  if (is.na(ret)) {
    ret <- ms(txt)
  }
  ret %>%
    as.duration() %>%
    as.numeric("seconds")
}

#' x nodes
bs_shownotes_html <- function(x){
  x  %>%
    xml_find_first(".//description") %>%
    xml_text()
}

x <- xml2::read_xml("http://feeds.backspace.fm/backspacefm")

d <- xml_find_all(x, ".//item") %>%
  map(function(x){
    ep <- bs_epNum(x)
    title <- bs_title(x)
    dr <- bs_duration(x)
    dr_text <-bs_duration_text(x)
    pd <- bs_pubDate(x)
    shownotes <- bs_shownotes_html(x)
    list("ep" = ep,
         "title" = title,
         "publishedDate" = pd,
         "duration_sec"=dr,
         "shownotes" = shownotes)
  }) %>%
  bind_rows() %>%
  # Fix ep
  mutate(ep=case_when(
    str_detect(title, "G-side") & !str_detect(ep, "g") ~ sprintf("g%03d", as.integer(ep)),
    TRUE ~ ep
  )) %>%
  mutate(side=case_when(
    str_detect(ep, "vidcon") ~ "Vidcon",
    str_detect(ep, "mitaimon") ~ "Mitaimon",
    str_detect(ep, "basuke") ~ "Basuke",
    str_detect(ep, "d") ~ "Danbo",
    str_detect(ep, "v") ~ "Vim",
    str_detect(ep, "z") ~ "Zenji",
    str_detect(ep, "t") ~ "T",
    str_detect(ep, "g") ~ "G",
    str_detect(ep, "b") ~ "B",
    TRUE ~ "A"
  ))

joined <- full_join(d_people, d, by="ep")

by_people <- joined %>%
  select(ep, people, duration_sec) %>%
  transform(people = str_split(people, ",")) %>%
  unnest(people) %>%
  mutate(people = str_trim(people))

by_people_summ <- by_people %>%
  dplyr::filter(!is.na(people)) %>%
  group_by(people) %>%
  summarise(n = n(), total_time = sum(duration_sec)) %>%
  arrange(desc(n)) %>%
  ungroup()

joined %>%
  arrange(ep) %>%
  mutate(publishedDate = format(publishedDate, tz="UTC")) %>%
  write_csv("backspacefm.csv", na="")

by_people_summ %>%
  write_csv("backspacefm_by_people.csv")

# shownotes
bs_nd_ul <- function(x){
  x %>%
    html_children() %>%
    map_chr(function(x){
      x %>% html_text()
    })
}

bs_shownotes_neta <- function(nodes, i){
  ret <- c()
  repeat {
    if (i > length(nodes)) {
      break
    }
    nd <- nodes[i]
    nd_type <- html_name(nd)
    nd_id <- html_attr(nd, "id", default = "")
    if (nd_id == "提供" | nd_id == "主な機材"){
      break
    }
    if (nd_type == "ul" | nd_type == "ol") {
      ret <- append(ret, bs_nd_ul(nd))
    }
    i <- i + 1
  }
  if (is.null(ret)){
    ret <- NA_character_
  }
  list("ret" = ret, "i" = i)
}

kw_neta <- c("今日のネタ", "今回の内容", "今週のニュース", "関連リンク", "スポンサーリンク", "今週のスペシャル", "今週のネタリンク", "今週のお買い物", "今週のネタ", "今週のトピック", "一般ニュース", "参考リンク")
bs_shownotes_text <- function(x){
  ret <- c()
  nodes <- x %>%
    read_html() %>%
    html_node(xpath=".//body") %>%
    html_children()
  i <- 1
  repeat {
    if (i > length(nodes)) {
      break
    }
    nd <- nodes[i]
    nd_type <- html_name(nd)
    nd_id <-html_attr(nd, "id", "")
    if (nd_id %in% kw_neta | str_detect(nd_id, "^今週のゲスト") | str_detect(nd_id, "^今週のスペシャル")) {
      i <- i + 1
      neta <- bs_shownotes_neta(nodes, i)
      ret <- neta$ret
      i <- neta$i
    }
    i <- i + 1
  }
  if (is.null(ret)){
    ret <- NA_character_
  }
  ret
}

# Shownotes
# A side only
# #165, #168, #169, #170, #172: Medium
# Ignore embedded YouTube
bs_shownotes_neta_medium <- function(x){
  url <- shownotes_medium_url[x]
  if (is.na(url)){
    return(list(c(NA_character_)))
  }
  h <- read_html(url)
  ret <- h %>%
    html_node(xpath="//body//main//article//section") %>%
    html_node(css=".section-content") %>%
    html_node(css=".section-inner") %>%
    html_nodes(xpath="ul") %>%
    map(bs_nd_ul) %>%
    flatten_chr()
  list(ret)
}

shownotes_medium_url <- c(
  "165" = "https://blog.backspace.fm/backspace-fm-165%E3%83%AA%E3%83%B3%E3%82%AF%E9%9B%86-%E6%9A%AB%E5%AE%9A%E7%94%A8-fa0846ff76ea",
  "168" = "https://blog.backspace.fm/168-pixel-%E3%83%89%E3%83%AD%E3%83%BC%E3%83%B3-gopro%E5%A5%B3%E5%AD%90-%E3%83%AA%E3%83%B3%E3%82%AF%E9%9B%86-527426afc5b7",
  "169" = "https://blog.backspace.fm/backspace-fm-169-%E5%A5%B3%E5%AD%90%E9%AB%98%E7%94%9F%E3%81%AE%E5%AE%B6%E5%BA%AD%E6%95%99%E5%B8%AB%E3%81%A8%E3%81%AA%E3%81%A3%E3%81%9F%E3%83%89%E3%83%AA%E3%82%AD%E3%83%B3-%E8%96%84%E3%81%8F%E9%80%8F%E3%81%91%E3%81%A6%E3%82%8B%E3%83%AA%E3%82%A2%E3%83%AB-%E6%81%8B%E3%81%A7%E3%81%8D%E3%82%8B%E3%81%A8%E6%80%9D%E3%81%86-%E3%83%AA%E3%83%B3%E3%82%AF%E9%9B%86-938032dec588",
  "170" = "https://blog.backspace.fm/170-%E5%B0%8F%E5%B3%B6%E7%A0%94%E4%BA%BA%E3%81%95%E3%82%93%E3%81%8C%E8%AA%9E%E3%82%8B%E3%82%B2%E3%83%BC%E3%83%A0%E3%81%AE%E3%83%A2%E3%83%BC%E3%82%B7%E3%83%A7%E3%83%B3%E3%81%A8youtube%E3%83%AA%E3%83%B3%E3%82%AF%E9%9B%86-deed6b393b8c",
  "172" = "https://blog.backspace.fm/backspace-fm-172-macbook-pro-surface-studio%E7%8F%BE%E7%89%A9%E3%82%92%E8%A7%A6%E3%81%A3%E3%81%A6%E3%81%8D%E3%81%9F%E3%83%89%E3%83%AA%E3%82%AD%E3%83%B3%E3%81%8C%E8%AA%9E%E3%82%8Badobe-max%E8%A3%8F%E8%A9%B1-%E3%83%AA%E3%83%B3%E3%82%AF%E9%9B%86-61c0b66cddb4"
)


# #075: Recap of 2014, ignore
# #051, #055: No shownotes
dd <- d %>%
  dplyr::filter(side == "A") %>%
  mutate(shownotes_text = map(shownotes, bs_shownotes_text)) %>%
  rowwise() %>%
  mutate(shownotes_text = if_else(
    ep %in% c("165", "168", "169", "170", "172"), bs_shownotes_neta_medium(ep), list(shownotes_text)))

dd %>%
  select(ep, shownotes_text) %>%
  unnest() %>%
  write_csv("backspace_shownotes.csv")

library("RMeCab")
words <- dd %>%
  purrr::pluck("shownotes_text") %>%
  map(function(x){
    # Remove emoji
    x %>% emo::ji_replace_all("") %>%
      map_if(!is.na(.), function(y){
        y %>%
          enc2native() %>%
          RMeCabC()
      }) %>% flatten()
  }) %>% flatten()

words_ <- unlist(words)
words_df <- data_frame(pos = names(words_), word = words_)
words_df %>%
  dplyr::filter(pos == "名詞") %>%
  group_by(word) %>%
  summarise(n = n()) %>%
  arrange(desc(n)) %>%
  write_csv("words.csv")
	library("dplyr")
	library("tidyr")
	library("stringr")
	library("lubridate")
	library("purrr")
	library("readr")
	library("ggplot2")
	library("rvest")
	library("selectr")
	library("httr")
	library("xml2")
	library("conflicted")

	# Guest information from Guru wiki
	# Guest and host
	bs_people <- function(x){
	html_text(x) %>%
	str_match_all("\\([^()]+\\)") %>%
	purrr::pluck(1) %>% last() %>%
	str_replace_all(c("\\("="", "\\)"=""))
	}


	# episode Number
	bs_epNum_html <- function(x){
	x %>%
	html_node("a") %>%
	html_attr("href") %>%
	parse_url() %>%
	purrr::pluck("path") %>%
	strsplit("/") %>%
	purrr::pluck(1, 2)
	}

	# Guru wiki
	html <- read_html("https://github.com/drikin/guruwiki/wiki/backspace.fm-%E3%82%A8%E3%83%94%E3%82%BD%E3%83%BC%E3%83%89%E4%B8%80%E8%A6%A7")

	d_people <- html %>%
	querySelector(".markdown-body") %>%
	html_nodes("li") %>%
	map(function(x){
	people <- bs_people(x)
	ep <- bs_epNum_html(x)
	list("ep" = ep, "people" = people)
	}) %>%
	bind_rows()

	# Episode information from rss feed
	# Epsode number
	bs_epNum <- function(x){
	xml_find_first(x, ".//guid") %>%
	xml_text() %>%
	parse_url() %>%
	purrr::pluck("path") %>%
	strsplit("/") %>%
	purrr::pluck(1, 2)
	}

	# Title
	bs_title <- function(x){
	xml_find_first(x, ".//title") %>%
	xml_text()
	}

	# Publish date
	bs_pubDate <- function(x){
	xml_find_first(x, ".//pubDate") %>%
	xml_text() %>%
	parse_date_time(orders="a, d b Y H:M:S z", locale="C")
	}

	# Duration
	bs_duration_text <- function(x){
	xml_find_first(x, ".//itunes:duration") %>%
	xml_text()
	}

	bs_duration <- function(x){
	txt <- bs_duration_text(x)
	suppressWarnings(ret <- hms(txt))
	if (is.na(ret)) {
	ret <- ms(txt)
	}
	ret %>%
	as.duration() %>%
	as.numeric("seconds")
	}

	#' x nodes
	bs_shownotes_html <- function(x){
	x %>%
	xml_find_first(".//description") %>%
	xml_text()
	}

	x <- xml2::read_xml("http://feeds.backspace.fm/backspacefm")

	d <- xml_find_all(x, ".//item") %>%
	map(function(x){
	ep <- bs_epNum(x)
	title <- bs_title(x)
	dr <- bs_duration(x)
	dr_text <-bs_duration_text(x)
	pd <- bs_pubDate(x)
	shownotes <- bs_shownotes_html(x)
	list("ep" = ep,
	"title" = title,
	"publishedDate" = pd,
	"duration_sec"=dr,
	"shownotes" = shownotes)
	}) %>%
	bind_rows() %>%
	# Fix ep
	mutate(ep=case_when(
	str_detect(title, "G-side") & !str_detect(ep, "g") ~ sprintf("g%03d", as.integer(ep)),
	TRUE ~ ep
	)) %>%
	mutate(side=case_when(
	str_detect(ep, "vidcon") ~ "Vidcon",
	str_detect(ep, "mitaimon") ~ "Mitaimon",
	str_detect(ep, "basuke") ~ "Basuke",
	str_detect(ep, "d") ~ "Danbo",
	str_detect(ep, "v") ~ "Vim",
	str_detect(ep, "z") ~ "Zenji",
	str_detect(ep, "t") ~ "T",
	str_detect(ep, "g") ~ "G",
	str_detect(ep, "b") ~ "B",
	TRUE ~ "A"
	))

	joined <- full_join(d_people, d, by="ep")

	by_people <- joined %>%
	select(ep, people, duration_sec) %>%
	transform(people = str_split(people, ",")) %>%
	unnest(people) %>%
	mutate(people = str_trim(people))

	by_people_summ <- by_people %>%
	dplyr::filter(!is.na(people)) %>%
	group_by(people) %>%
	summarise(n = n(), total_time = sum(duration_sec)) %>%
	arrange(desc(n)) %>%
	ungroup()

	joined %>%
	arrange(ep) %>%
	mutate(publishedDate = format(publishedDate, tz="UTC")) %>%
	write_csv("backspacefm.csv", na="")

	by_people_summ %>%
	write_csv("backspacefm_by_people.csv")

	# shownotes
	bs_nd_ul <- function(x){
	x %>%
	html_children() %>%
	map_chr(function(x){
	x %>% html_text()
	})
	}

	bs_shownotes_neta <- function(nodes, i){
	ret <- c()
	repeat {
	if (i > length(nodes)) {
	break
	}
	nd <- nodes[i]
	nd_type <- html_name(nd)
	nd_id <- html_attr(nd, "id", default = "")
	if (nd_id == "提供" \| nd_id == "主な機材"){
	break
	}
	if (nd_type == "ul" \| nd_type == "ol") {
	ret <- append(ret, bs_nd_ul(nd))
	}
	i <- i + 1
	}
	if (is.null(ret)){
	ret <- NA_character_
	}
	list("ret" = ret, "i" = i)
	}

	kw_neta <- c("今日のネタ", "今回の内容", "今週のニュース", "関連リンク", "スポンサーリンク", "今週のスペシャル", "今週のネタリンク", "今週のお買い物", "今週のネタ", "今週のトピック", "一般ニュース", "参考リンク")
	bs_shownotes_text <- function(x){
	ret <- c()
	nodes <- x %>%
	read_html() %>%
	html_node(xpath=".//body") %>%
	html_children()
	i <- 1
	repeat {
	if (i > length(nodes)) {
	break
	}
	nd <- nodes[i]
	nd_type <- html_name(nd)
	nd_id <-html_attr(nd, "id", "")
	if (nd_id %in% kw_neta \| str_detect(nd_id, "^今週のゲスト") \| str_detect(nd_id, "^今週のスペシャル")) {
	i <- i + 1
	neta <- bs_shownotes_neta(nodes, i)
	ret <- neta$ret
	i <- neta$i
	}
	i <- i + 1
	}
	if (is.null(ret)){
	ret <- NA_character_
	}
	ret
	}

	# Shownotes
	# A side only
	# #165, #168, #169, #170, #172: Medium
	# Ignore embedded YouTube
	bs_shownotes_neta_medium <- function(x){
	url <- shownotes_medium_url[x]
	if (is.na(url)){
	return(list(c(NA_character_)))
	}
	h <- read_html(url)
	ret <- h %>%
	html_node(xpath="//body//main//article//section") %>%
	html_node(css=".section-content") %>%
	html_node(css=".section-inner") %>%
	html_nodes(xpath="ul") %>%
	map(bs_nd_ul) %>%
	flatten_chr()
	list(ret)
	}

	shownotes_medium_url <- c(
	"165" = "https://blog.backspace.fm/backspace-fm-165%E3%83%AA%E3%83%B3%E3%82%AF%E9%9B%86-%E6%9A%AB%E5%AE%9A%E7%94%A8-fa0846ff76ea",
	"168" = "https://blog.backspace.fm/168-pixel-%E3%83%89%E3%83%AD%E3%83%BC%E3%83%B3-gopro%E5%A5%B3%E5%AD%90-%E3%83%AA%E3%83%B3%E3%82%AF%E9%9B%86-527426afc5b7",
	"169" = "https://blog.backspace.fm/backspace-fm-169-%E5%A5%B3%E5%AD%90%E9%AB%98%E7%94%9F%E3%81%AE%E5%AE%B6%E5%BA%AD%E6%95%99%E5%B8%AB%E3%81%A8%E3%81%AA%E3%81%A3%E3%81%9F%E3%83%89%E3%83%AA%E3%82%AD%E3%83%B3-%E8%96%84%E3%81%8F%E9%80%8F%E3%81%91%E3%81%A6%E3%82%8B%E3%83%AA%E3%82%A2%E3%83%AB-%E6%81%8B%E3%81%A7%E3%81%8D%E3%82%8B%E3%81%A8%E6%80%9D%E3%81%86-%E3%83%AA%E3%83%B3%E3%82%AF%E9%9B%86-938032dec588",
	"170" = "https://blog.backspace.fm/170-%E5%B0%8F%E5%B3%B6%E7%A0%94%E4%BA%BA%E3%81%95%E3%82%93%E3%81%8C%E8%AA%9E%E3%82%8B%E3%82%B2%E3%83%BC%E3%83%A0%E3%81%AE%E3%83%A2%E3%83%BC%E3%82%B7%E3%83%A7%E3%83%B3%E3%81%A8youtube%E3%83%AA%E3%83%B3%E3%82%AF%E9%9B%86-deed6b393b8c",
	"172" = "https://blog.backspace.fm/backspace-fm-172-macbook-pro-surface-studio%E7%8F%BE%E7%89%A9%E3%82%92%E8%A7%A6%E3%81%A3%E3%81%A6%E3%81%8D%E3%81%9F%E3%83%89%E3%83%AA%E3%82%AD%E3%83%B3%E3%81%8C%E8%AA%9E%E3%82%8Badobe-max%E8%A3%8F%E8%A9%B1-%E3%83%AA%E3%83%B3%E3%82%AF%E9%9B%86-61c0b66cddb4"
	)


	# #075: Recap of 2014, ignore
	# #051, #055: No shownotes
	dd <- d %>%
	dplyr::filter(side == "A") %>%
	mutate(shownotes_text = map(shownotes, bs_shownotes_text)) %>%
	rowwise() %>%
	mutate(shownotes_text = if_else(
	ep %in% c("165", "168", "169", "170", "172"), bs_shownotes_neta_medium(ep), list(shownotes_text)))

	dd %>%
	select(ep, shownotes_text) %>%
	unnest() %>%
	write_csv("backspace_shownotes.csv")

	library("RMeCab")
	words <- dd %>%
	purrr::pluck("shownotes_text") %>%
	map(function(x){
	# Remove emoji
	x %>% emo::ji_replace_all("") %>%
	map_if(!is.na(.), function(y){
	y %>%
	enc2native() %>%
	RMeCabC()
	}) %>% flatten()
	}) %>% flatten()

	words_ <- unlist(words)
	words_df <- data_frame(pos = names(words_), word = words_)
	words_df %>%
	dplyr::filter(pos == "名詞") %>%
	group_by(word) %>%
	summarise(n = n()) %>%
	arrange(desc(n)) %>%
	write_csv("words.csv")