Skip to content

Instantly share code, notes, and snippets.

@dichika
Created December 13, 2015 17:13
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save dichika/f5a1a143932286d70600 to your computer and use it in GitHub Desktop.
Save dichika/f5a1a143932286d70600 to your computer and use it in GitHub Desktop.
さだまさし
# さだまさし
library("rvest")
library("magrittr")
library("stringr")
library("dplyr")
library("broom")
urls <- paste0("http://www.livefans.jp/search/index/page:",
1:26,
"?option=2&keyword=%E3%81%95%E3%81%A0%E3%81%BE%E3%81%95%E3%81%97&setlist=1&genre=all&sort=e1#schBox"
)
# ライブ一覧URLを取得
data_live <- NULL
for(u in urls){
h <- read_html(u)
artists <- h %>% html_nodes(".guestArtist") %>% html_text()
artists <- artists[grepl("\\[出演\\]", artists)] # ゲストを除く
urls <- h %>% html_nodes(".ticon a") %>% html_attr("href")
icons <- h %>% html_nodes(".ticon a") %>% html_text()
urls <- urls[icons=="セットリスト"] # セットリストのURLに絞る
data_live <- rbind(data_live,
data.frame(artiists=artists,
urls=urls,
stringsAsFactors = FALSE)
)
Sys.sleep(1)
}
# セットリストを取得
data_live <- data_live[grepl("\\[出演\\] さだまさし", data_live$artiists),]
origin <- "http://www.livefans.jp"
urls_live <- paste0(origin, data_live$urls)
data_setlist <- NULL
for(u2 in urls_live){
h2 <- read_html(u2)
livename <- h2 %>% html_nodes(".dataBlock") %>% html_nodes(".liveName") %>% html_text()
if(length(livename)==0){
next # 複数アーティストの場合liveName2のブロックになるのでスキップ
}
date <- h2 %>% html_nodes(".dataBlock") %>% html_nodes(".date") %>% html_text()
setlist <- h2 %>% html_table()
data_setlist0 <- data.frame(livename,
date,
title=setlist[[2]],
stringsAsFactors = FALSE)
data_setlist <- rbind(data_setlist,data_setlist0)
Sys.sleep(1)
}
data_setlist2 <- data_setlist
# data_setlist <- data_setlist2
# タイトルから不要な文字列を除く
data_setlist <- data_setlist %>% rename(title=X1)
data_setlist$title <- data_setlist$title %>% str_replace_all("\n購入する.+", "")
data_setlist$encore <- grepl("^アンコール", data_setlist$title)
data_setlist$title <- data_setlist$title %>% str_replace_all("アンコール.+:\n", "")
data_setlist$title <- data_setlist$title %>% str_replace_all("アンコール:\n", "")
# 日付を処理
data_setlist$date <- data_setlist$date %>% str_extract_all("[0-9]{4}/[0-9]{2}/[0-9]{2}?",
simplify = TRUE) %>%
as.Date()
data_setlist$year <- as.numeric(format(data_setlist$date, "%Y"))
# 不要なデータの除去
data_setlist <- data_setlist %>% filter(livename != "a-nation island 坂崎幸之助のお台場フォーク村 10周年 第36夜 帰ってきたともえちゃんフォークジャンボリー")
# 一部データの修正
data_setlist$livename[grepl("Rainbow Night~虹", data_setlist$livename)] <- "30周年記念コンサート ~Rainbow Night~虹"
data_setlist$title[grepl("北の国から", data_setlist$title)&!grepl("メドレー", data_setlist$title)] <- "北の国から"
# 曲順の追加
data_setlist <- data_setlist %>% group_by(livename, date) %>% mutate(order=seq(length(livename)))
## 何曲選ぶべきか
res_count <- data_setlist %>% count(livename, year, date)
ggplot(res_count, aes(x=factor(year), y=n)) + geom_boxplot() + labs(x="")
## 2005年以降のtop20
# livename単位でランダムにサンプリングし頻度を集計
set.seed(1)
data_ref_date <- data_setlist %>% group_by(livename) %>% summarise(date = sample(date,1))
data_setlist_ref <- data_setlist %>% semi_join(data_ref_date)
num_livename <- data_setlist_ref %>% filter(year>=2005) %>% count(livename) %>% nrow
res_top20 <- data_setlist_ref %>% filter(year>=2005) %>% count(title) %>% mutate(per=round(100*n/num_livename, 1)) %>%
arrange(desc(n)) %>% head(20)
# 曲順として先頭によく来る曲は何か
res_first_top5 <- data_setlist_ref %>% filter(year>=2005, order==1) %>% count(title) %>% mutate(per=round(100*n/num_livename, 1)) %>%
arrange(desc(n)) %>% head(5)
# 曲順として最後によく来る曲は何か
res_last_top5 <- data_setlist_ref %>%
filter(year>=2005, !encore) %>% group_by(livename, date) %>%
filter(order==max(order)) %>%
count(title) %>% mutate(per=round(100*n/num_livename, 1)) %>%
arrange(desc(n)) %>% head(5)
# アンコールの曲数:大体1曲だが3曲もある
res_encore_count <- data_setlist %>% filter(encore) %>%
count(livename, year, date)
ggplot(res_encore_count, aes(x=factor(year), y=n)) + geom_boxplot() + labs(x="")
# 2005年以降のtop10アンコールでは何が歌われているか
# これは毎回異なる可能性が高いのでサンプリングしない
res_encore_top5 <- data_setlist %>% filter(year>=2005, encore) %>%
count(title) %>% arrange(desc(n)) %>% head(5)
# 有向グラフを描く
tmp <- data_setlist_ref %>% filter(!encore) %>% select(livename, title, order) %>% mutate(order=order-1)
data_setlist_ref2 <- data_setlist_ref %>% filter(!encore) %>%
inner_join(tmp,by=c("livename","date", "order"))
matrix_title <- data_setlist_ref2 %>%
count(title.x, title.y) %>%
group_by(title.x) %>% mutate(all=sum(n), per=round(100*n/all, 1))
matrix_title_sub <- matrix_title %>% filter(n>=3) # 3回以上出現している組み合わせに絞る
library("DiagrammeR")
sada_nodes <- create_nodes(nodes=unique(c(matrix_title_sub$title.x,
matrix_title_sub$title.y)),
style="filled",
fontcolor="black",
fillcolor="white")
sada_nodes$fillcolor[sada_nodes$nodes=="案山子"] <- "red" # 最初に出現しやすい案山子に赤
sada_nodes$fillcolor[sada_nodes$nodes=="修二会"] <- "blue" # 最後に出現しやすい修二会に青
sada_nodes$fontcolor[sada_nodes$nodes=="関白宣言"] <- "red" # 結果の強調
sada_nodes$fontcolor[sada_nodes$nodes=="関白失脚"] <- "red" # 結果の強調
sada_edges <- create_edges(from=matrix_title_sub$title.x,
to=matrix_title_sub$title.y)
sada_graph <- create_graph(sada_nodes, sada_edges)
render_graph(sada_graph)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment