Created
December 13, 2015 17:13
-
-
Save dichika/f5a1a143932286d70600 to your computer and use it in GitHub Desktop.
さだまさし
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# さだまさし | |
library("rvest") | |
library("magrittr") | |
library("stringr") | |
library("dplyr") | |
library("broom") | |
urls <- paste0("http://www.livefans.jp/search/index/page:", | |
1:26, | |
"?option=2&keyword=%E3%81%95%E3%81%A0%E3%81%BE%E3%81%95%E3%81%97&setlist=1&genre=all&sort=e1#schBox" | |
) | |
# ライブ一覧URLを取得 | |
data_live <- NULL | |
for(u in urls){ | |
h <- read_html(u) | |
artists <- h %>% html_nodes(".guestArtist") %>% html_text() | |
artists <- artists[grepl("\\[出演\\]", artists)] # ゲストを除く | |
urls <- h %>% html_nodes(".ticon a") %>% html_attr("href") | |
icons <- h %>% html_nodes(".ticon a") %>% html_text() | |
urls <- urls[icons=="セットリスト"] # セットリストのURLに絞る | |
data_live <- rbind(data_live, | |
data.frame(artiists=artists, | |
urls=urls, | |
stringsAsFactors = FALSE) | |
) | |
Sys.sleep(1) | |
} | |
# セットリストを取得 | |
data_live <- data_live[grepl("\\[出演\\] さだまさし", data_live$artiists),] | |
origin <- "http://www.livefans.jp" | |
urls_live <- paste0(origin, data_live$urls) | |
data_setlist <- NULL | |
for(u2 in urls_live){ | |
h2 <- read_html(u2) | |
livename <- h2 %>% html_nodes(".dataBlock") %>% html_nodes(".liveName") %>% html_text() | |
if(length(livename)==0){ | |
next # 複数アーティストの場合liveName2のブロックになるのでスキップ | |
} | |
date <- h2 %>% html_nodes(".dataBlock") %>% html_nodes(".date") %>% html_text() | |
setlist <- h2 %>% html_table() | |
data_setlist0 <- data.frame(livename, | |
date, | |
title=setlist[[2]], | |
stringsAsFactors = FALSE) | |
data_setlist <- rbind(data_setlist,data_setlist0) | |
Sys.sleep(1) | |
} | |
data_setlist2 <- data_setlist | |
# data_setlist <- data_setlist2 | |
# タイトルから不要な文字列を除く | |
data_setlist <- data_setlist %>% rename(title=X1) | |
data_setlist$title <- data_setlist$title %>% str_replace_all("\n購入する.+", "") | |
data_setlist$encore <- grepl("^アンコール", data_setlist$title) | |
data_setlist$title <- data_setlist$title %>% str_replace_all("アンコール.+:\n", "") | |
data_setlist$title <- data_setlist$title %>% str_replace_all("アンコール:\n", "") | |
# 日付を処理 | |
data_setlist$date <- data_setlist$date %>% str_extract_all("[0-9]{4}/[0-9]{2}/[0-9]{2}?", | |
simplify = TRUE) %>% | |
as.Date() | |
data_setlist$year <- as.numeric(format(data_setlist$date, "%Y")) | |
# 不要なデータの除去 | |
data_setlist <- data_setlist %>% filter(livename != "a-nation island 坂崎幸之助のお台場フォーク村 10周年 第36夜 帰ってきたともえちゃんフォークジャンボリー") | |
# 一部データの修正 | |
data_setlist$livename[grepl("Rainbow Night~虹", data_setlist$livename)] <- "30周年記念コンサート ~Rainbow Night~虹" | |
data_setlist$title[grepl("北の国から", data_setlist$title)&!grepl("メドレー", data_setlist$title)] <- "北の国から" | |
# 曲順の追加 | |
data_setlist <- data_setlist %>% group_by(livename, date) %>% mutate(order=seq(length(livename))) | |
## 何曲選ぶべきか | |
res_count <- data_setlist %>% count(livename, year, date) | |
ggplot(res_count, aes(x=factor(year), y=n)) + geom_boxplot() + labs(x="") | |
## 2005年以降のtop20 | |
# livename単位でランダムにサンプリングし頻度を集計 | |
set.seed(1) | |
data_ref_date <- data_setlist %>% group_by(livename) %>% summarise(date = sample(date,1)) | |
data_setlist_ref <- data_setlist %>% semi_join(data_ref_date) | |
num_livename <- data_setlist_ref %>% filter(year>=2005) %>% count(livename) %>% nrow | |
res_top20 <- data_setlist_ref %>% filter(year>=2005) %>% count(title) %>% mutate(per=round(100*n/num_livename, 1)) %>% | |
arrange(desc(n)) %>% head(20) | |
# 曲順として先頭によく来る曲は何か | |
res_first_top5 <- data_setlist_ref %>% filter(year>=2005, order==1) %>% count(title) %>% mutate(per=round(100*n/num_livename, 1)) %>% | |
arrange(desc(n)) %>% head(5) | |
# 曲順として最後によく来る曲は何か | |
res_last_top5 <- data_setlist_ref %>% | |
filter(year>=2005, !encore) %>% group_by(livename, date) %>% | |
filter(order==max(order)) %>% | |
count(title) %>% mutate(per=round(100*n/num_livename, 1)) %>% | |
arrange(desc(n)) %>% head(5) | |
# アンコールの曲数:大体1曲だが3曲もある | |
res_encore_count <- data_setlist %>% filter(encore) %>% | |
count(livename, year, date) | |
ggplot(res_encore_count, aes(x=factor(year), y=n)) + geom_boxplot() + labs(x="") | |
# 2005年以降のtop10アンコールでは何が歌われているか | |
# これは毎回異なる可能性が高いのでサンプリングしない | |
res_encore_top5 <- data_setlist %>% filter(year>=2005, encore) %>% | |
count(title) %>% arrange(desc(n)) %>% head(5) | |
# 有向グラフを描く | |
tmp <- data_setlist_ref %>% filter(!encore) %>% select(livename, title, order) %>% mutate(order=order-1) | |
data_setlist_ref2 <- data_setlist_ref %>% filter(!encore) %>% | |
inner_join(tmp,by=c("livename","date", "order")) | |
matrix_title <- data_setlist_ref2 %>% | |
count(title.x, title.y) %>% | |
group_by(title.x) %>% mutate(all=sum(n), per=round(100*n/all, 1)) | |
matrix_title_sub <- matrix_title %>% filter(n>=3) # 3回以上出現している組み合わせに絞る | |
library("DiagrammeR") | |
sada_nodes <- create_nodes(nodes=unique(c(matrix_title_sub$title.x, | |
matrix_title_sub$title.y)), | |
style="filled", | |
fontcolor="black", | |
fillcolor="white") | |
sada_nodes$fillcolor[sada_nodes$nodes=="案山子"] <- "red" # 最初に出現しやすい案山子に赤 | |
sada_nodes$fillcolor[sada_nodes$nodes=="修二会"] <- "blue" # 最後に出現しやすい修二会に青 | |
sada_nodes$fontcolor[sada_nodes$nodes=="関白宣言"] <- "red" # 結果の強調 | |
sada_nodes$fontcolor[sada_nodes$nodes=="関白失脚"] <- "red" # 結果の強調 | |
sada_edges <- create_edges(from=matrix_title_sub$title.x, | |
to=matrix_title_sub$title.y) | |
sada_graph <- create_graph(sada_nodes, sada_edges) | |
render_graph(sada_graph) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment