Skip to content

Instantly share code, notes, and snippets.

@tetlabo
Last active December 29, 2022 08:27
Show Gist options
  • Save tetlabo/a5b2ded77b41cbce7ba51f349865c3c3 to your computer and use it in GitHub Desktop.
Save tetlabo/a5b2ded77b41cbce7ba51f349865c3c3 to your computer and use it in GitHub Desktop.
歌ネットから歌詞をスクレイピングする
## 参考: 【GoogleColaboratory】歌ネット(Uta-Net)から歌詞をスクレイピングする https://zenn.dev/robes/articles/00e86185677fb5
library(tidyverse)
library(httr)
library(rvest)
# Scrape song lyrics from Uta-Net for a fixed set of artist listing pages and
# save them to "pops_lyrics.csv" (columns: id, artist, title, lyric, url).
# `id` is the index into `urls` of the listing page the song was found on.

base_url <- "https://www.uta-net.com"
# urls <- "https://www.uta-net.com/artist/6636/"
# For now, AKB48, Nogizaka46 and Sakurazaka46 are selected
# (the original author notes he cannot tell them apart...).
urls <- c(
  "https://www.uta-net.com/artist/6636/0/1/",
  "https://www.uta-net.com/artist/6636/0/2/",
  "https://www.uta-net.com/artist/12550/",
  "https://www.uta-net.com/artist/29512/"
)
# Spoof the User-Agent so requests look like they come from a desktop browser.
pseudo_user_agent <- "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36"

# Fetch `url` and return the parsed response body, or NULL (with a warning)
# when the request fails or the server answers with an HTTP error status.
# Returning NULL lets the caller skip one bad page instead of aborting the
# whole run and losing everything scraped so far.
fetch_html <- function(url, agent = pseudo_user_agent) {
  tryCatch(
    {
      res <- GET(url, user_agent(agent))
      stop_for_status(res)
      content(res)
    },
    error = function(e) {
      warning(
        "Failed to fetch ", url, ": ", conditionMessage(e),
        call. = FALSE
      )
      NULL
    }
  )
}

# Empty template: fixes the column set and types of the output even when no
# song could be scraped.
df <- data.frame(
  id = numeric(),
  artist = character(),
  title = character(),
  lyric = character(),
  url = character()
)

# One-row data frames are collected here and bound once at the end, rather
# than re-copying the whole result on every iteration.
rows <- list()

for (i in seq_along(urls)) {
  html <- fetch_html(urls[i])
  if (is.null(html)) {
    next
  }

  # One href per song cell; cells without an <a> yield NA and are dropped so
  # that no URL ending in "NA" is ever requested.
  hrefs <- html %>%
    html_elements("td.sp-w-100") %>%
    html_element("a") %>%
    html_attr("href")
  hrefs <- hrefs[!is.na(hrefs)]

  for (href in hrefs) {
    song_url <- paste0(base_url, href)
    song_html <- fetch_html(song_url)

    if (!is.null(song_html)) {
      song_title <- song_html %>%
        html_element("h2") %>%
        html_text2()
      artist_name <- song_html %>%
        html_element("h3") %>%
        html_text2()
      # Collapse runs of line breaks so each lyric fits on one CSV line.
      lyric <- song_html %>%
        html_element("div#kashi_area") %>%
        html_text2() %>%
        str_replace_all("\\n+", " ")

      rows[[length(rows) + 1L]] <- data.frame(
        id = i,
        artist = artist_name,
        title = song_title,
        lyric = lyric,
        url = song_url
      )
      print(paste0("now processing '", artist_name, "', '", song_title, "'"))
    }

    # Pause after every song request, successful or not, to avoid hammering
    # the server.
    Sys.sleep(2)
  }
}

df <- bind_rows(df, rows)
write_csv(df, "pops_lyrics.csv")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment