|
|
|
# ABOUT ------------------------------------------------------------------- |
|
|
|
# Author: Caio Lente (@ctlente) |
|
# Date: 2017-06-14 |
|
# Description: Download captions from YouTube videos |
|
# Captions by: http://ccsubs.com |
|
|
|
|
|
|
|
# CODE -------------------------------------------------------------------- |
|
|
|
#' Convert escaped XML/HTML text to plain text
#'
#' Wraps each string in a dummy `<x>` element, parses it with
#' `xml2::read_xml()` and returns the text content of the parsed node.
#'
#' @param str Character vector of strings to convert. Each element must be
#'   parseable once wrapped in `<x>...</x>`; errors raised by
#'   `xml2::read_xml()` are not caught here.
#'
#' @return A character vector of text. Input of length zero or one is
#'   treated as a single string and yields one string; longer input yields
#'   one string per element.
unescape_xml <- function(str) {

  # Parse one wrapped string and pull out its text content
  unescape_one <- function(s) {
    xml2::xml_text(xml2::read_xml(paste0("<x>", s, "</x>")))
  }

  # Scalar (or empty) input goes through the single-document path unchanged
  if (length(str) <= 1L) {
    return(unescape_one(str))
  }

  # read_xml() parses one document per call, so longer input is handled
  # element by element
  vapply(str, unescape_one, character(1), USE.NAMES = FALSE)
}
|
|
|
#' Get captions for a specific video ID
#'
#' Asks ccsubs.com to load the video, builds the download link from the
#' reply, downloads the English captions as an SRT file and reads them with
#' `subtools::read.subtitles()`.
#'
#' @param id Video ID (`https://www.youtube.com/watch?v={VIDEO_ID}`), given
#'   as a single string
#'
#' @return A tibble with `n`, `timecode_in`, `timecode_out`, and `text`.
#'   An error is raised when a request fails or the reply from ccsubs.com
#'   cannot be turned into a download link.
get_video_captions <- function(id) {

  stopifnot(is.character(id), length(id) == 1L, !is.na(id))

  # Temporary file, reused by both requests and removed when we leave
  temp <- tempfile(fileext = ".srt")
  on.exit(unlink(temp), add = TRUE)

  # Send GET to load video on website
  load_query <- stringr::str_c("http://ccsubs.com/fetch?id=yt:", id)
  response <- httr::GET(load_query, httr::write_disk(temp, overwrite = TRUE))
  httr::stop_for_status(response)

  # Get url for download request: strip backslashes from the reply, split
  # it on "/" and take the fourth piece of the first line
  reply <- stringr::str_replace_all(readr::read_lines(temp), "\\\\", "")
  pieces <- stringr::str_split(reply, "/")
  if (length(pieces) == 0 || length(pieces[[1]]) < 4) {
    stop("Unexpected reply from ccsubs.com for video '", id, "'",
         call. = FALSE)
  }
  url <- pieces[[1]][4]

  # Create download request
  down_query <- stringr::str_c(
    "http://ccsubs.com/video/yt:", id, "/", url,
    "/download?format=srt&lang=en")

  # Download file and read captions
  utils::download.file(down_query, temp, quiet = TRUE)
  subs <- subtools::read.subtitles(temp)
  captions <- tibble::as_tibble(subs$subtitles)

  # unescape_xml() is applied one caption at a time; mapping over the
  # column (instead of dplyr::rowwise()) returns a plain, ungrouped tibble
  captions <- dplyr::mutate(
    captions,
    Text = purrr::map_chr(as.character(Text), unescape_xml))

  # Repair names: lower case, first "." becomes "_", first column is `n`
  names(captions) <- stringr::str_replace(
    tolower(names(captions)), "\\.", "_")
  names(captions)[1] <- "n"

  captions
}
|
|
|
#' Get list of videos published by a channel
#'
#' Queries the YouTube Data API v3 search endpoint page by page (ordered by
#' date, 20 results per request) and collects the title and ID of every
#' result that carries a video ID.
#'
#' @param channel_id Channel ID
#' (`https://www.youtube.com/channel/{CHANNEL_ID}`)
#' @param key Your API key
#' (\url{https://developers.google.com/youtube/v3/getting-started})
#' @param max_pages Maximum number of pages to get (each page contains
#' up to 20 videos). Paging stops earlier when the API reports no further
#' page or a request fails (a warning is issued in the latter case).
#'
#' @return A tibble with `title`, `id`, and `link`. Pages appear in reverse
#'   request order (last fetched page first); within a page the API order is
#'   kept.
get_channel_videos <- function(channel_id, key, max_pages) {

  # Kept for backward compatibility: other functions in this file use `%>%`
  # unqualified and may rely on magrittr being attached after this call
  library(magrittr)

  stopifnot(is.numeric(max_pages), length(max_pages) == 1L,
            !is.na(max_pages), max_pages >= 0)

  # Temporary file that receives each JSON response; removed on exit
  temp <- tempfile()
  on.exit(unlink(temp), add = TRUE)

  # One slot per page, filled in request order
  page_titles <- vector("list", max_pages)
  page_ids <- vector("list", max_pages)
  page_token <- NULL

  for (i in seq_len(max_pages)) {

    # Only send pageToken once the API has handed one out
    token_param <- if (is.null(page_token)) {
      ""
    } else {
      stringr::str_c("&pageToken=", page_token)
    }

    # Query to get videos from channel
    videos_query <- stringr::str_c(
      "https://www.googleapis.com/youtube/v3/search?",
      "key=", key,
      "&channelId=", channel_id,
      "&part=snippet,id",
      "&order=date",
      "&maxResults=20",
      token_param)

    # Get json with response
    response <- httr::GET(
      videos_query, httr::write_disk(temp, overwrite = TRUE))
    if (httr::http_error(response)) {
      warning("YouTube API request failed with HTTP status ",
              httr::status_code(response),
              "; returning the videos collected so far", call. = FALSE)
      break
    }
    json <- jsonlite::read_json(temp)

    # Keep only results that carry a video ID; items without one would
    # make map_chr() fail
    items <- json$items
    if (is.null(items)) {
      items <- list()
    }
    items <- purrr::keep(items, ~ !is.null(.x$id$videoId))
    page_titles[[i]] <- purrr::map_chr(items, ~ .x$snippet$title)
    page_ids[[i]] <- purrr::map_chr(items, ~ .x$id$videoId)

    # Without a nextPageToken another request would start over from the
    # first page and return duplicates
    page_token <- json$nextPageToken
    if (is.null(page_token)) {
      break
    }
  }

  # Later pages go first, matching the order this function always returned
  titles <- as.character(unlist(rev(page_titles)))
  ids <- as.character(unlist(rev(page_ids)))

  # Convert IDs to links
  links <- stringr::str_c("https://www.youtube.com/watch?v=", ids)

  # Create tibble with video information
  tibble::tibble(title = titles, id = ids, link = links)
}
|
|
|
#' Get captions of videos published by a channel
#'
#' Lists the channel's videos with `get_channel_videos()` and downloads the
#' captions of each one with `get_video_captions()`. A video whose captions
#' cannot be retrieved gets a one-row placeholder filled with `"error"`
#' instead of aborting the whole run.
#'
#' @param channel_id Channel ID
#' (`https://www.youtube.com/channel/{CHANNEL_ID}`)
#' @param key Your API key
#' (\url{https://developers.google.com/youtube/v3/getting-started})
#' @param max_pages Maximum number of pages to get (each page contains
#' up to 20 videos)
#' @param as_text Whether to convert captions to a single text string
#' @param progress Whether to display progress of caption collection
#'
#' @return A tibble with `title`, `id`, `link`, and `captions`. `captions`
#'   is a list column of caption tibbles, or a character column (one string
#'   per video) when `as_text = TRUE`.
get_channel_captions <- function(channel_id, key, max_pages = 5,
                                 as_text = FALSE, progress = TRUE) {

  # Create safe version of get_video_captions. The fallback is the
  # placeholder tibble itself, so failed and successful videos have the
  # same shape (and `as_text` yields "error" for failures).
  err <- tibble::tibble(
    n = "error", timecode_in = "error",
    timecode_out = "error", text = "error")
  safe_get_captions <- purrr::possibly(get_video_captions, otherwise = err)

  # Get list of videos
  videos <- get_channel_videos(channel_id, key, max_pages)
  n_videos <- length(videos$id)

  # Declare progress bar (closed again when the function exits)
  if (progress) {
    message("Collecting captions")
    bar <- utils::txtProgressBar(style = 3)
    on.exit(close(bar), add = TRUE)
  }

  # Loop over IDs and get captions
  captions <- vector("list", n_videos)
  for (i in seq_len(n_videos)) {
    captions[[i]] <- safe_get_captions(videos$id[i])
    if (progress) {
      utils::setTxtProgressBar(bar, i / n_videos)
    }
  }

  # Convert captions to text
  if (as_text) {
    captions <- purrr::map_chr(captions, ~ paste0(.x$text, collapse = " "))
  }

  # Add captions to videos table
  dplyr::mutate(videos, captions = captions)
}
|
|
|
|
|
|
|
# EXAMPLE ----------------------------------------------------------------- |
|
|
|
# Attach magrittr so that `%>%` is available to the functions above
library(magrittr)

# Captions of a single video
ex1 <- get_video_captions(id = "_wMvItIDMPU")

# Captions of many videos published by one channel
ex2 <- get_channel_captions(
  channel_id = "UCMtFAi84ehTSYSE9XoHefig",
  key = "YOUR_API_KEY")

# Inspect both results
dplyr::glimpse(ex1)
#> Observations: 69
#> Variables: 4
#> $ n <chr> "1", "2", "3", "4", "5", "6", "7", "8", "9", ...
#> $ timecode_in <chr> "00:00:00.000", "00:00:00.000", "00:00:02.000...
#> $ timecode_out <chr> "00:00:05.000", "00:00:02.000", "00:00:02.000...
#> $ text <chr> "Downloaded from ccSubs.com", "( LAUGHTER ) B...
dplyr::glimpse(ex2)
#> Observations: 100
#> Variables: 4
#> $ title <chr> "Stephen Commends Manchester's First Responders",...
#> $ id <chr> "wkcpNKz5Xuc", "O7YKdjb5eBU", "-hBdxychbtE", "Uk3...
#> $ link <chr> "https://www.youtube.com/watch?v=wkcpNKz5Xuc", "h...
#> $ captions <list> [<# A tibble: 19 x 4, n timecode_in time...