Skip to content

Instantly share code, notes, and snippets.

@Evanto
Created November 1, 2017 20:52
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Evanto/96784b348ca38161960da94f9fc8fe86 to your computer and use it in GitHub Desktop.
Save Evanto/96784b348ca38161960da94f9fc8fe86 to your computer and use it in GitHub Desktop.
Download captions from YouTube videos
# ABOUT -------------------------------------------------------------------
# Author: Caio Lente (@ctlente)
# Date: 2017-06-14
# Description: Download captions from YouTube videos
# Captions by: http://ccsubs.com
# CODE --------------------------------------------------------------------
#' Convert XML/HTML-escaped strings to plain text
#'
#' Each element of `str` is wrapped in a dummy `<x>` root element, parsed
#' with `xml2::read_xml()` and reduced to its text content. This decodes
#' entities such as `&amp;` and drops any markup tags.
#'
#' @param str Character vector of strings to convert. Every non-missing
#'   element must be well-formed XML once wrapped in `<x>...</x>`;
#'   otherwise `xml2::read_xml()` signals an error.
#'
#' @return A character vector with the same length as `str`. Missing
#'   values stay missing and zero-length input gives zero-length output.
unescape_xml <- function(str) {
  decode_one <- function(s) {
    # Keep missing values missing instead of parsing the literal text "NA"
    if (is.na(s)) {
      return(NA_character_)
    }
    xml2::xml_text(xml2::read_xml(paste0("<x>", s, "</x>")))
  }
  # read_xml() parses one document at a time, so decode element by element
  vapply(as.character(str), decode_one, character(1), USE.NAMES = FALSE)
}
#' Get captions for a specific video ID
#'
#' Asks ccsubs.com to load the video, extracts the download token from the
#' response, downloads the English captions as an `.srt` file and parses
#' that file with `subtools::read.subtitles()`.
#'
#' All namespaces are referenced explicitly, so the function does not rely
#' on `magrittr` being attached by the caller.
#'
#' @param id Video ID (`https://www.youtube.com/watch?v={VIDEO_ID}`)
#'
#' @return A tibble with `n`, `timecode_in`, `timecode_out`, and `text`.
#'   An error is signalled when the load request fails or when no download
#'   token can be extracted from the response.
get_video_captions <- function(id) {
  # Temporary file, reused for both requests and removed on exit
  temp <- tempfile(fileext = ".srt")
  on.exit(unlink(temp), add = TRUE)

  # Send GET to load video on website
  load_query <- stringr::str_c("http://ccsubs.com/fetch?id=yt:", id)
  response <- httr::GET(load_query, httr::write_disk(temp, overwrite = TRUE))
  httr::stop_for_status(response)

  # The download token is the 4th "/"-separated field of the first line of
  # the response, once backslashes have been stripped
  fetch_lines <- stringr::str_replace_all(
    readr::read_lines(temp), "\\\\", ""
  )
  if (length(fetch_lines) == 0) {
    stop("Empty response from ccsubs.com for video '", id, "'",
         call. = FALSE)
  }
  token <- stringr::str_split(fetch_lines[1], "/")[[1]][4]
  if (is.na(token)) {
    stop("Could not find a download token for video '", id, "'",
         call. = FALSE)
  }

  # Create download request
  down_query <- stringr::str_c(
    "http://ccsubs.com/video/yt:", id, "/", token,
    "/download?format=srt&lang=en"
  )

  # Download file and read captions
  utils::download.file(down_query, temp, quiet = TRUE)
  parsed <- subtools::read.subtitles(temp)
  captions <- tibble::as_tibble(parsed$subtitles)

  # Decode escaped text one caption at a time; this yields a plain
  # (non-rowwise) tibble
  captions$Text <- vapply(
    as.character(captions$Text), unescape_xml, character(1),
    USE.NAMES = FALSE
  )

  # Repair names: lower case, first "." becomes "_", first column is `n`
  names(captions) <- stringr::str_replace(
    tolower(names(captions)), "\\.", "_"
  )
  names(captions)[1] <- "n"

  captions
}
#' Get list of videos published by a channel
#'
#' Pages through the YouTube Data API `search` endpoint, 20 results per
#' page ordered by date, and collects the title and ID of every video
#' found. Paging stops after `max_pages` requests or as soon as the API
#' stops returning a `nextPageToken`. Search items that carry no `videoId`
#' are skipped.
#'
#' Results from later pages are placed before results from earlier pages.
#'
#' @param channel_id Channel ID
#' (`https://www.youtube.com/channel/{CHANNEL_ID}`)
#' @param key Your API key
#' (\url{https://developers.google.com/youtube/v3/getting-started})
#' @param max_pages Maximum number of pages to get (each page contains
#' 20 videos)
#'
#' @return A tibble with `title`, `id`, and `link`; it has zero rows when
#'   no videos are found. An error is signalled when a request fails.
get_channel_videos <- function(channel_id, key, max_pages) {
  stopifnot(
    is.numeric(max_pages), length(max_pages) == 1,
    is.finite(max_pages), max_pages >= 0
  )

  # Temporary file for the JSON responses, removed on exit
  temp <- tempfile()
  on.exit(unlink(temp), add = TRUE)

  pages <- vector("list", max_pages)
  page_token <- NULL

  # seq_len() makes `max_pages = 0` perform no requests at all
  for (i in seq_len(max_pages)) {
    # Query to get videos from channel
    videos_query <- stringr::str_c(
      "https://www.googleapis.com/youtube/v3/search?",
      "key=", key,
      "&channelId=", channel_id,
      "&part=snippet,id",
      "&order=date",
      "&maxResults=20",
      if (is.null(page_token)) {
        ""
      } else {
        stringr::str_c("&pageToken=", page_token)
      }
    )

    # Get json with response; fail loudly instead of returning nothing
    response <- httr::GET(
      videos_query, httr::write_disk(temp, overwrite = TRUE)
    )
    httr::stop_for_status(response)
    json <- jsonlite::read_json(temp)

    # Keep only the items that actually refer to a video
    items <- json$items
    is_video <- vapply(
      items, function(item) !is.null(item$id$videoId), logical(1)
    )
    items <- items[is_video]

    pages[[i]] <- list(
      titles = vapply(
        items, function(item) item$snippet$title, character(1)
      ),
      ids = vapply(items, function(item) item$id$videoId, character(1))
    )

    # Without a next page token there is nothing left to fetch
    page_token <- json$nextPageToken
    if (is.null(page_token)) {
      break
    }
  }

  # Later pages go first; unused slots are NULL and vanish in unlist()
  pages <- rev(pages)
  titles <- as.character(
    unlist(lapply(pages, function(p) p$titles), use.names = FALSE)
  )
  ids <- as.character(
    unlist(lapply(pages, function(p) p$ids), use.names = FALSE)
  )

  # Convert IDs to links (explicitly empty when there are no IDs)
  links <- if (length(ids) > 0) {
    stringr::str_c("https://www.youtube.com/watch?v=", ids)
  } else {
    character(0)
  }

  # Create tibble with video information
  tibble::tibble(title = titles, id = ids, link = links)
}
#' Get captions of videos published by a channel
#'
#' Lists the channel's videos with `get_channel_videos()` and fetches the
#' captions of each one with `get_video_captions()`. A video whose captions
#' cannot be fetched does not abort the run: it gets a one-row placeholder
#' tibble whose four columns all contain `"error"` (or the string `"error"`
#' when `as_text = TRUE`).
#'
#' @param channel_id Channel ID
#' (`https://www.youtube.com/channel/{CHANNEL_ID}`)
#' @param key Your API key
#' (\url{https://developers.google.com/youtube/v3/getting-started})
#' @param max_pages Maximum number of pages to get (each page contains
#' 20 videos)
#' @param as_text Whether to convert captions to a single text string
#' @param progress Whether to display progress of caption collection
#'
#' @return A tibble with `title`, `id`, `link`, and `captions`. `captions`
#'   is a list of tibbles, or a character vector when `as_text = TRUE`.
get_channel_captions <- function(channel_id, key, max_pages = 5,
                                 as_text = FALSE, progress = TRUE) {
  # Placeholder with the same shape as a successful result, so every
  # element of `captions` is a tibble with a `text` column
  err <- tibble::tibble(
    n = "error", timecode_in = "error",
    timecode_out = "error", text = "error"
  )
  # Create safe version of get_video_captions
  safe_get_captions <- purrr::possibly(get_video_captions, otherwise = err)

  # Get list of videos
  videos <- get_channel_videos(channel_id, key, max_pages)
  n_videos <- nrow(videos)

  # Declare progress bar and make sure it is closed on exit
  if (progress) {
    message("Collecting captions")
    bar <- utils::txtProgressBar(style = 3)
    on.exit(close(bar), add = TRUE)
  }

  # Loop over IDs and get captions into a preallocated list
  captions <- vector("list", n_videos)
  for (i in seq_len(n_videos)) {
    captions[[i]] <- safe_get_captions(videos$id[i])
    if (progress) {
      utils::setTxtProgressBar(bar, i / n_videos)
    }
  }

  # Convert captions to text
  if (as_text) {
    captions <- vapply(
      captions,
      function(caption) paste0(caption$text, collapse = " "),
      character(1)
    )
  }

  # Add captions to videos table
  dplyr::mutate(videos, captions = captions)
}
# EXAMPLE -----------------------------------------------------------------
# NOTE: the statements below run whenever this file is sourced and make
# live HTTP requests through the functions defined above.
# Load magrittr so that the `%>%` pipe is attached
library(magrittr)
# Get captions from a single video, identified by its video ID
ex1 <- get_video_captions("_wMvItIDMPU")
# Get captions from many videos of the same channel; replace
# "YOUR_API_KEY" with your own API key before running
ex2 <- get_channel_captions("UCMtFAi84ehTSYSE9XoHefig", "YOUR_API_KEY")
# Take a look at the results (the `#>` lines are recorded sample output)
dplyr::glimpse(ex1)
#> Observations: 69
#> Variables: 4
#> $ n <chr> "1", "2", "3", "4", "5", "6", "7", "8", "9", ...
#> $ timecode_in <chr> "00:00:00.000", "00:00:00.000", "00:00:02.000...
#> $ timecode_out <chr> "00:00:05.000", "00:00:02.000", "00:00:02.000...
#> $ text <chr> "Downloaded from ccSubs.com", "( LAUGHTER ) B...
dplyr::glimpse(ex2)
#> Observations: 100
#> Variables: 4
#> $ title <chr> "Stephen Commends Manchester's First Responders",...
#> $ id <chr> "wkcpNKz5Xuc", "O7YKdjb5eBU", "-hBdxychbtE", "Uk3...
#> $ link <chr> "https://www.youtube.com/watch?v=wkcpNKz5Xuc", "h...
#> $ captions <list> [<# A tibble: 19 x 4, n timecode_in time...
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment