# Evanto/get_captions.R

## get_captions.R

# ABOUT -------------------------------------------------------------------

# Author: Caio Lente (@ctlente)
# Date: 2017-06-14
# Description: Download captions from YouTube videos
# Captions by: http://ccsubs.com


# CODE --------------------------------------------------------------------

#' Convert from HTML to text
#'
#' Each element of `str` is wrapped in a dummy `<x>` root node, parsed with
#' `xml2::read_xml()` and reduced to its text content, which decodes entities
#' such as `&amp;` and drops markup tags.
#'
#' @param str Character vector of HTML/XML fragments to convert
#'
#' @return A character vector with one element per element of `str`;
#' fragments that cannot be parsed as XML are returned unchanged
unescape_xml <- function(str) {
  unescape_one <- function(fragment) {
    # A single malformed fragment (e.g. a bare "&") must not abort the
    # conversion of every other fragment, so fall back to the raw input
    tryCatch(
      xml2::xml_text(xml2::read_xml(paste0("<x>", fragment, "</x>"))),
      error = function(e) as.character(fragment)
    )
  }

  # read_xml() only accepts one document at a time, so convert element-wise
  vapply(str, unescape_one, character(1), USE.NAMES = FALSE)
}

#' Get captions for a specific video ID
#'
#' Asks ccsubs.com to load the video, extracts the token needed to build the
#' download URL from the response, downloads the English `.srt` file and
#' parses it with `subtools::read.subtitles()`.
#'
#' @param id Video ID (`https://www.youtube.com/watch?v={VIDEO_ID}`)
#'
#' @return A tibble with `n`, `timecode_in`, `timecode_out`, and `text`
get_video_captions <- function(id) {

  # Temporary file, removed again when the function exits
  temp <- tempfile(fileext = ".srt")
  on.exit(unlink(temp), add = TRUE)

  # Send GET to load video on website
  load_query <- stringr::str_c("http://ccsubs.com/fetch?id=yt:", id)
  response <- httr::GET(load_query, httr::write_disk(temp, overwrite = TRUE))
  httr::stop_for_status(response)

  # Get url for download request: drop every backslash from the response and
  # take the fourth "/"-separated field of its first line
  fetched <- stringr::str_replace_all(readr::read_lines(temp), "\\\\", "")
  if (length(fetched) == 0) {
    stop("Empty response from ccsubs.com for video ", id, call. = FALSE)
  }
  url <- stringr::str_split(fetched, "/")[[1]][4]
  if (is.na(url) || !nzchar(url)) {
    stop("Could not find a download token for video ", id, call. = FALSE)
  }

  # Create download request
  down_query <- stringr::str_c(
    "http://ccsubs.com/video/yt:", id, "/", url,
    "/download?format=srt&lang=en")

  # Download file and read captions
  download.file(down_query, temp, quiet = TRUE)
  captions <- tibble::as_tibble(subtools::read.subtitles(temp)$subtitles)

  # Unescape one caption at a time; unlike dplyr::rowwise() this leaves an
  # ordinary (ungrouped) tibble behind
  captions$Text <- vapply(
    as.character(captions$Text), unescape_xml, character(1),
    USE.NAMES = FALSE
  )

  # Repair names: lower case, first "." replaced by "_", first column is `n`
  names(captions) <- stringr::str_replace(tolower(names(captions)), "\\.", "_")
  names(captions)[1] <- "n"

  captions
}

#' Get list of videos published by a channel
#'
#' @param channel_id Channel ID
#' (`https://www.youtube.com/channel/{CHANNEL_ID}`)
#' @param key Your API key
#' (\url{https://developers.google.com/youtube/v3/getting-started})
#' @param max_pages Maximum number of pages to get (each page contains
#' 20 videos); fewer pages are fetched when the results run out earlier
#'
#' @return A tibble with `title`, `id`, and `link`
get_channel_videos <- function(channel_id, key, max_pages) {

  # Temporary file, removed again when the function exits
  temp <- tempfile()
  on.exit(unlink(temp), add = TRUE)

  # Get list of video IDs in channel
  ids <- character(0)
  titles <- character(0)
  page_token <- ""

  # seq_len() makes max_pages = 0 fetch nothing instead of looping over 1:0
  for (i in seq_len(max_pages)) {

    # Query to get videos from channel; `type=video` keeps out results that
    # carry no videoId (playlists, channels)
    videos_query <- stringr::str_c(
      "https://www.googleapis.com/youtube/v3/search?",
      "key=", key,
      "&channelId=", channel_id,
      "&part=snippet,id",
      "&order=date",
      "&type=video",
      "&maxResults=20",
      if (page_token == "") "" else stringr::str_c("&pageToken=", page_token))

    # Get json with response, failing loudly on HTTP errors (e.g. a bad key)
    response <- httr::GET(videos_query, httr::write_disk(temp, overwrite = TRUE))
    httr::stop_for_status(response)
    json <- jsonlite::read_json(temp)

    # Get titles and IDs; each new page is placed in front of the ones
    # fetched before it, so the last fetched page ends up first
    titles <- c(purrr::map_chr(json$items, ~.x$snippet$title), titles)
    ids <- c(purrr::map_chr(json$items, ~.x$id$videoId), ids)

    # Get nextPageToken; without one there is nothing left to fetch
    page_token <- json$nextPageToken
    if (is.null(page_token)) {
      break
    }
  }

  # Convert IDs to links (sprintf() yields character(0) for zero IDs)
  links <- sprintf("https://www.youtube.com/watch?v=%s", ids)

  # Create tibble with video information
  tibble::tibble(title = titles, id = ids, link = links)
}

#' Get captions of videos published by a channel
#'
#' @param channel_id Channel ID
#' (`https://www.youtube.com/channel/{CHANNEL_ID}`)
#' @param key Your API key
#' (\url{https://developers.google.com/youtube/v3/getting-started})
#' @param max_pages Maximum number of pages to get (each page contains
#' 20 videos)
#' @param as_text Whether to convert captions to a single text string
#' @param progress Whether to display progress of caption collection
#'
#' @return A tibble with `title`, `id`, `link`, and `captions`. `captions`
#' is a list of tibbles, or a character vector when `as_text = TRUE`; videos
#' whose captions could not be retrieved carry the placeholder "error"
get_channel_captions <- function(channel_id, key, max_pages = 5,
                                 as_text = FALSE, progress = TRUE) {

  # Create safe version of get_video_captions: on failure it returns a
  # one-row tibble with the usual columns, every value being "error"
  err <- tibble::tibble(
    n = "error", timecode_in = "error",
    timecode_out = "error", text = "error"
  )
  safe_get_captions <- purrr::possibly(get_video_captions, otherwise = err)

  # Get list of videos
  videos <- get_channel_videos(channel_id, key, max_pages)
  n_videos <- nrow(videos)

  # Declare progress bar and make sure it is closed again on exit
  if (progress) {
    message("Collecting captions")
    bar <- txtProgressBar(style = 3)
    on.exit(close(bar), add = TRUE)
  }

  # Loop over IDs and get captions into a preallocated list
  captions <- vector("list", n_videos)
  for (i in seq_len(n_videos)) {
    captions[[i]] <- safe_get_captions(videos$id[i])
    if (progress) {
      setTxtProgressBar(bar, i / n_videos)
    }
  }

  # Convert captions to text (the placeholder tibble collapses to "error")
  if (as_text) {
    captions <- purrr::map_chr(captions, ~paste0(.x$text, collapse = " "))
  }

  # Add captions to videos table
  dplyr::mutate(videos, captions = captions)
}


# EXAMPLE -----------------------------------------------------------------

# Attach magrittr so that code in this file using an unqualified `%>%` can
# find the pipe operator
library(magrittr)

# Get captions from a single video, identified by its YouTube video ID
# (this performs HTTP requests against ccsubs.com)
ex1 <- get_video_captions("_wMvItIDMPU")

# Get captions from many videos of the same channel; "YOUR_API_KEY" is a
# placeholder for a real API key, and `max_pages`, `as_text` and `progress`
# keep their default values
ex2 <- get_channel_captions("UCMtFAi84ehTSYSE9XoHefig", "YOUR_API_KEY")

# Take a look at the results (lines starting with `#>` show example output)
dplyr::glimpse(ex1)
#> Observations: 69
#> Variables: 4
#> $ n            <chr> "1", "2", "3", "4", "5", "6", "7", "8", "9", ...
#> $ timecode_in  <chr> "00:00:00.000", "00:00:00.000", "00:00:02.000...
#> $ timecode_out <chr> "00:00:05.000", "00:00:02.000", "00:00:02.000...
#> $ text         <chr> "Downloaded from ccSubs.com", "( LAUGHTER ) B...
dplyr::glimpse(ex2)
#> Observations: 100
#> Variables: 4
#> $ title    <chr> "Stephen Commends Manchester's First Responders",...
#> $ id       <chr> "wkcpNKz5Xuc", "O7YKdjb5eBU", "-hBdxychbtE", "Uk3...
#> $ link     <chr> "https://www.youtube.com/watch?v=wkcpNKz5Xuc", "h...
#> $ captions <list> [<# A tibble: 19 x 4,        n  timecode_in time...

	# ABOUT -------------------------------------------------------------------

	# Author: Caio Lente (@ctlente)
	# Date: 2017-06-14
	# Description: Download captions from YouTube videos
	# Captions by: http://ccsubs.com



	# CODE --------------------------------------------------------------------

	#' Convert from HTML to text
	#'
	#' Each element of `str` is wrapped in a dummy `<x>` root node, parsed with
	#' `xml2::read_xml()` and reduced to its text content, which decodes
	#' entities such as `&amp;` and drops markup tags.
	#'
	#' @param str Character vector of HTML/XML fragments to convert
	#'
	#' @return A character vector with one element per element of `str`;
	#' fragments that cannot be parsed as XML are returned unchanged
	unescape_xml <- function(str) {
	  unescape_one <- function(fragment) {
	    # A single malformed fragment (e.g. a bare "&") must not abort the
	    # conversion of every other fragment, so fall back to the raw input
	    tryCatch(
	      xml2::xml_text(xml2::read_xml(paste0("<x>", fragment, "</x>"))),
	      error = function(e) as.character(fragment)
	    )
	  }

	  # read_xml() only accepts one document at a time, so convert element-wise
	  vapply(str, unescape_one, character(1), USE.NAMES = FALSE)
	}

	#' Get captions for a specific video ID
	#'
	#' Asks ccsubs.com to load the video, extracts the token needed to build the
	#' download URL from the response, downloads the English `.srt` file and
	#' parses it with `subtools::read.subtitles()`.
	#'
	#' @param id Video ID (`https://www.youtube.com/watch?v={VIDEO_ID}`)
	#'
	#' @return A tibble with `n`, `timecode_in`, `timecode_out`, and `text`
	get_video_captions <- function(id) {

	  # Temporary file, removed again when the function exits
	  temp <- tempfile(fileext = ".srt")
	  on.exit(unlink(temp), add = TRUE)

	  # Send GET to load video on website
	  load_query <- stringr::str_c("http://ccsubs.com/fetch?id=yt:", id)
	  response <- httr::GET(load_query, httr::write_disk(temp, overwrite = TRUE))
	  httr::stop_for_status(response)

	  # Get url for download request: drop every backslash from the response
	  # and take the fourth "/"-separated field of its first line
	  fetched <- stringr::str_replace_all(readr::read_lines(temp), "\\\\", "")
	  if (length(fetched) == 0) {
	    stop("Empty response from ccsubs.com for video ", id, call. = FALSE)
	  }
	  url <- stringr::str_split(fetched, "/")[[1]][4]
	  if (is.na(url) || !nzchar(url)) {
	    stop("Could not find a download token for video ", id, call. = FALSE)
	  }

	  # Create download request
	  down_query <- stringr::str_c(
	    "http://ccsubs.com/video/yt:", id, "/", url,
	    "/download?format=srt&lang=en")

	  # Download file and read captions
	  download.file(down_query, temp, quiet = TRUE)
	  captions <- tibble::as_tibble(subtools::read.subtitles(temp)$subtitles)

	  # Unescape one caption at a time; unlike dplyr::rowwise() this leaves an
	  # ordinary (ungrouped) tibble behind
	  captions$Text <- vapply(
	    as.character(captions$Text), unescape_xml, character(1),
	    USE.NAMES = FALSE
	  )

	  # Repair names: lower case, first "." replaced by "_", first column is `n`
	  names(captions) <- stringr::str_replace(tolower(names(captions)), "\\.", "_")
	  names(captions)[1] <- "n"

	  captions
	}

	#' Get list of videos published by a channel
	#'
	#' @param channel_id Channel ID
	#' (`https://www.youtube.com/channel/{CHANNEL_ID}`)
	#' @param key Your API key
	#' (\url{https://developers.google.com/youtube/v3/getting-started})
	#' @param max_pages Maximum number of pages to get (each page contains
	#' 20 videos); fewer pages are fetched when the results run out earlier
	#'
	#' @return A tibble with `title`, `id`, and `link`
	get_channel_videos <- function(channel_id, key, max_pages) {

	  # Temporary file, removed again when the function exits
	  temp <- tempfile()
	  on.exit(unlink(temp), add = TRUE)

	  # Get list of video IDs in channel
	  ids <- character(0)
	  titles <- character(0)
	  page_token <- ""

	  # seq_len() makes max_pages = 0 fetch nothing instead of looping over 1:0
	  for (i in seq_len(max_pages)) {

	    # Query to get videos from channel; `type=video` keeps out results
	    # that carry no videoId (playlists, channels)
	    videos_query <- stringr::str_c(
	      "https://www.googleapis.com/youtube/v3/search?",
	      "key=", key,
	      "&channelId=", channel_id,
	      "&part=snippet,id",
	      "&order=date",
	      "&type=video",
	      "&maxResults=20",
	      if (page_token == "") "" else stringr::str_c("&pageToken=", page_token))

	    # Get json with response, failing loudly on HTTP errors (e.g. a bad key)
	    response <- httr::GET(videos_query, httr::write_disk(temp, overwrite = TRUE))
	    httr::stop_for_status(response)
	    json <- jsonlite::read_json(temp)

	    # Get titles and IDs; each new page is placed in front of the ones
	    # fetched before it, so the last fetched page ends up first
	    titles <- c(purrr::map_chr(json$items, ~.x$snippet$title), titles)
	    ids <- c(purrr::map_chr(json$items, ~.x$id$videoId), ids)

	    # Get nextPageToken; without one there is nothing left to fetch
	    page_token <- json$nextPageToken
	    if (is.null(page_token)) {
	      break
	    }
	  }

	  # Convert IDs to links (sprintf() yields character(0) for zero IDs)
	  links <- sprintf("https://www.youtube.com/watch?v=%s", ids)

	  # Create tibble with video information
	  tibble::tibble(title = titles, id = ids, link = links)
	}

	#' Get captions of videos published by a channel
	#'
	#' @param channel_id Channel ID
	#' (`https://www.youtube.com/channel/{CHANNEL_ID}`)
	#' @param key Your API key
	#' (\url{https://developers.google.com/youtube/v3/getting-started})
	#' @param max_pages Maximum number of pages to get (each page contains
	#' 20 videos)
	#' @param as_text Whether to convert captions to a single text string
	#' @param progress Whether to display progress of caption collection
	#'
	#' @return A tibble with `title`, `id`, `link`, and `captions`. `captions`
	#' is a list of tibbles, or a character vector when `as_text = TRUE`;
	#' videos whose captions could not be retrieved carry the placeholder
	#' "error"
	get_channel_captions <- function(channel_id, key, max_pages = 5,
	                                 as_text = FALSE, progress = TRUE) {

	  # Create safe version of get_video_captions: on failure it returns a
	  # one-row tibble with the usual columns, every value being "error"
	  err <- tibble::tibble(
	    n = "error", timecode_in = "error",
	    timecode_out = "error", text = "error"
	  )
	  safe_get_captions <- purrr::possibly(get_video_captions, otherwise = err)

	  # Get list of videos
	  videos <- get_channel_videos(channel_id, key, max_pages)
	  n_videos <- nrow(videos)

	  # Declare progress bar and make sure it is closed again on exit
	  if (progress) {
	    message("Collecting captions")
	    bar <- txtProgressBar(style = 3)
	    on.exit(close(bar), add = TRUE)
	  }

	  # Loop over IDs and get captions into a preallocated list
	  captions <- vector("list", n_videos)
	  for (i in seq_len(n_videos)) {
	    captions[[i]] <- safe_get_captions(videos$id[i])
	    if (progress) {
	      setTxtProgressBar(bar, i / n_videos)
	    }
	  }

	  # Convert captions to text (the placeholder tibble collapses to "error")
	  if (as_text) {
	    captions <- purrr::map_chr(captions, ~paste0(.x$text, collapse = " "))
	  }

	  # Add captions to videos table
	  dplyr::mutate(videos, captions = captions)
	}



	# EXAMPLE -----------------------------------------------------------------

	# Attach magrittr so that code in this file using an unqualified `%>%` can
	# find the pipe operator
	library(magrittr)

	# Get captions from a single video, identified by its YouTube video ID
	# (this performs HTTP requests against ccsubs.com)
	ex1 <- get_video_captions("_wMvItIDMPU")

	# Get captions from many videos of the same channel; "YOUR_API_KEY" is a
	# placeholder for a real API key, and `max_pages`, `as_text` and `progress`
	# keep their default values
	ex2 <- get_channel_captions("UCMtFAi84ehTSYSE9XoHefig", "YOUR_API_KEY")

	# Take a look at the results (lines starting with `#>` show example output)
	dplyr::glimpse(ex1)
	#> Observations: 69
	#> Variables: 4
	#> $ n <chr> "1", "2", "3", "4", "5", "6", "7", "8", "9", ...
	#> $ timecode_in <chr> "00:00:00.000", "00:00:00.000", "00:00:02.000...
	#> $ timecode_out <chr> "00:00:05.000", "00:00:02.000", "00:00:02.000...
	#> $ text <chr> "Downloaded from ccSubs.com", "( LAUGHTER ) B...
	dplyr::glimpse(ex2)
	#> Observations: 100
	#> Variables: 4
	#> $ title <chr> "Stephen Commends Manchester's First Responders",...
	#> $ id <chr> "wkcpNKz5Xuc", "O7YKdjb5eBU", "-hBdxychbtE", "Uk3...
	#> $ link <chr> "https://www.youtube.com/watch?v=wkcpNKz5Xuc", "h...
	#> $ captions <list> [<# A tibble: 19 x 4, n timecode_in time...