Last active
December 31, 2020 22:15
-
-
Save acropup/1a8115d2bd61323bfc7850d8e2155377 to your computer and use it in GitHub Desktop.
Capture the caption text of a youtube video as the video plays. The entire transcript can also be obtained through a single HTTP request, so read the notes at the bottom for details.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Capture caption text into the allCaptions array as the video plays. See bottom of script for notes.

// Trimmed text of the most recently seen caption ("" when none is shown).
// captureCaption compares against this to skip mutations that did not change the text.
let oldCaptionText = "";
// Every non-empty caption recorded so far, in the order it was observed.
// Kept as `let` so the list can still be reset by reassignment from the console.
let allCaptions = [];
// Player root element that gets observed; null if the page has no matching element.
const videoPlayer = document.querySelector(".html5-video-player");
// MutationObserver options: report only direct children being added to or
// removed from the observed node.
const config = { childList: true };
/**
 * MutationObserver callback that records the caption currently on screen.
 *
 * Looks up the first ".caption-window" element inside `videoPlayer`, trims its
 * text, and compares it with `oldCaptionText`. When the text has changed it is
 * stored in `oldCaptionText`; if it is also non-empty it is logged to the
 * console and appended to `allCaptions`. A missing caption element counts as
 * empty text, so a caption that disappears and later reappears with the same
 * text is recorded again.
 *
 * @param {MutationRecord[]} mutationsList - Supplied by the observer; unused.
 * @param {MutationObserver} observer - Supplied by the observer; unused.
 * @returns {void}
 */
const captureCaption = function (mutationsList, observer) {
  const captionNode = videoPlayer.querySelector(".caption-window");
  // No caption element on screen is treated the same as an empty caption.
  const newCaptionText = captionNode?.textContent?.trim() ?? "";
  if (newCaptionText === oldCaptionText) {
    return; // Same text as last time: nothing new to record.
  }
  oldCaptionText = newCaptionText;
  if (newCaptionText.length > 0) {
    console.log(newCaptionText);
    allCaptions.push(newCaptionText);
  }
};
// Start watching the player element for child nodes being added or removed.
// document.querySelector returns null when the page has no player element, and
// observe() throws a TypeError on a null target, so report that case clearly
// instead of failing with an opaque error.
const observer = new MutationObserver(captureCaption);
if (videoPlayer) {
  observer.observe(videoPlayer, config);
} else {
  console.warn('Caption capture not started: no ".html5-video-player" element was found on this page.');
}
// The ".caption-window" element is added and removed every time a
// caption is displayed, so we're using a MutationObserver on its parent
// to monitor these events.
/* | |
This script was more an exercise in MutationObserver than anything, | |
as you can get the entire caption script in a single HTTP request. | |
However, the above solution may still be useful for auto-generated | |
captions, or other situations where the technique below doesn't work. | |
Example HTTP requests: | |
Get the English (Ireland) caption text with timecodes, in JSON format: | |
https://www.youtube.com/api/timedtext?v=vQesgAtr2e4&lang=en-IE&fmt=json3 | |
Get the standard English caption text in simple XML format: | |
https://www.youtube.com/api/timedtext?v=vQesgAtr2e4&lang=en | |
Here is custom CSS to format the XML version nicely, including showing timecodes: | |
text { | |
position: relative; | |
display: block; | |
margin-left: 90px; | |
margin-right: 90px; | |
} | |
text::before, | |
text::after { | |
position: absolute; | |
min-width: 90px; | |
top: 0px; | |
height: 100%; | |
font-family: Cambria, sans-serif; | |
} | |
text::before { | |
content: "(" attr(start) "s)"; | |
left: -90px; | |
} | |
text::after { | |
content: "(" attr(dur) "s)"; | |
right: -90px; | |
font-style: italic; | |
} | |
text:nth-child(even), | |
text:nth-child(even)::before, | |
text:nth-child(even)::after { | |
background-color: #e8f2ff; | |
} | |
*/ |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment