Last active
October 2, 2023 16:20
-
-
Save kellenmace/bf1f7455c4ebbe0e583dc02c8c712855 to your computer and use it in GitHub Desktop.
Get YouTube Video Transcript in JavaScript
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
const fetch = require("node-fetch"); | |
const xml2js = require("xml2js"); | |
const he = require("he"); | |
const TRANSCRIPTION_CHAR_KEY = "transcription"; | |
// Manual smoke test: fetch the transcript of one sample video and
// pretty-print it to stdout.
// Other video IDs that are useful to try here:
//   "rB9ql0L0cUQ" - video with manually added captions
//   "YgT6XABqS5Y" - video without any captions
(async function runSmokeTest() {
  const videoId = "ht14hTTDklA"; // HWPR Pagination video

  try {
    const prettyTranscript = JSON.stringify(
      await getTranscript(videoId),
      null,
      2
    );
    console.log(prettyTranscript);
  } catch (err) {
    // Report only the message; the full stack is not needed for this check.
    console.log("error was caught!");
    console.error(err.message);
  }
})();
/**
 * Fetches the English transcript for a YouTube video.
 *
 * Downloads the video's watch page and hands the HTML to `fetchTranscript`,
 * which locates and downloads the caption track.
 *
 * @param {string} [videoId="ht14hTTDklA"] - ID of the video whose transcript
 *   should be fetched. The default is the sample video this function was
 *   previously hard-coded to, so zero-argument calls behave as before.
 * @returns {Promise<Array<object>|null>} The formatted transcript lines, or
 *   `null` when no transcript is available or it could not be fetched.
 */
async function getTranscript(videoId = "ht14hTTDklA") {
  try {
    // Encode the ID so unexpected characters cannot alter the query string.
    const watchUrl = `https://www.youtube.com/watch?v=${encodeURIComponent(
      videoId
    )}`;
    const response = await fetch(watchUrl);
    const videoPageHtml = await response.text();
    return await fetchTranscript(videoPageHtml);
  } catch {
    // Deliberate best effort: no transcript is available for this video, or
    // we were unable to fetch it. Callers receive null instead of an error.
    return null;
  }
}
/**
 * Extracts the English caption track from a YouTube watch page and downloads
 * it as a list of formatted transcript lines.
 *
 * The captions metadata is read from the JSON that sits between the
 * `"captions":` key and the following `,"videoDetails` key in the page HTML.
 *
 * @param {string} videoPageHtml - Raw HTML of the video's watch page.
 * @returns {Promise<Array<object>|null>} Transcript lines passed through
 *   `formatTranscriptText`, or `null` when the captions XML contains no
 *   `transcript.text` entries.
 * @throws {Error} When the captions data, an English ("en") caption track, or
 *   that track's URL cannot be found. `JSON.parse` and the network/XML calls
 *   may also throw their own errors.
 */
async function fetchTranscript(videoPageHtml) {
  const startString = `"captions":`;
  const captionsStartPosition = videoPageHtml.indexOf(startString);
  if (captionsStartPosition === -1) {
    // No error log email is sent here, since videos may be too new to
    // have a transcript yet, or just not have a transcript available.
    throw new Error("Unable to find beginning of captions data.");
  }

  const trimmedHtml = videoPageHtml.slice(
    captionsStartPosition + startString.length
  );

  const endString = `,"videoDetails`;
  const captionsEndPosition = trimmedHtml.indexOf(endString);
  if (captionsEndPosition === -1) {
    throw new Error("Unable to find end of captions data.");
  }

  const captionsDataString = trimmedHtml.substring(0, captionsEndPosition);
  const captionsData = JSON.parse(captionsDataString);

  // The renderer can be present without a captionTracks list; treat that the
  // same as "no English track" rather than failing on `.find` of undefined.
  const captionTracks =
    captionsData?.playerCaptionsTracklistRenderer?.captionTracks;
  const englishCaptionTrack = Array.isArray(captionTracks)
    ? captionTracks.find((captionTrack) => captionTrack?.languageCode === "en")
    : undefined;
  if (!englishCaptionTrack) {
    throw new Error("Unable to find english caption track.");
  }

  const captionsUrl = englishCaptionTrack.baseUrl;
  if (!captionsUrl) {
    throw new Error("Unable to extract captions URL.");
  }

  const captionsResponse = await fetch(captionsUrl);
  const captionsXml = await captionsResponse.text();

  // Store XML attributes under "attributes" and element text under the shared
  // transcription key so formatTranscriptText can find it.
  const transcriptObject = await xml2js.parseStringPromise(captionsXml, {
    attrkey: "attributes",
    charkey: TRANSCRIPTION_CHAR_KEY,
  });

  const transcript = transcriptObject?.transcript?.text;
  if (!transcript) return null;

  return transcript.map(formatTranscriptText);
}
/**
 * Cleans up the text of a single parsed transcript line.
 *
 * HTML entities are decoded with `he.decode` and newline characters are
 * replaced with spaces. All other properties of the line are copied through
 * unchanged.
 *
 * @param {object} transcriptLine - One parsed caption entry; its text is read
 *   from the `TRANSCRIPTION_CHAR_KEY` property.
 * @returns {object} A shallow copy of `transcriptLine` whose
 *   `TRANSCRIPTION_CHAR_KEY` property holds the cleaned text (an empty string
 *   when the entry had no text).
 */
function formatTranscriptText(transcriptLine) {
  // The text may be absent (presumably xml2js omits the character-data key
  // for elements with blank content - confirm against its docs). Fall back to
  // an empty string so a single silent caption does not abort the transcript.
  const rawText = transcriptLine[TRANSCRIPTION_CHAR_KEY] ?? "";
  const cleanedText = he.decode(rawText).replaceAll("\n", " ");

  return {
    ...transcriptLine,
    [TRANSCRIPTION_CHAR_KEY]: cleanedText,
  };
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment