Skip to content

Instantly share code, notes, and snippets.

@kellenmace
Last active October 2, 2023 16:20
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save kellenmace/bf1f7455c4ebbe0e583dc02c8c712855 to your computer and use it in GitHub Desktop.
Save kellenmace/bf1f7455c4ebbe0e583dc02c8c712855 to your computer and use it in GitHub Desktop.
Get YouTube Video Transcript in JavaScript
const fetch = require("node-fetch");
const xml2js = require("xml2js");
const he = require("he");
const TRANSCRIPTION_CHAR_KEY = "transcription";
// Manual smoke test: fetch one sample video's transcript and pretty-print it.
(async function runTranscriptDemo() {
  // Sample video ids; change `videoId` below to try a different scenario.
  const sampleVideoIds = {
    hwprPagination: "ht14hTTDklA", // HWPR Pagination video
    manualCaptions: "rB9ql0L0cUQ", // Video with manually added captions
    noCaptions: "YgT6XABqS5Y", // Video without any captions
  };
  const videoId = sampleVideoIds.hwprPagination;

  try {
    const transcriptLines = await getTranscript(videoId);
    const prettyJson = JSON.stringify(transcriptLines, null, 2);
    console.log(prettyJson);
  } catch (failure) {
    // Report the failure without crashing the script.
    console.log("error was caught!");
    console.error(failure.message);
  }
})();
/**
 * Downloads a YouTube watch page and extracts the video's transcript from it.
 *
 * Best-effort: any failure while fetching the page or extracting the
 * transcript is swallowed and reported as `null`.
 *
 * @param {string} [videoId="ht14hTTDklA"] - YouTube video id. The default is
 *   the sample video this function previously had hard-coded, so calls made
 *   without an argument behave as before.
 * @returns {Promise<Array<object>|null>} Whatever `fetchTranscript` resolves
 *   to for the downloaded page, or `null` if anything failed.
 */
async function getTranscript(videoId = "ht14hTTDklA") {
  try {
    // Encode the id so unexpected characters cannot alter the query string.
    const watchUrl = `https://www.youtube.com/watch?v=${encodeURIComponent(
      videoId
    )}`;
    const response = await fetch(watchUrl);
    const videoPageHtml = await response.text();
    return await fetchTranscript(videoPageHtml);
  } catch {
    // No transcript available for this video, or we were unable to fetch it.
    return null;
  }
}
/**
 * Locates the English caption track described in a YouTube watch page and
 * downloads and parses its transcript.
 *
 * The captions metadata is read by slicing out the JSON value that sits
 * between `"captions":` and `,"videoDetails` in the page markup.
 *
 * @param {string} videoPageHtml - Raw HTML of a YouTube watch page.
 * @returns {Promise<Array<object>|null>} Transcript lines passed through
 *   `formatTranscriptText`, or `null` when the parsed captions XML contains no
 *   `transcript.text` entries.
 * @throws {Error} If the captions data cannot be located or parsed, if no
 *   English track (or no URL for it) is present, or if the captions request
 *   returns a non-OK HTTP status.
 */
async function fetchTranscript(videoPageHtml) {
  const startString = `"captions":`;
  const captionsStartPosition = videoPageHtml.indexOf(startString);
  if (captionsStartPosition === -1) {
    // No error log email is sent here, since videos may be too new to
    // have a transcript yet, or just not have a transcript available.
    throw new Error("Unable to find beginning of captions data.");
  }

  // Search for the end marker only after the start marker.
  const endString = `,"videoDetails`;
  const captionsDataStart = captionsStartPosition + startString.length;
  const captionsEndPosition = videoPageHtml.indexOf(
    endString,
    captionsDataStart
  );
  if (captionsEndPosition === -1) {
    throw new Error("Unable to find end of captions data.");
  }

  const captionsData = JSON.parse(
    videoPageHtml.slice(captionsDataStart, captionsEndPosition)
  );

  // A renderer without any tracks is treated the same as "no English track"
  // instead of failing with a TypeError on `.find`.
  const captionTracks =
    captionsData?.playerCaptionsTracklistRenderer?.captionTracks ?? [];

  // Prefer the plain "en" track; otherwise accept a regional variant such as
  // "en-US" or "en-GB".
  const englishCaptionTrack =
    captionTracks.find((track) => track?.languageCode === "en") ??
    captionTracks.find(
      (track) =>
        typeof track?.languageCode === "string" &&
        track.languageCode.startsWith("en-")
    );
  if (!englishCaptionTrack) {
    throw new Error("Unable to find english caption track.");
  }

  const captionsUrl = englishCaptionTrack.baseUrl;
  if (!captionsUrl) {
    throw new Error("Unable to extract captions URL.");
  }

  const captionsResponse = await fetch(captionsUrl);
  if (!captionsResponse.ok) {
    // Do not try to parse an error page as captions XML.
    throw new Error(
      `Unable to fetch captions (HTTP ${captionsResponse.status}).`
    );
  }
  const captionsXml = await captionsResponse.text();

  const transcriptObject = await xml2js.parseStringPromise(captionsXml, {
    attrkey: "attributes",
    charkey: TRANSCRIPTION_CHAR_KEY,
  });

  const transcript = transcriptObject?.transcript?.text;
  if (!transcript) return null;

  return transcript.map(formatTranscriptText);
}
/**
 * Cleans up the text of one parsed transcript line.
 *
 * HTML entities in the text are decoded with `he.decode` and line breaks are
 * replaced with single spaces. All other properties of the line are copied
 * through unchanged; the input object is not mutated.
 *
 * @param {object} transcriptLine - One parsed caption entry whose text is
 *   stored under the `TRANSCRIPTION_CHAR_KEY` property.
 * @returns {object} A shallow copy of `transcriptLine` with cleaned-up text.
 */
function formatTranscriptText(transcriptLine) {
  // A line may carry no text property at all (presumably an empty caption
  // element; verify against the XML parser's output). Treat that as an empty
  // string so one blank line cannot abort the whole transcript.
  const rawText = transcriptLine[TRANSCRIPTION_CHAR_KEY] ?? "";
  const decodedText = he.decode(String(rawText));

  return {
    ...transcriptLine,
    [TRANSCRIPTION_CHAR_KEY]: decodedText.replaceAll("\n", " "),
  };
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment