Skip to content

Instantly share code, notes, and snippets.

@dmgig
Last active September 7, 2021 17:39
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save dmgig/2f2b4483119061bc1d87920892465caa to your computer and use it in GitHub Desktop.
Save dmgig/2f2b4483119061bc1d87920892465caa to your computer and use it in GitHub Desktop.
Pull YouTube Subtitles

YouTube Subtitles to Pdf

Pull machine-generated transcript files from YouTube, convert them into something more readable, and then save as markdown and pdf files.

Example output: Witness to War YouTube Channel Transcripts.

Pull subtitles, annotations, and description from YouTube using youtube-dl.

You can pull a single video, a playlist, or an entire channel. When pulling an entire channel, I find that using the URL of the channel's Videos tab (the channel URL ending in /videos) works best.

# --skip-download      don't download the actual video
# --write-auto-sub     grab the subtitles, will produce the .vtt file
# --write-info-json    grab the metadata into .info.json file
# --write-description  write video description to .description file
# --output             output location, filename template
youtube-dl \
  --skip-download \
  --write-auto-sub \
  --write-info-json \
  --write-description \
  --output "_%(title)s.%(ext)s" \
  --ignore-errors \
  https://www.youtube.com/channel/UC7X0LKts7k4FT498Id_x5sg/videos

Parse vtt files, and produce markdown and pdfs

node \
subs2text.js \
/youtube/input/global
#!/usr/bin/env node
const fs = require('fs');
const yaml = require('js-yaml');
const path = require('path');
const padStart = require('string.prototype.padstart');
const slugify = require('slugify')
const glob = require("glob");
const { exec } = require("child_process");
var markdownpdf = require("markdown-pdf")
padStart.shim();
/**
 * Create `dirPath`, including any missing parent directories, and return it.
 * With `recursive: true` the call is a no-op when the directory already exists.
 *
 * @param {string} dirPath - directory to create
 * @returns {string} the same `dirPath`, for convenient assignment
 */
function ensureDir(dirPath) {
  fs.mkdirSync(dirPath, { recursive: true });
  return dirPath;
}

// The input directory is the single required CLI argument; fail with a usage
// message instead of a TypeError on `undefined.replace` when it is missing.
if (typeof process.argv[2] !== 'string' || process.argv[2] === '') {
  console.error('Usage: node subs2text.js <path/to/input/directory>');
  process.exit(1);
}

// Input directory with one trailing slash removed.
const INPUT = process.argv[2].replace(/\/$/, '');

// Each output directory is derived by swapping the first "input/" path segment.
// NOTE: when INPUT contains no "input/" segment the replace is a no-op and all
// three directories resolve to INPUT itself.
const OUTPUT_OHH_DIR = ensureDir(INPUT.replace('input/', 'output/'));
const OUTPUT_MD_DIR = ensureDir(INPUT.replace('input/', 'md/'));
const OUTPUT_PDF_DIR = ensureDir(INPUT.replace('input/', 'pdf/'));
/**
* WORK
*/
/** Matches a WebVTT cue timing line and captures the start hours, minutes and seconds. */
const CUE_TIMING = /([0-9]{2}):([0-9]{2}):([0-9]{2})\.[0-9]{3} --> [0-9]{2}:[0-9]{2}:[0-9]{2}\.[0-9]{3}/;

/**
 * Split a `YYYYMMDD` upload date into the two forms used by the output.
 *
 * @param {string} uploadDate - eight-digit date string
 * @returns {{forFile: string, forDisplay: string}} `YYYY-MM-DD` and `Date#toDateString()` forms
 */
function parseUploadDate(uploadDate) {
  const year = uploadDate.slice(0, 4);
  const month = uploadDate.slice(4, 6);
  const day = uploadDate.slice(6, 8);
  return {
    forFile: `${year}-${month}-${day}`,
    // The Date constructor takes a zero-based month index, hence the `- 1`.
    forDisplay: new Date(Number(year), Number(month) - 1, Number(day)).toDateString(),
  };
}

/**
 * Flatten the contents of a .vtt file into one line of text, inserting a
 * linked timestamp each time the cue start time enters a new even minute.
 *
 * @param {string} vttData - raw contents of the .vtt file
 * @param {string} webpageUrl - video URL the timestamp links point at
 * @returns {string} the cue text joined with single spaces
 */
function vttToText(vttData, webpageUrl) {
  const pieces = [];
  // Starts at '00' so no marker is emitted for the opening minute; the
  // markdown template already carries a fixed 00:00:00 link.
  let lastMarkedMinute = '00';

  vttData.split('\n').forEach((rawLine, index) => {
    // The first three lines are the file header, not cue text.
    if (index <= 2) return;

    if (rawLine.includes('-->')) {
      const cue = rawLine.match(CUE_TIMING);
      // A line that merely contains "-->" but is not a valid timing is dropped.
      if (cue === null) return;
      const [, hours, minutes, secs] = cue;
      if (Number.parseInt(minutes, 10) % 2 === 0 && minutes !== lastMarkedMinute) {
        const timestamp = `${hours}:${minutes}:${secs}`;
        const seconds = (Number.parseInt(hours, 10) * 3600)
          + (Number.parseInt(minutes, 10) * 60)
          + Number.parseInt(secs, 10);
        pieces.push(`\n\n<a href="${webpageUrl}&t=${seconds}s">${timestamp}</a>\n\n`);
        lastMarkedMinute = minutes;
      }
      return;
    }

    if (rawLine.trim() === '') return;
    // Strip inline tags such as <c> and per-word <00:00:01.500> timings.
    pieces.push(rawLine.replace(/(<([^>]+)>)/gi, "").trim());
  });

  // Drop a line when it repeats the line immediately before it.
  return pieces
    .filter((piece, index) => index === 0 || piece !== pieces[index - 1])
    .join(" ");
}

/**
 * Assemble the markdown document for one video.
 *
 * @param {object} info - fulltitle, webpage_url, uploader, channel_url, date_for_display, description
 * @param {string} transcript - flattened transcript text
 * @returns {string} the markdown source
 */
function buildMarkdown(info, transcript) {
  return [
    `# ${info.fulltitle}`,
    `[${info.webpage_url}](${info.webpage_url})`,
    `Uploaded by [${info.uploader}](${info.channel_url}) on ${info.date_for_display}.`,
    info.description.replace(/[ ]*\n/g, "<br />\n"),
    '---',
    `[00:00:00](${info.webpage_url})`,
    transcript,
    '---',
    '**END**',
  ].join('\n').trim();
}

/**
 * Convert one `*.en.vtt` subtitle file into a markdown file in OUTPUT_MD_DIR
 * and a PDF in OUTPUT_PDF_DIR.
 *
 * Reads the sibling `.info.json` (required) and `.description` (optional,
 * used only when the metadata has no description) files that share the
 * subtitle file's base name. A video whose metadata is missing or has no
 * usable `upload_date` is skipped with a warning.
 *
 * @param {string} vttFilepath - path to a file ending in `.en.vtt`
 * @returns {void}
 */
const makePdfFromVtt = function(vttFilepath){
  const descriptionPath = vttFilepath.replace(/\.en\.vtt$/, '.description');
  const infoJsonPath = vttFilepath.replace(/\.en\.vtt$/, '.info.json');

  const infoJson = fs.existsSync(infoJsonPath)
    ? JSON.parse(fs.readFileSync(infoJsonPath, 'utf8'))
    : null;
  if (infoJson === null || !/^[0-9]{8}$/.test(String(infoJson.upload_date))) {
    console.warn(`Skipping ${vttFilepath}: missing metadata or upload_date in ${infoJsonPath}`);
    return;
  }

  let description = '';
  if (typeof infoJson.description === 'string') {
    description = infoJson.description;
  } else if (fs.existsSync(descriptionPath)) {
    description = fs.readFileSync(descriptionPath, 'utf8');
  }

  const uploadDate = parseUploadDate(String(infoJson.upload_date));
  const info = {
    fulltitle: infoJson.fulltitle,
    uploader: infoJson.uploader,
    channel_url: infoJson.channel_url,
    webpage_url: infoJson.webpage_url,
    date_for_display: uploadDate.forDisplay,
    description,
  };

  const transcript = vttToText(fs.readFileSync(vttFilepath, 'utf8'), info.webpage_url);
  const mdOutput = buildMarkdown(info, transcript);

  const slugify_conf = {
    replacement: '-',
    remove: '"\'',
    lower: true,
    strict: true,
    locale: 'en'
  };
  const baseName = `${uploadDate.forFile}-${slugify(info.fulltitle, slugify_conf)}`;
  const mdFilepath = `${OUTPUT_MD_DIR}/${baseName}.md`;
  const pdfFilepath = `${OUTPUT_PDF_DIR}/${baseName}.pdf`;

  // Write synchronously: the PDF step below reads this file from disk, so it
  // must be fully written before the conversion starts.
  try {
    fs.writeFileSync(mdFilepath, mdOutput);
  } catch (err) {
    console.log(err);
    return;
  }

  const options = {
    remarkable: {
      html: true,
      breaks: false,
      plugins: [ require('remarkable-classy') ]
    }
  };
  console.log(mdFilepath);
  console.log('+ '+pdfFilepath);
  markdownpdf(options).from.path(mdFilepath).to.path(pdfFilepath, function () {
    console.log("Created", pdfFilepath)
  });
}
/**
 * Placeholder for a second output format; currently does nothing.
 *
 * @param {string} vttFilepath - path to a subtitle file (unused)
 * @returns {void}
 */
const makeOhhFromVtt = (vttFilepath) => {};
/**
* MAIN
*/
// Collect every English subtitle file directly inside INPUT and run both
// converters on each one, in the order glob returns them.
let vttFiles = glob.sync(`${INPUT}/*.en.vtt`);
for (const vttFilepath of vttFiles) {
  makePdfFromVtt(vttFilepath);
  makeOhhFromVtt(vttFilepath);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment