dmgig/README.md

## README.md

      
    Raw
  

              README.md
            
          
    YouTube Subtitles to Pdf

Pull machine-generated transcript files from YouTube, convert them into something more readable, and then save as markdown and pdf files.
Example output: Witness to War YouTube Channel Transcripts.
Pull subtitles, annotations, and description from YouTube using youtube-dl.

You can pull a single video, a playlist, or an entire channel. When pulling a channel, I find that using the channel's Videos tab (which puts /videos at the end of the channel URL) works best.
# --skip-download      don't download the actual video
# --write-auto-sub     grab the subtitles, will produce the .vtt file
# --write-info-json    grab the metadata into .info.json file
# --write-description  write video description to .description file
# --output             output location, filename template
youtube-dl \
  --skip-download \
  --write-auto-sub \
  --write-info-json \
  --write-description \
  --output "_%(title)s.%(ext)s" \
  --ignore-errors \
  https://www.youtube.com/channel/UC7X0LKts7k4FT498Id_x5sg/videos

Parse vtt files, and produce markdown and pdfs

node \
subs2text.js \
/youtube/input/global


## subs2text.js
#!/usr/bin/env node

const fs = require('fs');
const yaml = require('js-yaml');
const path = require('path');
const padStart = require('string.prototype.padstart');
const slugify = require('slugify')
const glob = require("glob");
const { exec } = require("child_process");
var markdownpdf = require("markdown-pdf")

padStart.shim();

// The input directory is the first CLI argument. Fail with a usage hint
// instead of a bare TypeError when it is omitted.
if(typeof process.argv[2] !== 'string' || process.argv[2] === ''){
	console.error('Usage: subs2text.js <input-directory>');
	process.exit(1);
}

// Drop a single trailing slash so paths can be built as `${INPUT}/...`.
const INPUT = process.argv[2].replace(/\/$/, '');

// Each output directory is the input path with its first "input/" segment
// swapped for another name.
// NOTE(review): when INPUT contains no "input/" segment all three values are
// identical to INPUT, so results land next to the source files.
const OUTPUT_OHH_DIR = INPUT.replace('input/', 'output/');
const OUTPUT_MD_DIR = INPUT.replace('input/', 'md/');
const OUTPUT_PDF_DIR = INPUT.replace('input/', 'pdf/');

// `recursive: true` creates missing parent directories as well and does not
// throw when the directory already exists, so no exists-check is needed.
[OUTPUT_OHH_DIR, OUTPUT_MD_DIR, OUTPUT_PDF_DIR].forEach((dir) => {
	fs.mkdirSync(dir, { recursive: true });
});

/**
 * WORK
 */
const makePdfFromVtt = function(vttFilepath){

	const descriptionPath = vttFilepath.replace(/\.en\.vtt$/, '.description');
	const info_jsonPath = vttFilepath.replace(/\.en\.vtt$/, '.info.json');

	let vttData = fs.readFileSync(vttFilepath, 'utf8');

	let description = '';
	if(fs.existsSync(descriptionPath)){
		description = fs.readFileSync(descriptionPath, 'utf8');
	}

	let info_json = null;
	if(fs.existsSync(info_jsonPath)){
		info_json = fs.readFileSync(info_jsonPath, 'utf8');
		info_json = JSON.parse(info_json);
	}

	let d = info_json.upload_date.split('');
	let date_for_file = d[0]+d[1]+d[2]+d[3]+'-'+d[4]+d[5]+'-'+d[6]+d[7];
	let date_for_display = new Date(d[0]+d[1]+d[2]+d[3], d[4]+d[5], d[6]+d[7]).toDateString();

	let info = {
		upload_date: info_json.upload_date,
		date_for_display: date_for_display,
		fulltitle: info_json.fulltitle,
		tags: info_json.tags,
		id: info_json.id,
		uploader: info_json.uploader,
		channel_url: info_json.channel_url,
		description: info_json.description,
		webpage_url: info_json.webpage_url,
		date: date_for_file
	}

	const storeArr = [];
	let last_minutes = false;

	const lines = vttData.split('\n');
	lines.forEach((el, i) => {
		// remove first 3 lines
		if(i <= 2) return;
		if(el.includes('-->')){
				let matches = el.match(/([0-9]{2}\:[0-9]{2}\:[0-9]{2}\.[0-9]{3}) --> ([0-9]{2}\:[0-9]{2}\:[0-9]{2}\.[0-9]{3})/);
				let minutes = matches[1].match(/([0-9]{2})\:([0-9]{2})\:([0-9]{2})\.[0-9]{3}/);

				if(minutes[2] % 2 == 0 && minutes[2] != last_minutes){
					let timestamp = minutes[1]+':'+minutes[2]+':'+minutes[3];
					let seconds = (parseInt(minutes[1]) * 3600) + (parseInt(minutes[2]) * 60) + parseInt(minutes[3]);
				  storeArr.push(`\n\n<a href="${info.webpage_url}&t=${seconds}s">${timestamp}</a>\n\n`);
					last_minutes = minutes[2];
				}
				return;
		}
		if(el.trim() == '') return;

		let line = el.replace(/(<([^>]+)>)/gi, "");

		storeArr.push(line.trim());

	});

	const finalArr = [];
	let last_line = false;
	storeArr.forEach((el, i) => {
		// remove duplicate lines which are next to each other
		if(el === last_line) return;
		finalArr.push(el);
		last_line = el;
	});

	let parsedOutput = finalArr.join(" ");

	let mdOutput = `

# ${info.fulltitle}

[${info.webpage_url}](${info.webpage_url})

Uploaded by [${info.uploader}](${info.channel_url}) on ${info.date_for_display}.

${info.description.replace(/[ ]*\n/g, "<br />\n")}

---

[00:00:00](${info.webpage_url})

${parsedOutput}

---

**END**

`;

	const slugify_conf = {
	  replacement: '-',
	  remove: '"\'',
	  lower: true,
	  strict: true,
	  locale: 'en'
	};

	let mdFilepath = `${OUTPUT_MD_DIR}/${date_for_file}-${slugify(info.fulltitle, slugify_conf)}.md`;

	fs.writeFile(`${OUTPUT_MD_DIR}/${date_for_file}-${slugify(info.fulltitle, slugify_conf)}.md`, mdOutput.trim(), (err) => {
	  if (err) {
	    return console.log(err);
	  }
	  return err;
	});


	var options = {
	    remarkable: {
	        html: true,
	        breaks: false,
	        plugins: [ require('remarkable-classy') ]
	    }
	}

	console.log(mdFilepath);
	if(fs.existsSync(mdFilepath)){
		let pdfFilepath = `${OUTPUT_PDF_DIR}/${date_for_file}-${slugify(info.fulltitle, slugify_conf)}.pdf`;

		console.log('+ '+pdfFilepath);

		markdownpdf(options).from.path(mdFilepath).to.path(pdfFilepath, function () {
		  console.log("Created", pdfFilepath)
		});
	}

}

/**
 * Placeholder for a second output format; it currently does nothing.
 *
 * NOTE(review): presumably meant to write into OUTPUT_OHH_DIR - confirm
 * before implementing.
 *
 * @param {string} vttFilepath - Path of the `.en.vtt` subtitle file (unused).
 * @returns {undefined}
 */
const makeOhhFromVtt = function(vttFilepath){
	// Intentionally empty.
	return undefined;
}


/**
 * MAIN
 */

 // Every English subtitle file that sits directly inside the input directory.
 let vttFiles = glob.sync(`${INPUT}/*.en.vtt`);

 // Run both converters on each file, one file at a time.
 for (const vttFilepath of vttFiles) {
 	makePdfFromVtt(vttFilepath);
 	makeOhhFromVtt(vttFilepath);
 }
	#!/usr/bin/env node

	const fs = require('fs');
	const yaml = require('js-yaml');
	const path = require('path');
	const padStart = require('string.prototype.padstart');
	const slugify = require('slugify')
	const glob = require("glob");
	const { exec } = require("child_process");
	var markdownpdf = require("markdown-pdf")

	padStart.shim();

	// The input directory is the first CLI argument. Fail with a usage hint
	// instead of a bare TypeError when it is omitted.
	if(typeof process.argv[2] !== 'string' || process.argv[2] === ''){
		console.error('Usage: subs2text.js <input-directory>');
		process.exit(1);
	}

	// Drop a single trailing slash so paths can be built as `${INPUT}/...`.
	const INPUT = process.argv[2].replace(/\/$/, '');

	// Each output directory is the input path with its first "input/" segment
	// swapped for another name.
	// NOTE(review): when INPUT contains no "input/" segment all three values are
	// identical to INPUT, so results land next to the source files.
	const OUTPUT_OHH_DIR = INPUT.replace('input/', 'output/');
	const OUTPUT_MD_DIR = INPUT.replace('input/', 'md/');
	const OUTPUT_PDF_DIR = INPUT.replace('input/', 'pdf/');

	// `recursive: true` creates missing parent directories as well and does not
	// throw when the directory already exists, so no exists-check is needed.
	[OUTPUT_OHH_DIR, OUTPUT_MD_DIR, OUTPUT_PDF_DIR].forEach((dir) => {
		fs.mkdirSync(dir, { recursive: true });
	});

	/**
	* WORK
	*/
	const makePdfFromVtt = function(vttFilepath){

	const descriptionPath = vttFilepath.replace(/\.en\.vtt$/, '.description');
	const info_jsonPath = vttFilepath.replace(/\.en\.vtt$/, '.info.json');

	let vttData = fs.readFileSync(vttFilepath, 'utf8');

	let description = '';
	if(fs.existsSync(descriptionPath)){
	description = fs.readFileSync(descriptionPath, 'utf8');
	}

	let info_json = null;
	if(fs.existsSync(info_jsonPath)){
	info_json = fs.readFileSync(info_jsonPath, 'utf8');
	info_json = JSON.parse(info_json);
	}

	let d = info_json.upload_date.split('');
	let date_for_file = d[0]+d[1]+d[2]+d[3]+'-'+d[4]+d[5]+'-'+d[6]+d[7];
	let date_for_display = new Date(d[0]+d[1]+d[2]+d[3], d[4]+d[5], d[6]+d[7]).toDateString();

	let info = {
	upload_date: info_json.upload_date,
	date_for_display: date_for_display,
	fulltitle: info_json.fulltitle,
	tags: info_json.tags,
	id: info_json.id,
	uploader: info_json.uploader,
	channel_url: info_json.channel_url,
	description: info_json.description,
	webpage_url: info_json.webpage_url,
	date: date_for_file
	}

	const storeArr = [];
	let last_minutes = false;

	const lines = vttData.split('\n');
	lines.forEach((el, i) => {
	// remove first 3 lines
	if(i <= 2) return;
	if(el.includes('-->')){
	let matches = el.match(/([0-9]{2}\:[0-9]{2}\:[0-9]{2}\.[0-9]{3}) --> ([0-9]{2}\:[0-9]{2}\:[0-9]{2}\.[0-9]{3})/);
	let minutes = matches[1].match(/([0-9]{2})\:([0-9]{2})\:([0-9]{2})\.[0-9]{3}/);

	if(minutes[2] % 2 == 0 && minutes[2] != last_minutes){
	let timestamp = minutes[1]+':'+minutes[2]+':'+minutes[3];
	let seconds = (parseInt(minutes[1]) * 3600) + (parseInt(minutes[2]) * 60) + parseInt(minutes[3]);
	storeArr.push(`\n\n<a href="${info.webpage_url}&t=${seconds}s">${timestamp}</a>\n\n`);
	last_minutes = minutes[2];
	}
	return;
	}
	if(el.trim() == '') return;

	let line = el.replace(/(<([^>]+)>)/gi, "");

	storeArr.push(line.trim());

	});

	const finalArr = [];
	let last_line = false;
	storeArr.forEach((el, i) => {
	// remove duplicate lines which are next to each other
	if(el === last_line) return;
	finalArr.push(el);
	last_line = el;
	});

	let parsedOutput = finalArr.join(" ");

	let mdOutput = `

	# ${info.fulltitle}

	[${info.webpage_url}](${info.webpage_url})

	Uploaded by [${info.uploader}](${info.channel_url}) on ${info.date_for_display}.

	${info.description.replace(/[ ]*\n/g, "<br />\n")}

	---

	[00:00:00](${info.webpage_url})

	${parsedOutput}

	---

	END

	`;

	const slugify_conf = {
	replacement: '-',
	remove: '"\'',
	lower: true,
	strict: true,
	locale: 'en'
	};

	let mdFilepath = `${OUTPUT_MD_DIR}/${date_for_file}-${slugify(info.fulltitle, slugify_conf)}.md`;

	fs.writeFile(`${OUTPUT_MD_DIR}/${date_for_file}-${slugify(info.fulltitle, slugify_conf)}.md`, mdOutput.trim(), (err) => {
	if (err) {
	return console.log(err);
	}
	return err;
	});


	var options = {
	remarkable: {
	html: true,
	breaks: false,
	plugins: [ require('remarkable-classy') ]
	}
	}

	console.log(mdFilepath);
	if(fs.existsSync(mdFilepath)){
	let pdfFilepath = `${OUTPUT_PDF_DIR}/${date_for_file}-${slugify(info.fulltitle, slugify_conf)}.pdf`;

	console.log('+ '+pdfFilepath);

	markdownpdf(options).from.path(mdFilepath).to.path(pdfFilepath, function () {
	console.log("Created", pdfFilepath)
	});
	}

	}

	/**
	 * Placeholder for a second output format; it currently does nothing.
	 *
	 * NOTE(review): presumably meant to write into OUTPUT_OHH_DIR - confirm
	 * before implementing.
	 *
	 * @param {string} vttFilepath - Path of the `.en.vtt` subtitle file (unused).
	 * @returns {undefined}
	 */
	const makeOhhFromVtt = function(vttFilepath){
		// Intentionally empty.
		return undefined;
	}


	/**
	* MAIN
	*/

	// Every English subtitle file that sits directly inside the input directory.
	let vttFiles = glob.sync(`${INPUT}/*.en.vtt`);

	// Run both converters on each file, one file at a time.
	for (const vttFilepath of vttFiles) {
		makePdfFromVtt(vttFilepath);
		makeOhhFromVtt(vttFilepath);
	}