Skip to content

Instantly share code, notes, and snippets.

@sloonz
Last active June 10, 2021 19:33
Show Gist options
  • Save sloonz/59428ec747e0f36dd81ac5fc47b242f2 to your computer and use it in GitHub Desktop.
Save sloonz/59428ec747e0f36dd81ac5fc47b242f2 to your computer and use it in GitHub Desktop.
Create epub from a list of URLs, using mozilla/readability to extract just the text
#!/usr/bin/node
// Install dependencies: npm install @mozilla/readability jsdom got argparse
// Usage: node mkebook.js --title Test url1 url2 > test.epub
const fs = require('fs').promises;
const { spawn } = require('child_process');
const { JSDOM } = require('jsdom');
const { Readability } = require('@mozilla/readability');
const got = require("got");
const { ArgumentParser } = require('argparse');
// Command-line interface: a mandatory book title plus the article URLs, which
// become the chapters of the book in the order given.
const parser = new ArgumentParser();
parser.add_argument('-t', '--title', { required: true, help: 'title of the generated ebook' });
// '+' instead of '*': with no URLs pandoc would be started without input files
// and would sit waiting on the inherited stdin.
parser.add_argument('urls', { nargs: '+', help: 'URLs of the articles to include, in order' });
/**
 * Downloads every URL, reduces each page to its readable content, writes one
 * temporary HTML file per page into the current working directory
 * (001.html, 002.html, ...) and has pandoc assemble them into an EPUB that is
 * written to the inherited stdout.
 *
 * The temporary files are removed whether or not the conversion succeeds.
 *
 * @param {{title: string, urls: string[]}} args - Parsed command-line arguments.
 * @returns {Promise<void>} Resolves once pandoc has exited successfully and
 *   the temporary files have been deleted.
 * @throws {Error} If a download fails, a page has no extractable content,
 *   pandoc cannot be started, or pandoc exits with a non-zero status or signal.
 */
async function main(args) {
  const { title, urls } = args;
  // Every file we start writing is recorded so the finally block can remove it.
  const tempFiles = [];

  try {
    const failures = [];
    // Failures are collected instead of thrown so that every download has
    // settled (and therefore stopped writing files) before cleanup runs.
    const files = await Promise.all(urls.map(async (url, i) => {
      const filename = `${String(i + 1).padStart(3, '0')}.html`;
      try {
        const { body } = await got(url);
        const article = new Readability((new JSDOM(body)).window.document).parse();
        if (article == null) {
          throw new Error(`Could not extract readable content from ${url}`);
        }

        const doc = new JSDOM('<html><head><title></title></head><body><h1></h1><div></div></body></html>');
        const page = doc.window.document;
        page.querySelector('title').textContent = article.title;
        page.querySelector('h1').textContent = article.title;
        page.querySelector('div').innerHTML = article.content;

        tempFiles.push(filename);
        await fs.writeFile(filename, doc.serialize());
      } catch (err) {
        failures.push(err);
      }
      return filename;
    }));

    if (failures.length > 0) {
      throw failures[0];
    }

    await new Promise((resolve, reject) => {
      spawn('pandoc', [`-Mtitle=${title}`, '-t', 'epub', '-o', '-', ...files], { stdio: 'inherit' })
        .on('error', reject)
        .on('exit', (code, signal) => {
          if (code === 0) {
            resolve();
            return;
          }
          const reason = signal ? `was killed by signal ${signal}` : `exited with code ${code}`;
          reject(new Error(`pandoc ${reason}`));
        });
    });
  } finally {
    await Promise.all(tempFiles.map(async (file) => {
      try {
        await fs.unlink(file);
      } catch (err) {
        // A write that failed early may never have created the file.
        if (err.code !== 'ENOENT') {
          throw err;
        }
      }
    }));
  }
}
// Entry point: run the conversion; on failure report the error and make the
// process exit with a non-zero status so shell pipelines can detect it.
main(parser.parse_args()).catch((err) => {
  console.error(err);
  process.exitCode = 1;
});
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment