Skip to content

Instantly share code, notes, and snippets.

@ephbaum
Created June 1, 2023 04:37
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ephbaum/199e7259b021fc21df49519c862e36d3 to your computer and use it in GitHub Desktop.
Save ephbaum/199e7259b021fc21df49519c862e36d3 to your computer and use it in GitHub Desktop.
EPUB to TXT via NodeJS
'use strict'
var fs = require('fs');
var EPub = require('epub');
var htmlToText = require('html-to-text');
var path = require('path');
var htmlParser = require('node-html-parser');
/**
 * Converts the chapters of an EPUB file to plain text.
 *
 * Relies on the module-level `EPub` (parser), `htmlToText` (HTML → text),
 * `htmlParser` (title lookup), `fs` and `path` bindings.
 */
class EPUBToText {
  /**
   * Parses an EPUB and reports every chapter of its reading flow as text.
   *
   * Once parsing ends, `initialCallback` (when given) is invoked first with
   * the number of chapters, then `callback` is invoked once per chapter.
   * Reporting the count first lets callers detect completion even when the
   * parser delivers chapters synchronously.
   *
   * @param {string} sourceFile - Path of the EPUB file handed to `EPub`.
   * @param {function(?Error, string, number, Object): void} callback -
   *   Called per chapter with `(err, txt, sequence, meta)`. `meta` carries
   *   `id`, `excerpt` (first 250 characters of the trimmed text), `size`
   *   (text length), `sequence_number` and `title`.
   * @param {function(?Error, number=): void} [initialCallback] - Called with
   *   `(null, numberOfChapters)` before any chapter is reported, or with
   *   `(err)` if the parser emits an `error` event.
   */
  extract(sourceFile, callback, initialCallback) {
    const epub = new EPub(sourceFile);

    // Without a listener an `error` event is thrown by the emitter itself;
    // route it to the caller when there is somewhere to send it.
    epub.on('error', (err) => {
      if (initialCallback) {
        initialCallback(err);
        return;
      }
      throw err;
    });

    epub.on('end', () => {
      const chapters = epub.flow;

      // Announce the total before touching any chapter so that consumers
      // counting towards it never see a chapter ahead of the count.
      if (initialCallback) {
        initialCallback(null, chapters.length);
      }

      chapters.forEach((chapter, sequence) => {
        epub.getChapter(chapter.id, (err, html) => {
          // `html` is missing when the chapter could not be read.
          const txt = html
            ? htmlToText.fromString(html.toString(), { ignoreHref: true })
            : '';

          const meta = {
            id: chapter.id,
            excerpt: txt.trim().slice(0, 250),
            size: txt.length,
            sequence_number: sequence,
            title: chapter.title || this.getTitleFromHtml(html),
          };

          callback(err, txt, sequence, meta);
        });
      });
    });

    epub.parse();
  }

  /**
   * Extracts every chapter of an EPUB into `<sequence>-<basename>.txt` files
   * inside `destFolder`.
   *
   * Chapters that fail to extract are still written (as empty files) so the
   * remaining chapters are processed; the first error encountered is passed
   * to `callback` once all chapters are done.
   *
   * @param {string} sourceFile - Path of the EPUB file.
   * @param {string} destFolder - Existing directory that receives the files.
   * @param {function(?Error): void} callback - Called exactly once, with
   *   `null` on success or the first error that occurred.
   */
  extractTo(sourceFile, destFolder, callback) {
    const baseName = path.basename(sourceFile);
    let totalCount;
    let processedCount = 0;
    let firstError = null;
    let finished = false;

    const finish = (err) => {
      if (finished) {
        return;
      }
      finished = true;
      callback(err);
    };

    // Checked from both callbacks so completion is detected regardless of
    // whether the count or the last chapter arrives first, and for books
    // that have no chapters at all.
    const finishIfDone = () => {
      if (totalCount !== undefined && processedCount >= totalCount) {
        finish(firstError);
      }
    };

    this.extract(sourceFile, (err, txt, sequence) => {
      if (err && !firstError) {
        firstError = err;
      }
      try {
        const destFile = path.join(destFolder, `${sequence}-${baseName}.txt`);
        fs.writeFileSync(destFile, txt);
      } catch (writeErr) {
        // Keep going so one unwritable chapter does not abort the rest.
        if (!firstError) {
          firstError = writeErr;
        }
      }
      processedCount += 1;
      finishIfDone();
    }, (err, numberOfChapters) => {
      if (err) {
        finish(err);
        return;
      }
      totalCount = numberOfChapters;
      finishIfDone();
    });
  }

  /**
   * Derives a chapter title from its markup: the first `<h1>`, otherwise the
   * `<title>` element, with line breaks turned into spaces.
   *
   * @param {?string} html - Chapter markup; may be missing.
   * @returns {string} The title on a single line, or `''` if none is found.
   */
  getTitleFromHtml(html) {
    if (!html) {
      return '';
    }

    const root = htmlParser.parse(html.toString());
    const heading = root.querySelector('h1') || root.querySelector('title');
    if (heading == null) {
      return '';
    }

    // A string pattern would only replace the first newline.
    return heading.structuredText.replace(/\n/g, ' ');
  }
}
// Expose the converter class as this module's sole CommonJS export.
module.exports = EPUBToText;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment