Skip to content

Instantly share code, notes, and snippets.

@hubgit
Last active July 2, 2020 06:33
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save hubgit/9398b4a0b627e37346ecbab1cbfdb84b to your computer and use it in GitHub Desktop.
Save hubgit/9398b4a0b627e37346ecbab1cbfdb84b to your computer and use it in GitHub Desktop.
Given a Project MUSE book ID, this script fetches each chapter's PDF and merges them into a single PDF file.
const fetch = require("node-fetch");
const url = require("url");
const { Parser } = require("htmlparser2");
const hummus = require("hummus");
// Project Muse book id to download. Taken from the first command-line
// argument when one is given (e.g. `node index.js 12345`); otherwise the
// default below is used, so running the script with no arguments behaves
// as before.
const BOOK_ID = process.argv[2] || "66229"; // EDIT THIS (default book id)
// The merged PDF is written to the current working directory.
const OUTPUT_FILE = `${BOOK_ID}.pdf`;
/**
 * Scrape the book page for chapter download links.
 *
 * Fetches `https://muse.jhu.edu/book/${BOOK_ID}`, collects the `href` of every
 * `<a>` whose href looks like `/chapter/<digits>` (optionally followed by
 * `/pdf`) and whose text contains "Download", then turns each into an
 * absolute URL ending in `/pdf`.
 *
 * @returns {Promise<string[]>} Absolute chapter PDF URLs in document order,
 *   without duplicates.
 * @throws {Error} If the book page request is not OK, or if no download
 *   links are found on the page.
 */
const extractURLs = async () => {
  const baseURL = `https://muse.jhu.edu/book/${BOOK_ID}`;

  const response = await fetch(baseURL);
  if (!response.ok) {
    throw new Error(
      `No HTML: ${baseURL} responded with HTTP ${response.status}`
    );
  }
  const html = await response.text();

  const links = [];
  // href of the chapter link currently being parsed, or null when the parser
  // is not directly inside one.
  let currentLink = null;

  const parser = new Parser(
    {
      onopentag(name, attribs) {
        const isChapterLink =
          name === "a" &&
          attribs.href &&
          /^\/chapter\/\d+(\/pdf)?$/.test(attribs.href);
        // Any other opening tag clears the candidate, so text wrapped in an
        // element nested inside the anchor is not considered.
        currentLink = isChapterLink ? attribs.href : null;
      },
      ontext(text) {
        if (currentLink && text.includes("Download")) {
          links.push(currentLink);
        }
      },
      onclosetag(tagname) {
        if (tagname === "a") {
          currentLink = null;
        }
      }
    },
    { decodeEntities: true }
  );
  parser.write(html);
  parser.end();

  const pdfURLs = links
    .map(link => url.resolve(baseURL, link))
    // Append "/pdf" unless the link already ends with it.
    .map(link => link.replace(/(\/pdf)?$/, "/pdf"));

  // "/chapter/N" and "/chapter/N/pdf" normalize to the same URL, and a link
  // may be matched more than once; keep only the first occurrence so a
  // chapter is not merged into the output twice.
  const uniqueURLs = [...new Set(pdfURLs)];

  if (uniqueURLs.length === 0) {
    // Fail loudly instead of letting the caller write an empty PDF.
    throw new Error(`No chapter download links found at ${baseURL}`);
  }
  return uniqueURLs;
};
/**
 * Download each chapter PDF and append its pages to OUTPUT_FILE.
 *
 * Chapters are fetched one at a time, in the order given, and each URL is
 * logged with its page count. For every chapter with more than one page,
 * pages 1 through `pagesCount - 1` are appended; a single-page chapter
 * contributes nothing.
 *
 * @param {string[]} urls - Chapter PDF URLs, in the order they should appear.
 * @returns {Promise<void>} Resolves once the merged PDF has been finalized.
 * @throws {Error} If any chapter request is not OK.
 */
const buildPDF = async urls => {
  const pdfWriter = hummus.createWriter(OUTPUT_FILE);

  // Named `chapterURL` so it does not shadow the module-level `url` import.
  for (const chapterURL of urls) {
    const response = await fetch(chapterURL);
    if (!response.ok) {
      throw new Error(
        `No PDF: ${chapterURL} responded with HTTP ${response.status}`
      );
    }
    const buffer = await response.buffer();

    // The copying context is used here only to read the page count; the
    // pages themselves are appended from a fresh stream over the same buffer.
    const inStream = new hummus.PDFRStreamForBuffer(buffer);
    const context = pdfWriter.createPDFCopyingContext(inStream);
    const pagesCount = context.getSourceDocumentParser().getPagesCount();
    console.log(chapterURL, `${pagesCount} pages`);

    if (pagesCount > 1) {
      // NOTE(review): hummus page ranges appear to be zero-based, so this
      // range would skip each chapter's first page (presumably a cover
      // sheet) — confirm against the hummus docs.
      pdfWriter.appendPDFPagesFromPDF(new hummus.PDFRStreamForBuffer(buffer), {
        type: hummus.eRangeTypeSpecific,
        specificRanges: [[1, pagesCount - 1]]
      });
    }
  }

  pdfWriter.end();
};
// Script entry point: scrape the chapter URLs, merge them into OUTPUT_FILE,
// and report the result.
(async () => {
  try {
    const urls = await extractURLs();
    await buildPDF(urls);
    console.log(`Written to ${OUTPUT_FILE}`);
  } catch (error) {
    console.error(error);
    // Signal failure to the calling shell; previously the process exited
    // with status 0 even when the download or merge failed.
    process.exitCode = 1;
  }
})();
{
"name": "muse-merge",
"private": true,
"license": "MIT",
"dependencies": {
"htmlparser2": "^3.10.1",
"hummus": "^1.0.104",
"node-fetch": "^2.6.0"
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment