Last active
July 2, 2020 06:33
-
-
Save hubgit/9398b4a0b627e37346ecbab1cbfdb84b to your computer and use it in GitHub Desktop.
Given a Project MUSE book ID, this script fetches the PDF of each chapter and merges them into a single PDF file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
const fetch = require("node-fetch"); | |
const url = require("url"); | |
const { Parser } = require("htmlparser2"); | |
const hummus = require("hummus"); | |
// Project MUSE book id. Taken from the first command-line argument when one
// is given (e.g. `node index.js 66229`); otherwise falls back to the default
// below, which can still be edited by hand.
const BOOK_ID = process.argv[2] || "66229";
// The merged PDF is written next to the script, named after the book id.
const OUTPUT_FILE = `${BOOK_ID}.pdf`;
/**
 * Fetch the book page for BOOK_ID and collect the PDF URL of every chapter
 * link whose text contains "Download".
 *
 * @returns {Promise<string[]>} Absolute chapter PDF URLs (each ending in
 *   "/pdf"), in the order they appear on the page, without duplicates.
 * @throws {Error} "No HTML" when the book page response is not OK.
 */
const extractURLs = async () => {
  const baseURL = `https://muse.jhu.edu/book/${BOOK_ID}`;

  const response = await fetch(baseURL);
  if (!response.ok) {
    throw new Error("No HTML");
  }
  const html = await response.text();

  // Matches "/chapter/<digits>" with an optional "/pdf" suffix.
  const chapterPath = /^\/chapter\/\d+(\/pdf)?$/;
  const links = [];
  // href of the chapter anchor currently being parsed, or null when outside one.
  let currentLink = null;

  const parser = new Parser(
    {
      onopentag(name, attribs) {
        // Only anchors change the tracked link. Other tags nested inside a
        // chapter anchor (e.g. <span>Download</span>) must not reset it,
        // otherwise the link text inside them is never associated with it.
        if (name !== "a") {
          return;
        }
        currentLink =
          attribs.href && chapterPath.test(attribs.href) ? attribs.href : null;
      },
      ontext(text) {
        if (currentLink && /\s*Download\s*/.test(text)) {
          links.push(currentLink);
        }
      },
      onclosetag(tagname) {
        if (tagname === "a") {
          currentLink = null;
        }
      }
    },
    { decodeEntities: true }
  );
  parser.write(html);
  parser.end();

  // Resolve against the book page and normalize every link to its "/pdf"
  // form. "/chapter/N" and "/chapter/N/pdf" collapse to the same URL, so the
  // Set drops repeats while keeping first-seen order; without it a chapter
  // could be merged into the output more than once.
  const pdfURLs = links
    .map(link => new URL(link, baseURL).href)
    .map(link => link.replace(/(\/pdf)?$/, "/pdf"));
  return [...new Set(pdfURLs)];
};
/**
 * Download each PDF in `urls`, in order, and append its pages to a new PDF
 * written to OUTPUT_FILE.
 *
 * @param {string[]} urls - URLs of the PDFs to fetch and merge.
 * @returns {Promise<void>} Resolves once the output writer has been ended.
 * @throws {Error} "No PDF" when any download response is not OK.
 */
const buildPDF = async urls => {
  const writer = hummus.createWriter(OUTPUT_FILE);

  for (const chapterURL of urls) {
    const res = await fetch(chapterURL);
    if (!res.ok) {
      throw new Error("No PDF");
    }

    const pdfBytes = await res.buffer();

    // A copying context is opened on the buffer only to read the page count.
    const pageTotal = writer
      .createPDFCopyingContext(new hummus.PDFRStreamForBuffer(pdfBytes))
      .getSourceDocumentParser()
      .getPagesCount();

    console.log(chapterURL, `${pageTotal} pages`);

    // Nothing is appended for documents with a single page (or none).
    if (!(pageTotal > 1)) {
      continue;
    }

    // Append the range [1, pageTotal - 1] from a fresh stream over the same
    // bytes. NOTE(review): presumably hummus page indices are zero-based, so
    // this leaves out the first page of every chapter PDF — confirm.
    const appendOptions = {
      type: hummus.eRangeTypeSpecific,
      specificRanges: [[1, pageTotal - 1]]
    };
    writer.appendPDFPagesFromPDF(
      new hummus.PDFRStreamForBuffer(pdfBytes),
      appendOptions
    );
  }

  writer.end();
};
/**
 * Script entry point: collect the chapter PDF URLs, merge them into
 * OUTPUT_FILE, and report the result. Any failure along the way is logged
 * with console.error rather than rethrown.
 */
const main = async () => {
  try {
    const chapterURLs = await extractURLs();
    await buildPDF(chapterURLs);
    console.log(`Written to ${OUTPUT_FILE}`);
  } catch (error) {
    console.error(error);
  }
};

main();
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{
  "name": "muse-merge",
  "private": true,
  "license": "MIT",
  "dependencies": {
    "htmlparser2": "^3.10.1",
    "hummus": "^1.0.104",
    "node-fetch": "^2.6.0"
  }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment