Skip to content

Instantly share code, notes, and snippets.

@MrOrz
Created March 19, 2018 16:42
  • Star 6 You must be signed in to star a gist
  • Fork 2 You must be signed in to fork a gist
Star You must be signed in to star a gist
Save MrOrz/fb48f27f0f21846d0df521728fda19ce to your computer and use it in GitHub Desktop.
Test web page summarization with Mozilla/Readability.js and puppeteer
// Dependencies: puppeteer drives the browser, fs reads the Readability source.
const puppeteer = require('puppeteer');
const fs = require('fs');

// URL of the page to summarize: the first command-line argument after the
// script path (undefined when the script is run without arguments).
const [, , DOC_URL] = process.argv;

// Source text of Readability.js, read once at startup so it can be injected
// into the page as a string. NOTE: the relative path is resolved against the
// current working directory, so the script is expected to be launched from
// the directory that contains node_modules.
const readabilityJsStr = fs.readFileSync('node_modules/readability/Readability.js', 'utf-8');
/**
 * Runs Readability against the current document and returns the parse result.
 *
 * This function is not called from Node directly: main() interpolates its
 * source text into the script evaluated inside the page (right after the
 * Readability.js source) and then invokes it there by name. It must therefore
 * remain a function declaration named `executor` and may only reference names
 * that exist in the page (`Readability`, `document`).
 *
 * @returns {*} Whatever `Readability#parse()` returns for the document.
 */
function executor() {
  const reader = new Readability({}, document);
  const article = reader.parse();
  return article;
}
/**
 * Loads DOC_URL in a puppeteer-controlled browser and prints, separated by
 * '=====' lines: the page's canonical URL, the Readability article title,
 * the trimmed article text, and a representative image URL.
 *
 * Navigation failures (including the 5-second navigation timeout) are logged
 * and extraction still runs against whatever the page has loaded so far.
 * When Readability cannot extract an article, a message is written to stderr
 * and the title/text sections are printed empty.
 *
 * @returns {Promise<void>} Resolves once the results are printed and the
 *   browser has been closed.
 * @throws {Error} When no URL was passed on the command line; errors raised
 *   by puppeteer while launching or evaluating are propagated as well.
 */
async function main() {
  // Fail before launching a browser when there is nothing to load.
  if (!DOC_URL) {
    throw new Error('Missing URL: pass the page to summarize as the first command-line argument.');
  }

  const browser = await puppeteer.launch();
  let canonicalUrl;
  let resultArticle;
  let imageUrl;

  try {
    const page = await browser.newPage();
    page.setDefaultNavigationTimeout(5000);

    console.log('Loading', DOC_URL);
    try {
      await page.goto(DOC_URL);
    } catch (e) {
      // Deliberate best effort: log the navigation error and keep going with
      // whatever content the page holds at this point.
      console.error(e);
    }

    console.log('Extracting...');

    // Canonical URL preference: <link rel=canonical>, then og:url, then the
    // page's current location.
    canonicalUrl = await page.evaluate(() => {
      const canonicalLink = document.querySelector('link[rel=canonical]');
      if (canonicalLink) {
        return canonicalLink.href;
      }
      const ogUrlMeta = document.querySelector('meta[property="og:url"]');
      if (ogUrlMeta) {
        return ogUrlMeta.content;
      }
      return window.location.href;
    });

    // Inject the Readability source and the stringified executor() into an
    // IIFE evaluated in the page, so neither leaks into the page's globals.
    resultArticle = await page.evaluate(`
(function(){
${readabilityJsStr}
${executor}
return executor();
}())
`);

    // Readability#parse() may yield null when no article is found; fall back
    // to empty HTML so the image lookup can still use og:image.
    const articleHtml = resultArticle ? resultArticle.content : '';

    // Image preference: og:image meta, otherwise the <img> in the extracted
    // article HTML with the largest width * height (first one wins ties).
    imageUrl = await page.evaluate(contentHTML => {
      const ogImageMeta = document.querySelector('meta[property="og:image"], meta[property="og:image:url"]');
      if (ogImageMeta) {
        return ogImageMeta.content;
      }
      const containerDiv = document.createElement('div');
      containerDiv.innerHTML = contentHTML;
      const contentImgs = Array.from(containerDiv.querySelectorAll('img'));
      if (contentImgs.length === 0) {
        return '';
      }
      const largestImg = contentImgs.reduce((largestImage, img) => (
        largestImage.width * largestImage.height >= img.width * img.height ? largestImage : img
      ), contentImgs[0]);
      // src may be relative URL, resolve with current location.
      return new URL(largestImg.src, location.href).href;
    }, articleHtml);
  } finally {
    // Always shut the browser down, even when extraction throws; otherwise
    // the child browser process is left running.
    await browser.close();
  }

  if (!resultArticle) {
    console.error('Readability could not extract an article from this page.');
  }
  const title = resultArticle ? resultArticle.title : '';
  const text = resultArticle ? resultArticle.textContent.trim() : '';

  console.log('=====');
  console.log(canonicalUrl);
  console.log('=====');
  console.log(title);
  console.log('=====');
  console.log(text);
  console.log('=====');
  console.log(imageUrl);
  console.log('=====');
}
// Script entry point. Handle a rejection from main() explicitly: log it and
// exit with a non-zero status instead of leaving the promise floating.
main().catch(err => {
  console.error(err);
  process.exitCode = 1;
});
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment