Created
December 7, 2018 22:35
-
-
Save SmithKevin/24a82730c6ec926ab218ca9ab83e1a75 to your computer and use it in GitHub Desktop.
Updated version of browser.js for use with puppeteer-core
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// chrome-aws-lambda supplies the Chromium launch args and executable path;
// puppeteer-core drives the launched browser.
const chromium = require('chrome-aws-lambda');
const puppeteer = require('puppeteer-core');
/**
 * Loads `url` in a headless Chromium page and records every Quantcast pixel
 * request (pixel.quantserve.com / pixel.quantcount.com) that finishes while
 * the page loads.
 *
 * Images other than the pixel are aborted on their first request to speed up
 * the load; a repeated request for the same image URL is allowed through.
 *
 * The function resolves with a result object rather than rejecting for
 * launch and navigation failures:
 *   - `{tags, code}`  - `code` is the HTTP status of the navigation response,
 *                       200 when the response was null but pixels were seen,
 *                       or -1 when navigation threw but pixels were seen.
 *   - `{error, code}` - `code` is the HTTP status (> 399), or 0 when the
 *                       response was null / navigation threw and no pixels
 *                       were seen.
 *   - `{error}`       - the browser or page could not be created; `error` is
 *                       the caught Error object.
 *
 * @param {string} url - Address to navigate to.
 * @returns {Promise<{tags?: Array<{tag: string, code: number}>, error?: (string|Error), code?: number}>}
 */
module.exports.open = async (url) => {
  // URL fragments that identify the Quantcast pixel endpoints.
  const pixelFragments = ['pixel.quantserve.com/pixel', 'pixel.quantcount.com/pixel'];
  const isPixelUrl = (candidate) => pixelFragments.some((fragment) => candidate.indexOf(fragment) > -1);

  let page;
  let browser;
  try {
    // Build a new array instead of pushing onto chromium.args, so the array
    // handed back by the library is never mutated (if it is shared between
    // calls, pushing would append duplicate flags on every invocation).
    const myArgs = [
      ...chromium.args,
      // disable caching just in case we decide to reuse browser.
      '--aggressive-cache-discard',
      '--disable-cache',
      '--disable-application-cache',
    ];
    browser = await puppeteer.launch({
      args: myArgs,
      executablePath: await chromium.executablePath,
      headless: true,
      ignoreHTTPSErrors: true,
    });
    page = await browser.newPage();
  } catch (error) {
    console.log("Browser Error: " + error.name + " " + error.message);
    // launch() may have succeeded before newPage() failed; don't leave the
    // browser process running in that case.
    if (browser) {
      try {
        await browser.close();
      } catch (closeError) {
        console.log("Browser Close Error: " + closeError.name + " " + closeError.message);
      }
    }
    return {error: error};
  }

  const hrstart = process.hrtime();
  const tags = [];
  const imageAborted = new Set();

  // Collect every finished request that hit a pixel endpoint, with its status.
  page.on('requestfinished', (request) => {
    const resp = request.response();
    if (resp && isPixelUrl(resp.url())) {
      tags.push({tag: resp.url(), code: resp.status()});
    }
  });

  // images are expensive to download and slow to load. Skip them (except the QC pixel image of course)
  // if we tried to get the same image we have ignored, then fine let it download. This sometimes prevents a page from loading
  page.on('request', (request) => {
    if (!request) {
      return;
    }
    const requestUrl = request.url();
    const skipImage = request.resourceType() === 'image'
      && !imageAborted.has(requestUrl)
      && !isPixelUrl(requestUrl);

    let handled;
    if (skipImage) {
      handled = request.abort();
      imageAborted.add(requestUrl);
    } else {
      handled = request.continue();
    }
    // abort()/continue() are asynchronous; log a rejection instead of leaving
    // it as an unhandled promise rejection.
    Promise.resolve(handled).catch((requestError) => {
      console.log("Request Error: " + requestError.name + " " + requestError.message);
    });
  });

  let result;
  try {
    try {
      await page.setCacheEnabled(false);
      await page.setRequestInterception(true);
      // we have found some sites that don't like the default puppeteer headers, so add these
      await page.setExtraHTTPHeaders({
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Safari/537.36 Quantcast/1.0',
        'Accept-Language': '*',
      });
      const resp = await page.goto(url, {waitUntil: ['load', 'networkidle2']});
      // sometimes response is null. Puppeteer bug: https://github.com/GoogleChrome/puppeteer/issues/1391
      if (resp) {
        console.log("response " + resp.status() + " re " + url);
        if (resp.status() > 399) {
          result = {error: "Error in scan " + url + " code: " + resp.status(), code: resp.status()};
        } else {
          result = {tags: tags, code: resp.status()};
        }
      } else if (tags.length > 0) {
        result = {tags: tags, code: 200};
      } else {
        result = {error: "Response null", code: 0};
      }
    } catch (err) {
      console.log("Goto Error: " + err.name + " | " + err.message);
      if (err.message.indexOf("Timeout") > -1) {
        // wonky way custom Datadog metrics work on lambda
        console.log("Page load timeout");
      }
      if (tags.length > 0) {
        // tags ended up loading, but page technically did not load successfully
        result = {tags: tags, code: -1};
      } else {
        // its an error, but still a successful scan
        result = {error: err.message, code: 0};
      }
    }
  } finally {
    // Always attempt browser.close(), even when page.close() rejects, so the
    // Chromium process is not left behind. A close failure still propagates.
    try {
      await page.close();
    } finally {
      await browser.close();
    }
  }

  const hrend = process.hrtime(hrstart);
  // hrtime yields [seconds, nanoseconds]; convert to milliseconds.
  const timeMillis = (hrend[0] * 1000) + (hrend[1] / 1000000);
  console.log("Close Page " + timeMillis);
  return result;
};
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment