Skip to content

Instantly share code, notes, and snippets.

@SmithKevin
Created December 7, 2018 22:35
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save SmithKevin/24a82730c6ec926ab218ca9ab83e1a75 to your computer and use it in GitHub Desktop.
Save SmithKevin/24a82730c6ec926ab218ca9ab83e1a75 to your computer and use it in GitHub Desktop.
Updated version of browser.js for use with puppeteer-core
const chromium = require('chrome-aws-lambda');
const puppeteer = require('puppeteer-core');
/** URL fragments that identify the tracking-pixel requests this scan records. */
const PIXEL_URL_FRAGMENTS = ['pixel.quantserve.com/pixel', 'pixel.quantcount.com/pixel'];

/**
 * Tells whether a request/response URL targets one of the tracked pixel endpoints.
 *
 * @param {string} requestUrl - URL to inspect.
 * @returns {boolean} true when the URL contains one of the pixel fragments.
 */
const isPixelUrl = (requestUrl) =>
  PIXEL_URL_FRAGMENTS.some((fragment) => requestUrl.includes(fragment));

/**
 * Launches a headless browser, loads `url`, and reports which tracking-pixel
 * requests finished while the page was loading.
 *
 * @param {string} url - Address of the page to scan.
 * @returns {Promise<{tags?: Array<{tag: string, code: number}>, error?: string, code: number}>}
 *   - `{tags, code}` with the navigation status when the page loaded (status <= 399);
 *   - `{tags, code: 200}` when navigation yielded no response but pixels were seen;
 *   - `{tags, code: -1}` when navigation threw but pixels were seen;
 *   - `{error, code}` otherwise (`code` is the HTTP status, or 0 when unknown).
 */
module.exports.open = async (url) => {
  let page;
  let browser;
  try {
    // Copy instead of pushing onto chromium.args: that array is shared module
    // state, so mutating it would append duplicate flags on every invocation.
    // Caching is disabled just in case we decide to reuse the browser.
    const launchArgs = [
      ...chromium.args,
      '--aggressive-cache-discard',
      '--disable-cache',
      '--disable-application-cache',
    ];
    browser = await puppeteer.launch({
      args: launchArgs,
      executablePath: await chromium.executablePath,
      headless: true,
      ignoreHTTPSErrors: true,
    });
    page = await browser.newPage();
  } catch (error) {
    console.log("Browser Error: " + error.name + " " + error.message);
    // The browser may have launched before newPage() failed; do not leak it.
    if (browser) {
      try {
        await browser.close();
      } catch (closeError) {
        console.log("Browser Error: " + closeError.name + " " + closeError.message);
      }
    }
    // Same shape as the other failure results returned below.
    return {error: error.message, code: 0};
  }

  const hrstart = process.hrtime();
  const tags = [];
  const imageAborted = new Set();

  // Record every finished pixel request together with its HTTP status.
  page.on('requestfinished', (request) => {
    const resp = request.response();
    if (resp && isPixelUrl(resp.url())) {
      tags.push({tag: resp.url(), code: resp.status()});
    }
  });

  // abort()/continue() return promises; log failures rather than leaving an
  // unhandled rejection behind.
  const logInterceptionError = (err) => {
    console.log("Request Error: " + err.name + " | " + err.message);
  };

  // Images are expensive to download and slow to load. Skip them (except the pixel image of course).
  // If the same image is requested again after being skipped, let it download:
  // refusing it repeatedly sometimes prevents a page from loading.
  page.on('request', (request) => {
    if (!request) {
      return;
    }
    const requestUrl = request.url();
    const shouldSkip = request.resourceType() === 'image' &&
      !imageAborted.has(requestUrl) &&
      !isPixelUrl(requestUrl);
    if (shouldSkip) {
      imageAborted.add(requestUrl);
      request.abort().catch(logInterceptionError);
    } else {
      request.continue().catch(logInterceptionError);
    }
  });

  let result;
  try {
    try {
      await page.setCacheEnabled(false);
      await page.setRequestInterception(true);
      // We have found some sites that don't like the default puppeteer headers, so add these.
      await page.setExtraHTTPHeaders({
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Safari/537.36 Quantcast/1.0',
        'Accept-Language': '*'
      });
      const resp = await page.goto(url, {waitUntil: ['load', 'networkidle2']});
      // Sometimes response is null. Puppeteer bug: https://github.com/GoogleChrome/puppeteer/issues/1391
      if (resp) {
        const status = resp.status();
        console.log("response " + status + " re " + url);
        if (status > 399) {
          result = {error: "Error in scan " + url + " code: " + status, code: status};
        } else {
          result = {tags: tags, code: status};
        }
      } else if (tags.length > 0) {
        result = {tags: tags, code: 200};
      } else {
        result = {error: "Response null", code: 0};
      }
    } catch (err) {
      console.log("Goto Error: " + err.name + " | " + err.message);
      if (err.message.includes("Timeout")) {
        // Wonky way custom Datadog metrics work on lambda.
        console.log("Page load timeout");
      }
      if (tags.length > 0) {
        // Tags ended up loading, but the page technically did not load successfully.
        result = {tags: tags, code: -1};
      } else {
        // It's an error, but still a successful scan.
        result = {error: err.message, code: 0};
      }
    }
  } finally {
    // Always attempt to close the browser, even when closing the page throws.
    try {
      await page.close();
    } finally {
      await browser.close();
    }
  }

  const hrend = process.hrtime(hrstart);
  const timeMillis = (hrend[0] * 1000) + (hrend[1] / 1000000);
  console.log("Close Page " + timeMillis);
  return result;
};
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment