SmithKevin/browser2.js

## browser2.js
const chromium = require('chrome-aws-lambda');
const puppeteer = require('puppeteer-core');

/**
 * Loads `url` in a headless Chromium page and collects every QC pixel
 * request (pixel.quantserve.com / pixel.quantcount.com) that finishes while
 * the page is loading.
 *
 * Image requests other than the QC pixel are aborted the first time they are
 * seen; a repeated request for the same image URL is allowed through.
 *
 * @param {string} url - Address passed to `page.goto`.
 * @returns {Promise<Object>} One of:
 *   - `{error}` holding the caught error when the browser or page could not be created;
 *   - `{tags, code}` when the navigation response status is 399 or lower;
 *   - `{error, code}` when the navigation response status is above 399;
 *   - `{tags, code: 200}` when the response is null but pixels were seen;
 *   - `{error: "Response null", code: 0}` when the response is null and no pixel was seen;
 *   - `{tags, code: -1}` when navigation threw but pixels were seen;
 *   - `{error, code: 0}` when navigation threw and no pixel was seen.
 *   Each entry of `tags` is `{tag: <response url>, code: <response status>}`.
 */
module.exports.open = async (url) => {
    // Substrings that mark a request as a QC pixel.
    const pixelMarkers = ['pixel.quantserve.com/pixel', 'pixel.quantcount.com/pixel'];
    const isPixelUrl = (candidate) => pixelMarkers.some((marker) => candidate.includes(marker));

    let page;
    let browser;
    try {
        // Build a fresh array instead of pushing onto chromium.args, so the
        // flags cannot pile up on an array this function does not own.
        // Caching is disabled just in case we decide to reuse the browser.
        const launchArgs = [
            ...chromium.args,
            '--aggressive-cache-discard',
            '--disable-cache',
            '--disable-application-cache',
        ];
        browser = await puppeteer.launch({
            args: launchArgs,
            executablePath: await chromium.executablePath,
            headless: true,
            ignoreHTTPSErrors: true,
        });
        page = await browser.newPage();
    } catch (error) {
        console.log("Browser Error: " + error.name + " " + error.message);
        if (browser) {
            // launch() succeeded but newPage() failed: do not leave Chromium running.
            try {
                await browser.close();
            } catch (closeError) {
                console.log("Browser Close Error: " + closeError.name + " " + closeError.message);
            }
        }
        // Previously referenced an undefined `err`, which threw a ReferenceError.
        return {error: error};
    }

    const hrstart = process.hrtime();
    const tags = [];
    const imageAborted = new Set();

    page.on('requestfinished', (request) => {
        const resp = request.response();
        if (resp && isPixelUrl(resp.url())) {
            tags.push({tag: resp.url(), code: resp.status()});
        }
    });

    // Images are expensive to download and slow to load. Skip them (except the QC pixel image of course).
    // If the page asks again for an image we already skipped, let it download;
    // refusing it a second time sometimes prevents a page from loading.
    page.on('request', (request) => {
        if (!request) {
            return;
        }
        const target = request.url();
        const skipImage = request.resourceType() === 'image' &&
            !imageAborted.has(target) &&
            !isPixelUrl(target);
        if (skipImage) {
            request.abort();
            imageAborted.add(target);
        } else {
            request.continue();
        }
    });

    let result;
    try {
        await page.setCacheEnabled(false);
        await page.setRequestInterception(true);
        // we have found some sites that don't like the default puppeteer headers, so add these
        await page.setExtraHTTPHeaders({
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Safari/537.36 Quantcast/1.0',
            'Accept-Language': '*'
        });
        const resp = await page.goto(url, {waitUntil: ['load', 'networkidle2']});
        // sometimes response is null.  Puppeteer bug: https://github.com/GoogleChrome/puppeteer/issues/1391
        if (resp) {
            const status = resp.status();
            console.log("response " + status + " re " + url);
            if (status > 399) {
                result = {error: "Error in scan " + url + " code: " + status, code: status};
            } else {
                result = {tags: tags, code: status};
            }
        } else if (tags.length > 0) {
            result = {tags: tags, code: 200};
        } else {
            result = {error: "Response null", code: 0};
        }
    } catch (err) {
        console.log("Goto Error: " + err.name + " | " + err.message);
        if (err.message.indexOf("Timeout") > -1) {
            //wonky way custom Datadog metrics work on lambda
            console.log("Page load timeout");
        }
        if (tags.length > 0) {
            // tags ended up loading, but page technically did not load successfully
            result = {tags: tags, code: -1};
        } else {
            //its an error, but still a successful scan
            result = {error: err.message, code: 0};
        }
    }

    // Close the browser even when closing the page fails, so Chromium is not leaked.
    try {
        await page.close();
    } finally {
        await browser.close();
    }

    const hrend = process.hrtime(hrstart);
    const timeMillis = (hrend[0] * 1000) + (hrend[1] / 1000000);
    console.log("Close Page " + timeMillis);
    return result;
};
	const chromium = require('chrome-aws-lambda');
	const puppeteer = require('puppeteer-core');

	/**
	 * Loads `url` in a headless Chromium page and collects every QC pixel
	 * request (pixel.quantserve.com / pixel.quantcount.com) that finishes while
	 * the page is loading.
	 *
	 * Image requests other than the QC pixel are aborted the first time they are
	 * seen; a repeated request for the same image URL is allowed through.
	 *
	 * @param {string} url - Address passed to `page.goto`.
	 * @returns {Promise<Object>} One of:
	 *   - `{error}` holding the caught error when the browser or page could not be created;
	 *   - `{tags, code}` when the navigation response status is 399 or lower;
	 *   - `{error, code}` when the navigation response status is above 399;
	 *   - `{tags, code: 200}` when the response is null but pixels were seen;
	 *   - `{error: "Response null", code: 0}` when the response is null and no pixel was seen;
	 *   - `{tags, code: -1}` when navigation threw but pixels were seen;
	 *   - `{error, code: 0}` when navigation threw and no pixel was seen.
	 *   Each entry of `tags` is `{tag: <response url>, code: <response status>}`.
	 */
	module.exports.open = async (url) => {
		// Substrings that mark a request as a QC pixel.
		const pixelMarkers = ['pixel.quantserve.com/pixel', 'pixel.quantcount.com/pixel'];
		const isPixelUrl = (candidate) => pixelMarkers.some((marker) => candidate.includes(marker));

		let page;
		let browser;
		try {
			// Build a fresh array instead of pushing onto chromium.args, so the
			// flags cannot pile up on an array this function does not own.
			// Caching is disabled just in case we decide to reuse the browser.
			const launchArgs = [
				...chromium.args,
				'--aggressive-cache-discard',
				'--disable-cache',
				'--disable-application-cache',
			];
			browser = await puppeteer.launch({
				args: launchArgs,
				executablePath: await chromium.executablePath,
				headless: true,
				ignoreHTTPSErrors: true,
			});
			page = await browser.newPage();
		} catch (error) {
			console.log("Browser Error: " + error.name + " " + error.message);
			if (browser) {
				// launch() succeeded but newPage() failed: do not leave Chromium running.
				try {
					await browser.close();
				} catch (closeError) {
					console.log("Browser Close Error: " + closeError.name + " " + closeError.message);
				}
			}
			// Previously referenced an undefined `err`, which threw a ReferenceError.
			return {error: error};
		}

		const hrstart = process.hrtime();
		const tags = [];
		const imageAborted = new Set();

		page.on('requestfinished', (request) => {
			const resp = request.response();
			if (resp && isPixelUrl(resp.url())) {
				tags.push({tag: resp.url(), code: resp.status()});
			}
		});

		// Images are expensive to download and slow to load. Skip them (except the QC pixel image of course).
		// If the page asks again for an image we already skipped, let it download;
		// refusing it a second time sometimes prevents a page from loading.
		page.on('request', (request) => {
			if (!request) {
				return;
			}
			const target = request.url();
			const skipImage = request.resourceType() === 'image' &&
				!imageAborted.has(target) &&
				!isPixelUrl(target);
			if (skipImage) {
				request.abort();
				imageAborted.add(target);
			} else {
				request.continue();
			}
		});

		let result;
		try {
			await page.setCacheEnabled(false);
			await page.setRequestInterception(true);
			// we have found some sites that don't like the default puppeteer headers, so add these
			await page.setExtraHTTPHeaders({
				'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Safari/537.36 Quantcast/1.0',
				'Accept-Language': '*'
			});
			const resp = await page.goto(url, {waitUntil: ['load', 'networkidle2']});
			// sometimes response is null. Puppeteer bug: https://github.com/GoogleChrome/puppeteer/issues/1391
			if (resp) {
				const status = resp.status();
				console.log("response " + status + " re " + url);
				if (status > 399) {
					result = {error: "Error in scan " + url + " code: " + status, code: status};
				} else {
					result = {tags: tags, code: status};
				}
			} else if (tags.length > 0) {
				result = {tags: tags, code: 200};
			} else {
				result = {error: "Response null", code: 0};
			}
		} catch (err) {
			console.log("Goto Error: " + err.name + " | " + err.message);
			if (err.message.indexOf("Timeout") > -1) {
				//wonky way custom Datadog metrics work on lambda
				console.log("Page load timeout");
			}
			if (tags.length > 0) {
				// tags ended up loading, but page technically did not load successfully
				result = {tags: tags, code: -1};
			} else {
				//its an error, but still a successful scan
				result = {error: err.message, code: 0};
			}
		}

		// Close the browser even when closing the page fails, so Chromium is not leaked.
		try {
			await page.close();
		} finally {
			await browser.close();
		}

		const hrend = process.hrtime(hrstart);
		const timeMillis = (hrend[0] * 1000) + (hrend[1] / 1000000);
		console.log("Close Page " + timeMillis);
		return result;
	};