Skip to content

Instantly share code, notes, and snippets.

@rbayliss
Created August 29, 2017 23:13
Show Gist options
  • Save rbayliss/a0af3bc05be247a78248280050c5b8c1 to your computer and use it in GitHub Desktop.
Save rbayliss/a0af3bc05be247a78248280050c5b8c1 to your computer and use it in GitHub Desktop.
const puppeteer = require('puppeteer');
const EventEmitter = require('events');
/**
* Usage:
*
* const crawler = new Crawler('http://example.com');
*
* crawler.on('insecureRequests', (e) => {
* console.log(`Caught insecure request on ${e.source} to ${e.requests}`);
* });
* crawler.crawl();
*
* To set custom headers (ex: Auth headers), use crawler.headers.
* To set the filter function for which URLs are included in the crawl,
* override crawler.isIncluded.
*/
class Crawler extends EventEmitter {
  /**
   * @param {string} url - Starting URL. It is also handed to `urlIsIncluded`
   *   to build the default `isIncluded` filter.
   */
  constructor(url) {
    super();
    // Extra HTTP headers passed as-is to page.setExtraHTTPHeaders; skipped while null.
    this.headers = null;
    // Predicate deciding which discovered URLs are followed; callers may override it.
    this.isIncluded = urlIsIncluded(url);
    this._browser = null;
    this._queue = [url];
    // Seed with the start URL so a link pointing back to it is not crawled twice.
    this._seen = new Set([url]);
  }

  /**
   * Launches a browser, crawls every queued URL, and closes the browser.
   * The browser is closed even when the crawl rejects.
   *
   * @returns {Promise<void>} Settles once the browser has been closed.
   */
  async crawl() {
    this._browser = await puppeteer.launch({
      timeout: 5000,
    });
    try {
      await this.doCrawl();
    } finally {
      await this._browser.close();
    }
  }

  /**
   * Drains the queue one URL at a time, queueing the links found on each page.
   * Requires `this._browser` to be set (see `crawl`).
   *
   * @returns {Promise<void>}
   */
  async doCrawl() {
    while (this._queue.length > 0) {
      const url = this._queue.pop();
      this._queueUrls(await this._crawlOne(url));
    }
  }

  /**
   * Loads one URL in a fresh page, records requests whose URL starts with
   * `http:`, and collects the links present on the page.
   *
   * Navigation and link-extraction errors are logged and produce an empty
   * result, so one broken page does not abort the crawl. The page is always
   * closed before returning.
   *
   * @param {string} url - Page to load.
   * @returns {Promise<string[]>} De-duplicated links accepted by `isIncluded`.
   * @fires Crawler#insecureRequests with `{ source, requests }`, where
   *   `requests` is the newline-joined list of insecure request URLs.
   */
  async _crawlOne(url) {
    const page = await this._browser.newPage();
    const insecureRequests = [];
    let discoveredUrls = [];

    try {
      if (this.headers !== null && this.headers !== undefined) {
        await page.setExtraHTTPHeaders(this.headers);
      }

      page.on('request', (interceptedRequest) => {
        // Early Puppeteer releases exposed `url` as a property; later ones as a method.
        const requestUrl =
          typeof interceptedRequest.url === 'function'
            ? interceptedRequest.url()
            : interceptedRequest.url;
        if (typeof requestUrl === 'string' && requestUrl.startsWith('http:')) {
          insecureRequests.push(requestUrl);
        }
      });

      try {
        await page.goto(url);
        // Evaluated by page.evaluate; each link is rebuilt without its fragment.
        const urls = await page.evaluate(() => {
          const links = document.querySelectorAll('a[href]');
          return Array.from(
            links,
            (link) => `${link.protocol}//${link.host}${link.pathname}${link.search}`
          );
        });
        // Call through `this` so an overridden isIncluded sees exactly one argument.
        discoveredUrls = urls
          .filter(uniqueFilter())
          .filter((candidate) => this.isIncluded(candidate));
      } catch (err) {
        console.log(`Caught: ${err}`);
      }
    } finally {
      // Release the tab whatever happened; otherwise one page leaks per URL.
      await page.close();
    }

    if (insecureRequests.length > 0) {
      this.emit('insecureRequests', {
        source: url,
        requests: insecureRequests.join('\n'),
      });
    }
    return discoveredUrls;
  }

  /**
   * Adds every not-yet-seen URL to the crawl queue and marks it as seen.
   *
   * @param {string[]} urls - Candidate URLs.
   */
  _queueUrls(urls) {
    for (const candidate of urls) {
      if (!this._seen.has(candidate)) {
        this._seen.add(candidate);
        this._queue.push(candidate);
      }
    }
  }
}
/**
 * Builds the default crawl filter: a predicate that accepts a URL only when
 * `base` occurs at its very beginning.
 *
 * NOTE: this is a plain prefix comparison, not an origin check, so a base of
 * `http://example.com` also accepts `http://example.com.other.test/`.
 *
 * @param {string} base - Prefix every accepted URL must begin with.
 * @returns {function(string): boolean} Predicate usable with `Array.prototype.filter`.
 */
const urlIsIncluded = (base) => (candidate) => candidate.indexOf(base) === 0;
/**
 * Creates a stateful predicate for `Array.prototype.filter` that keeps only
 * the first occurrence of each element.
 *
 * Seen values are tracked in a `Set`, so values that happen to match inherited
 * object property names (e.g. "toString", "constructor") are not mistaken for
 * duplicates. Each call returns an independent predicate with its own state.
 *
 * @returns {function(*): boolean} `true` the first time a value is offered,
 *   `false` on every later occurrence.
 */
const uniqueFilter = () => {
  const seen = new Set();
  return (element) => {
    if (seen.has(element)) {
      return false;
    }
    seen.add(element);
    return true;
  };
};
// Export the Crawler class as this module's public interface (CommonJS).
module.exports = Crawler;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment