Created
August 29, 2017 23:13
-
-
Save rbayliss/a0af3bc05be247a78248280050c5b8c1 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
const puppeteer = require('puppeteer'); | |
const EventEmitter = require('events'); | |
/**
 * Crawls a site with Puppeteer, emitting an `insecureRequests` event for
 * every page that triggers plain-HTTP (mixed-content) requests.
 *
 * Usage:
 *
 *   const crawler = new Crawler('http://example.com');
 *
 *   crawler.on('insecureRequests', (e) => {
 *     console.log(`Caught insecure request on ${e.source} to ${e.requests}`);
 *   });
 *   crawler.crawl();
 *
 * To set custom headers (ex: Auth headers), use crawler.headers.
 * To set the filter function for which URLs are included in the crawl,
 * override crawler.isIncluded.
 */
class Crawler extends EventEmitter {
  /**
   * @param {string} url - Entry-point URL; also used as the prefix that
   *   decides whether discovered links are in scope.
   */
  constructor(url) {
    super();
    this.headers = null;
    this.isIncluded = urlIsIncluded(url);
    this._browser = null;
    this._queue = [url];
    // Fix: seed the seen-set with the start URL. The original left it empty,
    // so any page linking back to the entry point re-queued and re-crawled it.
    this._seen = new Set([url]);
  }

  /**
   * Launches the browser, crawls every in-scope URL, then shuts down.
   * @returns {Promise<void>}
   */
  async crawl() {
    this._browser = await puppeteer.launch({
      timeout: 5000
    });
    try {
      await this.doCrawl();
    } finally {
      // Fix: close the browser even if the crawl throws; the original
      // leaked the browser process on error.
      await this._browser.close();
    }
  }

  /**
   * Drains the URL queue, crawling one page at a time and queueing any
   * newly discovered in-scope URLs.
   * @returns {Promise<void>}
   */
  async doCrawl() {
    let url;
    while ((url = this._queue.pop()) !== undefined) {
      this._queueUrls(await this._crawlOne(url));
    }
  }

  /**
   * Visits a single URL, records any insecure (http:) sub-requests it
   * triggers, and returns the in-scope links found on the page.
   * @param {string} url - The page to visit.
   * @returns {Promise<string[]>} Deduplicated, in-scope discovered URLs.
   */
  async _crawlOne(url) {
    const page = await this._browser.newPage();
    try {
      // Fix: only set extra headers when some were configured; the original
      // passed `null` to setExtraHTTPHeaders unconditionally.
      if (this.headers) {
        await page.setExtraHTTPHeaders(this.headers);
      }
      const insecureRequests = [];
      page.on('request', (interceptedRequest) => {
        // NOTE(review): `url` is a property in the puppeteer version this
        // targets; newer puppeteer exposes it as a method (`request.url()`).
        if (interceptedRequest.url.indexOf('http:') === 0) {
          insecureRequests.push(interceptedRequest.url);
        }
      });
      let discoveredUrls = [];
      try {
        await page.goto(url);
        const urls = await page.evaluate(() => {
          const links = document.querySelectorAll('a[href]');
          return Array.prototype.map.call(links, u => {
            return `${u.protocol}//${u.host}${u.pathname}${u.search}`;
          });
        });
        discoveredUrls = urls.filter(uniqueFilter()).filter(this.isIncluded);
      } catch (err) {
        // Best-effort: a navigation/evaluation failure on one page should
        // not abort the whole crawl.
        console.log('Caught: ' + err);
      }
      if (insecureRequests.length) {
        this.emit('insecureRequests', {
          source: url,
          requests: insecureRequests.join("\n")
        });
      }
      return discoveredUrls;
    } finally {
      // Fix: the original never closed pages, leaking one tab per URL.
      await page.close();
    }
  }

  /**
   * Adds not-yet-seen URLs to the crawl queue.
   * @param {string[]} urls - Candidate URLs discovered on a page.
   */
  _queueUrls(urls) {
    urls.forEach(u => {
      if (!this._seen.has(u)) {
        this._seen.add(u);
        this._queue.push(u);
      }
    });
  }
}
/**
 * Builds a predicate that keeps only URLs within the crawl scope, i.e.
 * URLs that begin with the crawler's base URL.
 * @param {string} base - The crawl's base URL prefix.
 * @returns {(url: string) => boolean} True when `url` starts with `base`.
 */
const urlIsIncluded = (base) => {
  // Idiom: startsWith expresses the prefix test directly, replacing the
  // original `indexOf(base) === 0`.
  return (url) => url.startsWith(base);
};
/**
 * Returns a stateful Array#filter callback that keeps only the first
 * occurrence of each element.
 *
 * Fix: uses a Set instead of a plain object. The original's
 * `element in seen` check also matched keys inherited from
 * Object.prototype, so elements like 'toString' or 'constructor' were
 * wrongly treated as duplicates and silently dropped.
 * @returns {(element: *) => boolean}
 */
const uniqueFilter = () => {
  const seen = new Set();
  return (element) => {
    if (seen.has(element)) {
      return false;
    }
    seen.add(element);
    return true;
  };
};
module.exports = Crawler; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment