Last active
July 8, 2019 20:20
-
-
Save arall/51a3d19bcbafd1db87e8053c18906307 to your computer and use it in GitHub Desktop.
NodeJS Headless Chrome Crawler Traffic Interceptor
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Crawl a site with headless Chrome and print every unique URL it requests.
// Usage: node crawler.js <url> <depth>
const HCCrawler = require('headless-chrome-crawler');
const url = require('url');

const args = process.argv.slice(2);
// Accumulates every intercepted request URL across all crawled pages;
// deduplicated and printed once the crawl finishes.
let requests = [];

if (args[0] === undefined || args[1] === undefined) {
  // Usage errors belong on stderr with a non-zero exit code so callers
  // and shell scripts can detect the failed invocation.
  console.error('node crawler.js <url> <depth>');
  process.exit(1);
}
// Main entry point: launch the crawler, intercept every network request
// on each visited page, then print the deduplicated list of request URLs.
(async () => {
  const crawler = await HCCrawler.launch({
    // args[1] arrives as a string from argv; the crawler expects a number.
    maxDepth: Number.parseInt(args[1], 10),
    customCrawl: async (page, crawl) => {
      // Hook the page before navigation so every request is observed.
      await page.setRequestInterception(true);
      page.on('request', (request) => {
        requests.push(request.url());
        request.continue();
      });
      return crawl();
    },
    onSuccess: (result) => {
      // Per-page results are unused; URLs are gathered via interception.
    },
  });

  try {
    await crawler.queue({
      url: args[0],
      // Restrict the crawl to the starting host. WHATWG `new URL` replaces
      // the deprecated legacy url.parse() API.
      allowedDomains: [new URL(args[0]).hostname],
    });
    await crawler.onIdle();
  } finally {
    // Always shut the browser down, even if queueing or crawling throws.
    await crawler.close();
  }

  // Dedupe while preserving first-seen order, then print one URL per line.
  requests = [...new Set(requests)];
  for (const value of requests) {
    console.log(value);
  }
})().catch((err) => {
  // Without this handler a failure would be an unhandled rejection and the
  // process could exit 0 despite doing no work.
  console.error(err);
  process.exitCode = 1;
});
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Installation
yarn add headless-chrome-crawler
Usage
node crawler.js https://example.com/ 2