Crawl all internal URLs with the Crawler npm package
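The script depends on the crawler package from npm, which fetches pages and exposes a Cheerio-style `res.$` for parsing them. A quick setup, assuming a standard Node.js project:

    npm install crawler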
/*
Author: Evyatar Meged
Collaborator: syedshabbir
Source: https://stackoverflow.com/questions/50154133/how-to-crawl-all-the-internal-urls-of-a-website-using-crawler
*/
const Crawler = require('crawler');

let obsolete = []; // URLs that have already been queued for crawling
let c = new Crawler();

function crawlAllUrls(url) {
  console.log(`Crawling ${url}`);
  c.queue({
    uri: url,
    callback: function (err, res, done) {
      if (err) {
        console.error(err);
        return done();
      }
      let $ = res.$; // Cheerio-style selector for the fetched page
      try {
        let urls = $('a');
        Object.keys(urls).forEach((item) => {
          if (urls[item].type === 'tag') {
            let href = urls[item].attribs.href;
            if (href && !obsolete.includes(href)) {
              href = href.trim();
              obsolete.push(href);
              // Slow down the crawl so the target server isn't flooded
              setTimeout(function () {
                // Absolute internal links are followed as-is;
                // relative links are appended to the base URL.
                if (href.startsWith(url)) {
                  crawlAllUrls(href);
                } else {
                  crawlAllUrls(`${url}${href}`);
                }
              }, 5000);
            }
          }
        });
      } catch (e) {
        console.error(`Encountered an error crawling ${url}: ${e.message}`);
      }
      done(); // Always release the crawler slot exactly once
    }
  });
}

crawlAllUrls('https://github.com/evyatarmeged/');
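One caveat: joining relative links with `${url}${href}` breaks for hrefs like `../page` or protocol-relative `//host/page`. A sketch of a more robust alternative, using Node's built-in WHATWG `URL` class (not part of the original gist; `resolveInternal` is a hypothetical helper name), might look like:

function resolveInternal(href, base) {
  try {
    // new URL(href, base) resolves relative paths, query strings,
    // and protocol-relative URLs against the page they were found on.
    const resolved = new URL(href, base);
    // Only follow http(s) links that stay on the same host;
    // this filters out mailto:, javascript:, and external links.
    if (resolved.host === new URL(base).host &&
        resolved.protocol.startsWith('http')) {
      return resolved.href;
    }
  } catch (e) {
    // Ignore hrefs that can't be parsed as URLs at all.
  }
  return null;
}

This would replace the `startsWith`/template-string branch inside the `setTimeout` above: resolve first, then crawl only when a non-null internal URL comes back.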