Crawl all internal URLs with the Crawler npm package
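The script depends on the crawler package from npm, which fetches pages and exposes a Cheerio-style `res.$` for parsing them. A quick setup, assuming a standard Node.js project:

    npm install crawler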
/*
Author: Evyatar Meged
Collaborator: syedshabbir
Source: https://stackoverflow.com/questions/50154133/how-to-crawl-all-the-internal-urls-of-a-website-using-crawler
*/
const Crawler = require('crawler');

let obsolete = []; // URLs that have already been queued for crawling
let c = new Crawler();

function crawlAllUrls(url) {
  console.log(`Crawling ${url}`);
  c.queue({
    uri: url,
    callback: function (err, res, done) {
      if (err) {
        console.error(err);
        return done();
      }
      let $ = res.$; // Cheerio-style selector for the fetched page
      try {
        let urls = $('a');
        Object.keys(urls).forEach((item) => {
          if (urls[item].type === 'tag') {
            let href = urls[item].attribs.href;
            if (href && !obsolete.includes(href)) {
              href = href.trim();
              obsolete.push(href);
              // Slow down the crawl so the target server isn't flooded
              setTimeout(function () {
                // Absolute internal links are followed as-is;
                // relative links are appended to the base URL.
                if (href.startsWith(url)) {
                  crawlAllUrls(href);
                } else {
                  crawlAllUrls(`${url}${href}`);
                }
              }, 5000);
            }
          }
        });
      } catch (e) {
        console.error(`Encountered an error crawling ${url}: ${e.message}`);
      }
      done(); // Always release the crawler slot exactly once
    }
  });
}

crawlAllUrls('https://github.com/evyatarmeged/');
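One caveat: joining relative links with `${url}${href}` breaks for hrefs like `../page` or protocol-relative `//host/page`. A sketch of a more robust alternative, using Node's built-in WHATWG `URL` class (not part of the original gist; `resolveInternal` is a hypothetical helper name), might look like:

function resolveInternal(href, base) {
  try {
    // new URL(href, base) resolves relative paths, query strings,
    // and protocol-relative URLs against the page they were found on.
    const resolved = new URL(href, base);
    // Only follow http(s) links that stay on the same host;
    // this filters out mailto:, javascript:, and external links.
    if (resolved.host === new URL(base).host &&
        resolved.protocol.startsWith('http')) {
      return resolved.href;
    }
  } catch (e) {
    // Ignore hrefs that can't be parsed as URLs at all.
  }
  return null;
}

This would replace the `startsWith`/template-string branch inside the `setTimeout` above: resolve first, then crawl only when a non-null internal URL comes back.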