leongersen/fetch.html

## fetch.html
<!DOCTYPE html>
<script>

	// Per-site hook: edit this function to decide which paths the crawler may follow.
	/**
	 * Site-specific path filter.
	 *
	 * @param {string} a - Path to test; the comparison is case-insensitive.
	 * @returns {boolean} false when the path contains one of the excluded
	 *     segments, true otherwise.
	 */
	function isOkForDomain (a) {
		var path = a.toLowerCase();
		var excluded = ['/booking/', '/beschikbaarheden/', '/reviews/', '/prijzen/', '/media/'];

		// Reject the path as soon as any excluded segment is found in it.
		for (var i = 0; i < excluded.length; i++) {
			if (path.indexOf(excluded[i]) !== -1) {
				return false;
			}
		}

		return true;
	}

</script>
<div id="count"></div>
<script>var DOMAIN = 'http://vialora.vp.local';</script>
<script src="fetch.js"></script>

## fetch.js

	/**
	 * Tells whether a path looks like a page rather than a static asset.
	 *
	 * @param {string} a - Path to test; the comparison is case-insensitive.
	 * @returns {boolean} false when the path ends with one of the skipped
	 *     file extensions, true otherwise.
	 */
	function isOk(a){
		var path = a.toLowerCase();
		var skipped = ['.ico', '.js', '.css', '.jpg', '.jpeg', '.pdf', '.gif', '.png'];

		return !skipped.some(function (extension) {
			return path.endsWith(extension);
		});
	}

		// Will contain all urls
	var handled = [];

	// Number of requests that have been started but have not finished yet.
	var crawling = 0;

	// Element whose text shows the current number of open requests.
	var count = document.getElementById('count');

	// Links are found with a plain regex scan, not an HTML parser: anything
	// between double quotes that starts with '/' or 'http' (so 'https' too).
	var urlFinder = new RegExp('"((?:/|http)[^\"]+)"', "g");

	// Hostname and protocol of the crawled domain; assigned right below.
	var HOST;
	var PROTOCOL;

	// Assign DOMAIN to an anchor's href so the browser splits it into parts.
	(function(startUrl){
		var anchor = document.createElement('a');
		anchor.href = startUrl;
		PROTOCOL = anchor.protocol;
		HOST = anchor.hostname;
	}(DOMAIN));

	/**
	 * Fetch one page, scan its text for quoted URLs and start a crawl for every
	 * URL that has not been handled yet, resolves to HOST (or 'localhost') and
	 * passes both isOk() and isOkForDomain().
	 *
	 * Uses the shared state `handled`, `crawling`, `count`, `urlFinder`, `HOST`
	 * and `PROTOCOL`. When the pending-request counter drops back to zero, the
	 * list of handled URLs is logged to the console.
	 *
	 * @param {string} crawl_url - URL to request. It is stored in `handled`
	 *     verbatim, so 'http://host' and 'http://host/' count as different entries.
	 * @returns {Promise} Resolves once this page's own request has been dealt
	 *     with; crawls started for links found on the page may still be running.
	 */
	function crawl ( crawl_url ) {

		crawling++;
		count.innerText = crawling;

		handled.push(crawl_url);

		return fetch(crawl_url, {
			method: 'get'
		}).then(function(response) {
			return response.text();
		}).then(function(text) {
			// String#match returns null, not an empty array, when nothing matches.
			return text.match(urlFinder) || [];
		}).then(function(urls) {

			urls.forEach(function(quoted){
				// Strip the surrounding quotes and let an anchor resolve the URL.
				var p = document.createElement('a');
				p.href = quoted.slice(1, -1);

				// Rebuild the URL on the crawled host; only the path is kept.
				var url = PROTOCOL + '//' + HOST + p.pathname;

				if ( handled.indexOf(url) === -1 && (p.hostname === 'localhost' || p.hostname === HOST) && isOk(p.pathname) && isOkForDomain(p.pathname) ) {
					crawl(url);
				}
			});

		}).catch(function(reason) {
			// The handler sits after the processing steps so that a failed request
			// (or a failure while reading/scanning the page) skips them and still
			// reaches the bookkeeping below.
			console.log('Caught failure: ' + reason);
		}).then(function() {

			// Always runs, so the counter cannot get stuck above zero.
			crawling--;
			count.innerText = crawling;

			if ( !crawling ) {
				console.log('Done. Found ' + handled.length + ' urls.');
				console.log(handled.join('\n'));
			}
		});
	}

	// Start the crawl at the configured DOMAIN.
	crawl(DOMAIN);
	<!DOCTYPE html>
	<script>

	// Per-site hook: edit this function to decide which paths the crawler may follow.
	/**
	 * Site-specific path filter.
	 *
	 * @param {string} a - Path to test; the comparison is case-insensitive.
	 * @returns {boolean} false when the path contains one of the excluded
	 *     segments, true otherwise.
	 */
	function isOkForDomain (a) {
		var path = a.toLowerCase();
		var excluded = ['/booking/', '/beschikbaarheden/', '/reviews/', '/prijzen/', '/media/'];

		// Reject the path as soon as any excluded segment is found in it.
		for (var i = 0; i < excluded.length; i++) {
			if (path.indexOf(excluded[i]) !== -1) {
				return false;
			}
		}

		return true;
	}

	</script>
	<div id="count"></div>
	<script>var DOMAIN = 'http://vialora.vp.local';</script>
	<script src="fetch.js"></script>

	/**
	 * Tells whether a path looks like a page rather than a static asset.
	 *
	 * @param {string} a - Path to test; the comparison is case-insensitive.
	 * @returns {boolean} false when the path ends with one of the skipped
	 *     file extensions, true otherwise.
	 */
	function isOk(a){
		var path = a.toLowerCase();
		var skipped = ['.ico', '.js', '.css', '.jpg', '.jpeg', '.pdf', '.gif', '.png'];

		return !skipped.some(function (extension) {
			return path.endsWith(extension);
		});
	}

	// Will contain all urls
	var handled = [];

	// Number of requests that have been started but have not finished yet.
	var crawling = 0;

	// Element whose text shows the current number of open requests.
	var count = document.getElementById('count');

	// Links are found with a plain regex scan, not an HTML parser: anything
	// between double quotes that starts with '/' or 'http' (so 'https' too).
	var urlFinder = new RegExp('"((?:/\|http)[^\"]+)"', "g");

	// Hostname and protocol of the crawled domain; assigned right below.
	var HOST;
	var PROTOCOL;

	// Assign DOMAIN to an anchor's href so the browser splits it into parts.
	(function(startUrl){
		var anchor = document.createElement('a');
		anchor.href = startUrl;
		PROTOCOL = anchor.protocol;
		HOST = anchor.hostname;
	}(DOMAIN));

	/**
	 * Fetch one page, scan its text for quoted URLs and start a crawl for every
	 * URL that has not been handled yet, resolves to HOST (or 'localhost') and
	 * passes both isOk() and isOkForDomain().
	 *
	 * Uses the shared state `handled`, `crawling`, `count`, `urlFinder`, `HOST`
	 * and `PROTOCOL`. When the pending-request counter drops back to zero, the
	 * list of handled URLs is logged to the console.
	 *
	 * @param {string} crawl_url - URL to request. It is stored in `handled`
	 *     verbatim, so 'http://host' and 'http://host/' count as different entries.
	 * @returns {Promise} Resolves once this page's own request has been dealt
	 *     with; crawls started for links found on the page may still be running.
	 */
	function crawl ( crawl_url ) {

		crawling++;
		count.innerText = crawling;

		handled.push(crawl_url);

		return fetch(crawl_url, {
			method: 'get'
		}).then(function(response) {
			return response.text();
		}).then(function(text) {
			// String#match returns null, not an empty array, when nothing matches.
			return text.match(urlFinder) || [];
		}).then(function(urls) {

			urls.forEach(function(quoted){
				// Strip the surrounding quotes and let an anchor resolve the URL.
				var p = document.createElement('a');
				p.href = quoted.slice(1, -1);

				// Rebuild the URL on the crawled host; only the path is kept.
				var url = PROTOCOL + '//' + HOST + p.pathname;

				if ( handled.indexOf(url) === -1 && (p.hostname === 'localhost' || p.hostname === HOST) && isOk(p.pathname) && isOkForDomain(p.pathname) ) {
					crawl(url);
				}
			});

		}).catch(function(reason) {
			// The handler sits after the processing steps so that a failed request
			// (or a failure while reading/scanning the page) skips them and still
			// reaches the bookkeeping below.
			console.log('Caught failure: ' + reason);
		}).then(function() {

			// Always runs, so the counter cannot get stuck above zero.
			crawling--;
			count.innerText = crawling;

			if ( !crawling ) {
				console.log('Done. Found ' + handled.length + ' urls.');
				console.log(handled.join('\n'));
			}
		});
	}

	// Start the crawl at the configured DOMAIN.
	crawl(DOMAIN);