Skip to content

Instantly share code, notes, and snippets.

@leongersen
Created November 5, 2015 22:16
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save leongersen/70b8d85979a62f4a9b31 to your computer and use it in GitHub Desktop.
Save leongersen/70b8d85979a62f4a9b31 to your computer and use it in GitHub Desktop.
URL finder using fetch and promises. Logs a list of URLs on a domain to the console.
<!DOCTYPE html>
<script>
// Do anything in this function.
function isOkForDomain (a) {
    // Site-specific filter: a path is rejected as soon as it contains any
    // of these sections. The comparison is case-insensitive.
    var path = a.toLowerCase();
    var excludedSections = [
        '/booking/',
        '/beschikbaarheden/',
        '/reviews/',
        '/prijzen/',
        '/media/'
    ];

    return excludedSections.every(function (section) {
        return path.indexOf(section) === -1;
    });
}
</script>
<div id="count"></div>
<script>var DOMAIN = 'http://vialora.vp.local';</script>
<script src="fetch.js"></script>
function isOk(a){
    // Paths ending in one of these extensions point at static assets,
    // not pages, so they are not crawled. The check is case-insensitive.
    var path = a.toLowerCase();
    var assetExtensions = ['.ico', '.js', '.css', '.jpg', '.jpeg', '.pdf', '.gif', '.png'];

    var isAsset = assetExtensions.some(function (extension) {
        return path.endsWith(extension);
    });

    return !isAsset;
}
// Every URL handed to crawl() ends up in this list.
var handled = [];

// Number of requests that are currently pending.
var crawling = 0;

// Element used to show the current number of open requests in the document.
var count = document.getElementById('count');

// We're not parsing HTML. Instead, look for anything between double quotes
// that starts with '/' or 'http' (which covers 'https' as well).
var urlFinder = new RegExp('"((?:/|http)[^\"]+)"', "g");

// Host name and protocol of the crawled domain; assigned right below.
var HOST;
var PROTOCOL;

// Let an anchor element split DOMAIN into its parts.
(function(domain){
    var anchor = document.createElement('a');
    anchor.href = domain;
    PROTOCOL = anchor.protocol;
    HOST = anchor.hostname;
}(DOMAIN));
/**
 * Fetch one URL, scan the response text for quoted links and recursively
 * crawl every accepted link that has not been queued before.
 *
 * Reads and updates the shared `handled`, `crawling` and `count` state.
 * When the last pending request settles, the collected URLs are logged to
 * the console.
 *
 * @param {string} crawl_url - URL to fetch.
 */
function crawl ( crawl_url ) {
    crawling++;
    count.innerText = crawling;
    handled.push(crawl_url);

    fetch(crawl_url, {
        method: 'get'
    }).then(function(response) {
        return response.text();
    }).then(function(text) {
        // String.prototype.match returns null (not an empty array) when a
        // page contains no quoted URLs; treat that as "no links".
        var quoted = text.match(urlFinder) || [];

        quoted.forEach(function(match){
            // Assigning to href lets the anchor element split the URL into
            // hostname and pathname; slice() strips the surrounding quotes.
            var p = document.createElement('a');
            p.href = match.slice(1, -1);

            // Only the pathname is kept, re-rooted on the crawled host.
            var url = PROTOCOL + '//' + HOST + p.pathname;

            if ( handled.indexOf(url) === -1 && (p.hostname === 'localhost' || p.hostname === HOST) && isOk(p.pathname) && isOkForDomain(p.pathname) ) {
                crawl(url);
            }
        });
    }).catch(function(reason) {
        // Placed after the processing steps so that a failed request or a
        // processing error is logged once and still reaches the bookkeeping
        // step below instead of leaving the counter stuck.
        console.log('Caught failure: ' + reason);
    }).then(function() {
        crawling--;
        count.innerText = crawling;

        if ( crawling === 0 ) {
            console.log('Done. Found ' + handled.length + ' urls.');
            console.log(handled.join('\n'));
        }
    });
}
crawl(DOMAIN);
@leongersen
Copy link
Author

You'll have to run a browser without web security to crawl external domains: chrome.exe --disable-web-security.

@leongersen
Copy link
Author

The number of concurrent requests is set by the browser.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment