Skip to content

Instantly share code, notes, and snippets.

@brookjordan
Created February 6, 2018 02:48
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save brookjordan/9019642d32028d2f080e4be15103c82e to your computer and use it in GitHub Desktop.
Save brookjordan/9019642d32028d2f080e4be15103c82e to your computer and use it in GitHub Desktop.
Crawls your site and generates a list of internal pages accessible by crawling from the home page.
// ---- Crawler configuration ----

// Origin prepended to every crawled path. Paths start with '/', so there is
// no trailing slash here.
var baseURL = 'https://www.tradegecko.com';

// Maximum number of crawls allowed to be in flight at the same time.
var maxCrawls = 3;

// ---- Crawler state (shared with findNewPages) ----

// Per-path record; findNewPages stores `references` or `errors` on each entry.
var pages = {};

// Every path discovered so far, seeded with the home page.
var pageURLs = ['/'];

// Paths still waiting to be crawled; starts out as a copy of pageURLs.
var uncrawledPageURLs = Array.from(pageURLs);

// Paths that have been taken off the queue for crawling.
var crawledPageURLs = [];

// Number of crawls currently in flight.
var openCrawls = 0;

// Kick off the crawl (findNewPages is a hoisted function declaration).
findNewPages();
/**
 * Starts crawling queued pages until the queue is empty or `maxCrawls`
 * crawls are in flight.
 *
 * For the page it takes off the queue this function:
 *   1. opens the page in a small popup window,
 *   2. issues a GET request for the same URL to learn its HTTP status,
 *   3. on success, reads the page's internal links via `getAnchorsFromPage`,
 *      queues the ones not seen before and stores them as
 *      `pages[pageURL].references`,
 *   4. on any failure, stores a message in `pages[pageURL].errors`.
 *
 * Every outcome closes the popup, frees the crawl slot and calls
 * `findNewPages()` again so the crawl keeps going.
 *
 * Reads and mutates the file-level crawl state (`pages`, `pageURLs`,
 * `uncrawledPageURLs`, `crawledPageURLs`, `openCrawls`).
 */
function findNewPages() {
  if (uncrawledPageURLs.length === 0 || openCrawls >= maxCrawls) { return; }

  const pageURL = uncrawledPageURLs.pop();
  crawledPageURLs.push(pageURL);
  openCrawls += 1;
  // Fill the remaining crawl slots before starting this page's own work.
  findNewPages();

  const pageObject = pages[pageURL] = {};
  const fullURL = `${ baseURL }${ pageURL }`;
  const pageWindow = window.open(fullURL, '_blank', 'width=1,height=1');
  let isFinished = false;

  // Single exit point for a crawl: guarantees that the slot is released
  // exactly once and that the popup does not stay open, whatever happened.
  const finishCrawl = errorMessage => {
    if (isFinished) { return; }
    isFinished = true;
    if (errorMessage !== undefined) {
      pageObject.errors = [errorMessage];
    }
    if (pageWindow && !pageWindow.closed) {
      pageWindow.close();
    }
    openCrawls -= 1;
    findNewPages();
  };

  // window.open returns null when the popup is blocked; without this guard
  // the crawl slot would never be released.
  if (!pageWindow) {
    finishCrawl('Page window could not be opened');
    return;
  }

  const collectAnchors = () => {
    getAnchorsFromPage(pageWindow).then(
      hrefs => {
        const newPageURLs = hrefs.filter(href => pageURLs.indexOf(href) === -1);
        pageURLs.push(...newPageURLs);
        uncrawledPageURLs.push(...newPageURLs);
        pageObject.references = hrefs;
        finishCrawl();
      },
      // Two-argument form on purpose: only failures of the link extraction
      // end up here, not errors thrown while recording the results.
      err => {
        const reason = err && err.message ? err.message : String(err);
        finishCrawl(`Page links could not be read: ${ reason }`);
      }
    );
  };

  const xhr = new XMLHttpRequest();
  xhr.open('GET', fullURL, true);
  xhr.onload = () => {
    if (+xhr.status >= 400) {
      finishCrawl('Page failed to load');
      return;
    }

    let isAlreadyLoaded;
    try {
      // The popup may have finished loading before this request did, in
      // which case its load event has already fired and will not fire again.
      // A popup still showing its initial about:blank is not loaded yet.
      isAlreadyLoaded = pageWindow.location.href !== 'about:blank'
        && pageWindow.document.readyState === 'complete';
    } catch (err) {
      // Reading location/document throws when the popup is not same-origin.
      finishCrawl(`Page window is not accessible: ${ err.message }`);
      return;
    }

    if (isAlreadyLoaded) {
      collectAnchors();
    } else {
      pageWindow.onload = collectAnchors;
    }
  };
  // Network-level failures never trigger onload, so handle them here.
  xhr.onerror = () => finishCrawl('Page failed to load');
  xhr.send(null);
}
/**
 * Collects the internal page paths linked from a window's document.
 *
 * After waiting `settleDelayMs`, every `<a>` element in `win.document` is
 * read. A link is kept when its `href` starts with `origin` followed by '/'.
 * The origin and any '#fragment' are stripped, paths containing a '.' are
 * dropped, and duplicates are removed (first occurrence wins).
 *
 * @param {Window} win - Window whose document is scanned for anchors.
 * @param {Object} [options]
 * @param {number} [options.settleDelayMs=2000] - Delay before the document
 *   is read.
 * @param {string} [options.origin=baseURL] - Origin that marks a link as
 *   internal; must not end with '/'.
 * @returns {Promise<string[]>} Resolves with the unique internal paths.
 *   Rejects if the document cannot be read (for example because the window
 *   was closed or is not same-origin) instead of never settling.
 */
function getAnchorsFromPage(win, { settleDelayMs = 2000, origin = baseURL } = {}) {
  // Turns an absolute href into a site-relative path, or '' when the link is
  // not an internal page link.
  const toInternalPath = href => {
    // SVG <a> elements expose a non-string href; treat those as external.
    if (typeof href !== 'string' || !href.startsWith(origin)) { return ''; }
    const remainder = href.slice(origin.length);
    // Requiring '/' rejects look-alike hosts such as `${ origin }munity/...`.
    if (!remainder.startsWith('/')) { return ''; }
    return remainder.split('#')[0];
  };

  return new Promise((resolve, reject) => {
    setTimeout(() => {
      try {
        const paths = Array.from(win.document.querySelectorAll('a'))
          .map(anchor => toInternalPath(anchor.href))
          .filter(path => path && path.indexOf('.') === -1);
        // A Set keeps insertion order, so the first occurrence of each path wins.
        resolve(Array.from(new Set(paths)));
      } catch (err) {
        // An exception thrown inside a timer callback would otherwise leave
        // the promise pending forever.
        reject(err);
      }
    }, settleDelayMs);
  });
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment