Created
February 6, 2018 02:48
-
-
Save brookjordan/9019642d32028d2f080e4be15103c82e to your computer and use it in GitHub Desktop.
Crawls your site starting from the home page and generates a list of every internal page reachable by following links.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Crawler state. These are declared with `var` on purpose: the snippet only
// works when run in a page on the same origin as `baseURL` (it reads the
// documents of windows it opens), and `var` keeps the results reachable as
// globals afterwards and lets the snippet be pasted again without
// redeclaration errors.

// Origin of the site to crawl; every page URL below is relative to it.
var baseURL = 'https://www.tradegecko.com';
// Crawl results keyed by site-relative URL. Each entry receives either a
// `references` list (links found on the page) or an `errors` list.
var pages = {};
// Every site-relative URL discovered so far, seeded with the home page.
var pageURLs = ['/'];
// Work queue of discovered URLs that have not been picked up yet.
var uncrawledPageURLs = [...pageURLs];
// URLs that have been picked up for crawling, in pick-up order.
var crawledPageURLs = [];
// Number of crawls currently in flight.
var openCrawls = 0;
// Upper bound on simultaneous crawls (each one opens a popup window).
var maxCrawls = 3;

// Kick off the crawl from the home page.
findNewPages();
/**
 * Takes the next uncrawled URL off the queue and crawls it, keeping up to
 * `maxCrawls` crawls in flight at once.
 *
 * For each page it opens the URL in a small popup window, checks the HTTP
 * status with a parallel XHR, and once the window has loaded collects the
 * page's internal links via `getAnchorsFromPage`. Links not seen before are
 * appended to `pageURLs` and `uncrawledPageURLs`; the result is stored on
 * `pages[pageURL]` as `references` (on success) or `errors` (on failure).
 *
 * Every outcome - success, HTTP error, network error, blocked popup,
 * inaccessible window, failed link extraction - releases the crawl slot and
 * closes the popup, so a single bad page cannot stall the whole crawl.
 */
function findNewPages() {
  if (uncrawledPageURLs.length === 0 || openCrawls >= maxCrawls) { return; }

  const pageURL = uncrawledPageURLs.pop();
  crawledPageURLs.push(pageURL);
  openCrawls += 1;
  // Immediately try to fill the remaining crawl slots.
  findNewPages();

  const pageObject = pages[pageURL] = {};
  const fullURL = `${ baseURL }${ pageURL }`;
  const pageWindow = window.open(fullURL, '_blank', 'width=1,height=1');

  // Several callbacks can report the end of this crawl; only the first counts.
  let isFinished = false;
  const finishCrawl = errorMessage => {
    if (isFinished) { return; }
    isFinished = true;
    if (errorMessage) { pageObject.errors = [errorMessage]; }
    if (pageWindow) { pageWindow.close(); }
    openCrawls -= 1;
    findNewPages();
  };

  // window.open() returns null when the popup is blocked.
  if (!pageWindow) {
    finishCrawl('Page window could not be opened');
    return;
  }

  const collectLinks = () => {
    getAnchorsFromPage(pageWindow)
      .then(hrefs => {
        const newPageURLs = hrefs.filter(href => !pageURLs.includes(href));
        pageURLs.push(...newPageURLs);
        uncrawledPageURLs.push(...newPageURLs);
        pageObject.references = hrefs;
        finishCrawl();
      })
      .catch(err => finishCrawl(`Failed to read links from page: ${ String(err) }`));
  };

  const xhr = new XMLHttpRequest();
  xhr.open('GET', fullURL, true);
  xhr.onload = () => {
    if (xhr.status >= 400) {
      finishCrawl('Page failed to load');
      return;
    }
    try {
      // A freshly opened window reports an already-complete `about:blank`
      // document until navigation commits, so both checks are needed to tell
      // "target page already loaded" apart from "not started yet".
      const hasLoaded = pageWindow.location.href !== 'about:blank' &&
        pageWindow.document.readyState === 'complete';
      if (hasLoaded) {
        // The load event has already fired; a late handler would never run.
        collectLinks();
      } else {
        pageWindow.onload = collectLinks;
      }
    } catch (err) {
      // Presumably the window ended up on another origin (e.g. a redirect),
      // where these property accesses are refused.
      finishCrawl(`Page window is not accessible: ${ String(err) }`);
    }
  };
  // Network-level failures never reach `onload`.
  xhr.onerror = () => finishCrawl('Page failed to load');
  xhr.send(null);
}
/**
 * Collects the unique internal page paths linked from a loaded page.
 *
 * After `settleDelay` milliseconds it reads every `<a>` element in the
 * window's document and keeps the links that point at `baseURL`. Each kept
 * link is reduced to its site-relative form (everything after `baseURL`,
 * with the `#fragment` removed). Paths containing a `.` are dropped, and
 * duplicates are removed, keeping first-seen order.
 *
 * @param {Window} win - Window whose document should be scanned.
 * @param {number} [settleDelay=2000] - Milliseconds to wait before scanning.
 * @returns {Promise<string[]>} Site-relative paths; rejects if the document
 *   cannot be read.
 */
function getAnchorsFromPage(win, settleDelay = 2000) {
  // Returns the site-relative path for an internal link, or '' otherwise.
  const internalPathOf = href => {
    // SVG <a> elements expose `href` as an object rather than a string.
    if (typeof href !== 'string' || !href.startsWith(baseURL)) { return ''; }
    const rest = href.slice(baseURL.length);
    // Reject look-alike hosts such as `${ baseURL }.evil.com`.
    if (rest !== '' && !'/?#'.includes(rest[0])) { return ''; }
    return rest.split('#')[0];
  };

  return new Promise((res, rej) => {
    setTimeout(() => {
      try {
        const paths = Array.from(win.document.querySelectorAll('a'))
          .map(a => internalPathOf(a.href))
          .filter(path => path && !path.includes('.'));
        res(Array.from(new Set(paths)));
      } catch (err) {
        // Without this the promise would stay pending forever.
        rej(err);
      }
    }, settleDelay);
  });
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment