@weeksdev · Created November 27, 2015 22:53
PhantomJs Escaped Fragment Crawler
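This script pre-renders a hash-bang (#!) single-page app for Google's (now deprecated) AJAX crawling scheme. Starting from baseUrl, it loads each page in PhantomJS, waits for client-side rendering to finish, writes the resulting HTML into saveFolder, and recursively follows any same-origin hash links it finds, so each #!/route ends up as a static snapshot a crawler can fetch as ?_escaped_fragment_=/route.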
var fs = require('fs'),
    //root url to start the crawl from; only hashes in <a> links under this baseUrl are followed,
    //so a link to some other website such as http://www.abc.com/something-here is NOT parsed
    baseUrl = 'http://localhost:3000/',
    //folder to save the rendered html pages to
    saveFolder = 'public/_escaped_fragment_/',
    //array containing every link already parsed
    parsedLinks = [],
    //count of pages still being processed, so we know when the crawl is finished
    pendingPages = 0;

//parse the given url/hash, save the rendered html, and recurse into any new hash links found
var checkPage = function (page, url, hash) {
    parsedLinks.push(url + hash);
    pendingPages++;
    page.open(url + hash, function (status) {
        //derive the output file name from the hash, e.g. '#!/about' -> '/about.html'
        var filePath = hash.replace('#!', '').replace('#', '') + '.html';
        if (filePath === '.html') {
            filePath = 'Index.html';
        }
        console.log(filePath);
        //give client-side rendering two seconds to settle before scraping the page
        setTimeout(function () {
            var content = page.evaluate(function () {
                return document.body.innerHTML;
            });
            fs.write(saveFolder + filePath, content, 'w');
            //collect every same-origin hash link rendered on this page
            var hashes = page.evaluate(function () {
                var elements = document.getElementsByTagName('a');
                var hashes = [];
                for (var i = 0; i < elements.length; i++) {
                    if (elements[i].hash !== '' && elements[i].href.indexOf(location.href) === 0) {
                        hashes.push(elements[i].hash);
                    }
                }
                return hashes;
            });
            //crawl each hash not yet seen, each in its own page instance
            hashes.forEach(function (hash) {
                if (parsedLinks.indexOf(url + hash) === -1) {
                    checkPage(require('webpage').create(), url, hash);
                }
            });
            page.close();
            //exit once every queued page has been written out
            if (--pendingPages === 0) {
                phantom.exit();
            }
        }, 2000);
    });
};

//start the crawl at the root url
checkPage(require('webpage').create(), baseUrl, '');
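Run the crawl from the project root (assuming the script is saved as crawler.js and the PhantomJS binary is on your PATH): phantomjs crawler.js

The snapshots still have to be served back when a crawler asks for ?_escaped_fragment_=...; the gist doesn't include that piece. Below is a minimal sketch assuming an Express server and the folder layout above, with the file mapping mirroring the crawler's naming — hypothetical code, not part of the original gist:

var express = require('express');
var path = require('path');

var app = express();

//when a crawler requests ?_escaped_fragment_=/about, serve the pre-rendered snapshot
app.use(function (req, res, next) {
    var fragment = req.query._escaped_fragment_;
    if (fragment === undefined) {
        return next(); //normal browser request, fall through to the SPA
    }
    //mirror the crawler's naming: '' -> Index.html, '/about' -> /about.html
    //note: a real server should sanitize fragment to prevent path traversal
    var file = fragment === '' ? 'Index.html' : fragment + '.html';
    res.sendFile(path.join(__dirname, 'public', '_escaped_fragment_', file));
});

app.use(express.static('public'));
app.listen(3000);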