@wlepinski
Forked from joseraya/snapshot-crawler.js
Created February 3, 2014 12:45
var Browser = require('zombie'),
    url = require('url'),
    fs = require('fs'),
    path = require('path'),
    saveDir = __dirname + '/_snapshots';
var scriptTagRegex = /<script\b[^<]*(?:(?!<\/script>)<[^<]*)*<\/script>/gi;
// Strip <script> tags from a snapshot's HTML (defined here but not applied below)
var stripScriptTags = function(html) {
  return html.replace(scriptTagRegex, '');
};
// Create dirPath, recursively creating any missing parent directories (like `mkdir -p`)
var mkdirParent = function(dirPath, mode, callback) {
  // Call the standard fs.mkdir
  fs.mkdir(dirPath, mode, function(error) {
    // If the parent directory does not exist yet, create it first, then retry
    if (error && error.code === 'ENOENT') {
      mkdirParent(path.dirname(dirPath), mode, function() {
        mkdirParent(dirPath, mode, callback);
      });
      return;
    }
    // Manually run the callback since we used our own callback to do all these
    callback && callback(error);
  });
};
var saveSnapshot = function(uri, body) {
  var pagePath;
  var lastIdx = uri.lastIndexOf('#!/');
  if (lastIdx < 0) {
    // If we're using html5mode
    pagePath = url.parse(uri).pathname;
  } else {
    // If we're using hashbang mode
    pagePath = uri.substring(lastIdx + 2, uri.length);
  }
  if (pagePath === '/') pagePath = "/index.html";
  if (pagePath.indexOf('.html') === -1) pagePath += ".html";
  var filename = saveDir + pagePath;
  console.log("Saving ", uri, " to ", filename);
  var dirname = path.dirname(filename);
  // Make sure the target directory exists, then write the snapshot
  mkdirParent(dirname, undefined, function() {
    fs.writeFile(filename, body, function(e) {
      if (e) console.error("Error writing ", filename, e);
    });
  });
};
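// For illustration (hypothetical URLs), the mapping above produces:
//   hashbang mode:  http://localhost:4000/#!/about  ->  _snapshots/about.html
//   html5mode:      http://localhost:4000/about     ->  _snapshots/about.html
//   root page:      http://localhost:4000/#!/       ->  _snapshots/index.html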
var browserOpts = {
  waitFor: "100ms",
  loadCSS: false,
  waitDuration: "100ms"
};
var browser = new Browser(browserOpts);
// Visit arr[idx], wait for the app to mark itself ready, snapshot the page,
// queue any new links found on it, then move on to the next URL in arr.
var crawlPage = function(idx, arr) {
  if (idx < arr.length) {
    var uri = arr[idx];
    console.time("voy");
    browser.visit(uri)
      .then(function() {
        console.timeEnd("voy");
        var intervalId = setInterval(function() {
          console.log("checking status");
          var status = browser.body.getAttribute('data-status');
          console.log(status);
          if (status === "ready") {
            clearInterval(intervalId);
            // Turn links into absolute links
            // and save them, if we need to
            // and we haven't already crawled them
            var links = browser.queryAll('a');
            links.forEach(function(link) {
              var href = link.getAttribute('href');
              if (!href) return;
              var absUrl = url.resolve(uri, href);
              link.setAttribute('href', absUrl);
              if (arr.indexOf(absUrl) < 0) {
                arr.push(absUrl);
              }
            });
            // Save
            saveSnapshot(uri, browser.html());
            // Call again on the next iteration
            crawlPage(idx + 1, arr);
          }
        }, 500);
      });
  }
};
crawlPage(0, ["http://localhost:4000/#!/"]);
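// ---------------------------------------------------------------------------
// Usage sketch (assuming the file is saved as snapshot-crawler.js, matching
// the gist this was forked from, and that the app being crawled is running at
// http://localhost:4000 as in the seed URL above):
//
//   npm install zombie
//   node snapshot-crawler.js
//
// The crawler only treats a page as done once the app sets
// data-status="ready" on <body>; one way the app could signal that is:
//
//   document.body.setAttribute('data-status', 'ready');
//
// Snapshots are written under ./_snapshots, mirroring the route paths. Every
// href found on a page is queued, so this works best when all links point
// back at the same host.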