Skip to content

Instantly share code, notes, and snippets.

@tacryt-socryp
Created July 21, 2014 13:11
Show Gist options
  • Save tacryt-socryp/9bdecfc7abbab3ed15c0 to your computer and use it in GitHub Desktop.
Save tacryt-socryp/9bdecfc7abbab3ed15c0 to your computer and use it in GitHub Desktop.
Website Scraper using PhantomJS
/* globals require, __dirname, console, process, setTimeout, document */
var phantom = require("phantom"),
  system = require("system"),
  jsdom = require("jsdom"),
  fs = require("fs"),
  path = require("path"),
  url = require("url"),
  saveDir = __dirname + "/snapshot",
  ph;
/**
 * Start a PhantomJS instance with the disk cache disabled, remember it in the
 * module-level `ph` variable, and begin crawling at the first queued page.
 *
 * @param {string} base - Prefix used to absolutize and scope crawled links.
 * @param {string[]} arr - Crawl queue; passed through to crawlPage unchanged.
 */
function createPhantom(base, arr) {
  var firstIndex = 0;

  // Invoked by the phantom bridge once the instance is available.
  var onInstanceReady = function (instance) {
    ph = instance;
    crawlPage(base, firstIndex, arr);
  };

  phantom.create("--disk-cache=no", onInstanceReady);
}
/**
 * Render arr[i] in PhantomJS, rewrite its links to absolute URLs, queue any
 * newly discovered in-scope pages, save a snapshot, then move on to arr[i + 1].
 * When the queue is exhausted the PhantomJS instance is shut down and the
 * process is allowed to exit.
 *
 * @param {string} base - Site prefix (e.g. "http://host"). Root-relative hrefs
 *   are prefixed with it and only links under it are queued.
 * @param {number} i - Index into `arr` of the page to process.
 * @param {string[]} arr - Crawl queue; newly discovered pages are appended.
 */
function crawlPage(base, i, arr) {
  // Time given to the page's own scripts to render before the DOM is captured.
  var SETTLE_DELAY_MS = 1500;
  // Upper bound on how long pending work may keep the process alive at the end.
  var EXIT_GRACE_MS = 5000;

  // Prefix root-relative hrefs with `base`; protocol-relative ("//host/...")
  // and every other form are returned untouched.
  function absolutize(href) {
    if (href.charAt(0) === "/" && href.charAt(1) !== "/") {
      return base + href;
    }
    return href;
  }

  // True when `href` lives under `base`. A bare prefix test would also accept
  // "http://host.example.com" for base "http://host.example.co", so the
  // character following the prefix must end the URL or start a path, query or
  // fragment.
  function isInScope(href) {
    if (href.substring(0, base.length) !== base) {
      return false;
    }
    if (base.charAt(base.length - 1) === "/") {
      return true;
    }
    var boundary = href.charAt(base.length);
    return boundary === "" || boundary === "/" || boundary === "?" || boundary === "#";
  }

  // Append `href` to the crawl queue unless it is out of scope or already known.
  function enqueue(href) {
    if (isInScope(href) && arr.indexOf(href) < 0) {
      arr.push(href);
      console.log(href);
    }
  }

  // Make the href of every <tagName> element absolute; `onAbsolute`, when
  // given, receives each resulting http(s) URL.
  function rewriteLinks(doc, tagName, onAbsolute) {
    var nodes = doc.getElementsByTagName(tagName);
    for (var n = 0; n < nodes.length; n++) {
      var raw = nodes[n].getAttribute("href");
      if (raw === null || raw === undefined) {
        continue;
      }
      var href = absolutize(raw);
      if (href.substring(0, 4) !== "http") {
        continue;
      }
      nodes[n].setAttribute("href", href);
      if (onAbsolute) {
        onAbsolute(href);
      }
    }
  }

  function next() {
    crawlPage(base, i + 1, arr);
  }

  console.log("Page " + i + "/" + arr.length);

  if (i >= arr.length) {
    console.log("Exiting...");
    // saveSnapshot writes asynchronously, so exiting right away would drop the
    // last page. Shut PhantomJS down and let the event loop drain instead; the
    // unref'd timer only forces an exit if something keeps the process alive.
    ph.exit();
    setTimeout(function () {
      process.exit();
    }, EXIT_GRACE_MS).unref();
    return;
  }

  var uri = arr[i];
  ph.createPage(function (page) {
    page.open(uri, function (status) {
      if (status !== "success") {
        // A page that fails to load must not end the crawl: report and go on.
        console.log(status);
        page.close();
        next();
        return;
      }
      setTimeout(function () {
        page.evaluate(
          // Handed to the PhantomJS page for evaluation, so it must not
          // reference anything from this closure.
          function () {
            return document.documentElement.innerHTML;
          },
          function (html) {
            // The markup has been captured; release the page.
            page.close();
            jsdom.env(html, function (errors, window) {
              if (!window) {
                console.log(errors);
                next();
                return;
              }
              rewriteLinks(window.document, "link");
              rewriteLinks(window.document, "a", enqueue);
              saveSnapshot(uri, window.document.documentElement.outerHTML);
              next();
            });
          }
        );
      }, SETTLE_DELAY_MS);
    });
  });
}
/**
 * Create `dirPath`, creating any missing ancestor directories first.
 *
 * @param {string} dirPath - Directory to create.
 * @param {number|string|Function} [mode] - Permission mode handed to fs.mkdir.
 *   May be omitted, in which case a function in this position is treated as
 *   the callback.
 * @param {Function} [callback] - Called exactly once with the fs.mkdir error
 *   for `dirPath`, or a falsy value on success. An already existing `dirPath`
 *   is reported as an EEXIST error, exactly as fs.mkdir reports it.
 */
function mkdirParent(dirPath, mode, callback) {
  var modeOmitted = typeof mode === "function";
  var dirMode = modeOmitted ? undefined : mode;
  var done = modeOmitted ? mode : callback;

  function report(error) {
    if (done) {
      done(error);
    }
  }

  fs.mkdir(dirPath, dirMode, function (error) {
    var parentDir = path.dirname(dirPath);
    // Only a missing ancestor (ENOENT) is recoverable. The error code is used
    // rather than errno because errno values differ between Node versions.
    if (!error || error.code !== "ENOENT" || parentDir === dirPath) {
      report(error);
      return;
    }
    // Build the ancestor chain first, then retry this directory once. The
    // retry goes straight to fs.mkdir so a persistent ENOENT cannot loop.
    mkdirParent(parentDir, dirMode, function (parentError) {
      if (parentError && parentError.code !== "EEXIST") {
        report(parentError);
        return;
      }
      fs.mkdir(dirPath, dirMode, report);
    });
  });
}
/**
 * Write the rendered markup of `uri` to a file under `saveDir`, unless a
 * snapshot for that page already exists.
 *
 * The file path comes from the part of the URI after a "#!" hashbang when one
 * is present, otherwise from the URL pathname. Paths ending in "/" map to
 * "index.html" in that directory, and ".html" is appended when the path does
 * not already contain it.
 *
 * @param {string} uri - Address of the page that was rendered.
 * @param {string} body - Markup to store.
 * @param {Function} [callback] - Optional; called once with (error, filename)
 *   when the snapshot has been written, was already present, or failed.
 */
function saveSnapshot(uri, body, callback) {
  var settled = false;
  function finish(error, filename) {
    if (settled) {
      return;
    }
    settled = true;
    if (callback) {
      callback(error, filename);
    }
  }

  var hashbangIdx = uri.lastIndexOf("#!/");
  var relPath = hashbangIdx < 0
    ? url.parse(uri).pathname
    : uri.substring(hashbangIdx + 2);
  if (!relPath) {
    relPath = "/";
  }
  if (relPath.charAt(relPath.length - 1) === "/") {
    relPath += "index.html";
  }
  if (relPath.indexOf(".html") === -1) {
    relPath += ".html";
  }

  var filename = path.join(saveDir, relPath);

  // The path comes from crawled links; refuse anything that resolves outside
  // the snapshot directory (e.g. via ".." segments).
  var insideRoot = path.relative(saveDir, filename);
  if (insideRoot === "" || insideRoot.split(path.sep)[0] === "..") {
    var escapeError = new Error("Refusing to save " + uri + " outside " + saveDir);
    console.error(escapeError.message);
    process.nextTick(function () {
      finish(escapeError);
    });
    return;
  }

  var dirReported = false;
  mkdirParent(path.dirname(filename), undefined, function (mkdirError) {
    // Act on the first report only, in case the helper calls back repeatedly.
    if (dirReported) {
      return;
    }
    dirReported = true;
    if (mkdirError && mkdirError.code !== "EEXIST") {
      console.error("Could not create directory for ", filename, mkdirError);
      finish(mkdirError);
      return;
    }
    // "wx" creates the file only when it does not exist yet, so the existence
    // check and the write cannot race; writeFile also closes the descriptor.
    fs.writeFile(filename, body, { flag: "wx" }, function (writeError) {
      if (writeError && writeError.code === "EEXIST") {
        console.log("Didn't have to write to a file!");
        finish(null, filename);
        return;
      }
      if (writeError) {
        console.error("Failed to save ", uri, " to ", filename, writeError);
        finish(writeError);
        return;
      }
      console.log("Saving ", uri, " to ", filename);
      finish(null, filename);
    });
  });
}
// Entry point: crawl the site starting from its index page. The first
// argument is the prefix used to absolutize and scope links; the array is the
// initial crawl queue.
var seedPages = ["http://www.loganallen.co/index.html"];
createPhantom("http://www.loganallen.co", seedPages);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment