Skip to content

Instantly share code, notes, and snippets.

@mradcliffe
Forked from amoilanen/webcrawler.js
Last active August 29, 2015 13:56
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mradcliffe/8832090 to your computer and use it in GitHub Desktop.
Save mradcliffe/8832090 to your computer and use it in GitHub Desktop.
//PhantomJS http://phantomjs.org/ based web crawler Anton Ivanov anton.al.ivanov@gmail.com 2012
// PhantomJS built-in modules (these are NOT the Node.js modules of the same name).
var fs = require('fs');
var system = require('system');

// Command line: phantomjs <script> <base-url> [snapshot]
// Any non-empty second argument is treated as "render PNG snapshots too".
var arg_url = system.args[1] || '';
var snapshot = system.args[2] || '';

// Start from a clean slate: drop the sitemap left behind by a previous run.
// PhantomJS's fs API is synchronous: exists() returns a boolean and files are
// deleted with remove(). The Node-style callback form / unlink() is not part
// of that API, so the stale file was never removed before.
if (fs.exists('sitemap.xml')) {
    fs.remove('sitemap.xml');
}

// Sitemap state shared with the crawler: one <url> XML fragment per saved
// page, plus the list of page URLs already present in the sitemap.
var url_info = [];
var all_urls = [];

// Fixed XML wrapper written around the collected <url> entries.
var sitemap_header = '<?xml version="1.0" encoding="UTF-8"?>' + "\n";
sitemap_header += '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">' + "\n";
var sitemap_footer = "\n</urlset>";
(function (host) {
    /**
     * Crawler that opens pages with PhantomJS, waits for the client-side app
     * to mark itself ready, stores the rendered HTML under content/static/,
     * optionally renders a PNG snapshot, maintains sitemap.xml and then
     * follows the navigation links found on each page.
     *
     * Relies on these file-level variables: arg_url, url_info, all_urls,
     * sitemap_header and sitemap_footer.
     *
     * NOTE(review): ES5 syntax is kept on purpose; PhantomJS's script engine
     * presumably does not support newer syntax - confirm before modernising.
     */
    function Crawler() {
        // URL -> true for every URL that has been handed to crawl().
        this.visitedURLs = {};
    }

    Crawler.webpage = require('webpage');
    Crawler.fs = require('fs');
    Crawler.baseUrl = arg_url;
    // How often (ms) a page is polled while waiting for it to become ready.
    Crawler.pollIntervalMs = 100;
    // Upper bound (ms) for any single wait, so a page that never becomes
    // ready cannot keep a timer (and the page) alive forever.
    Crawler.maxWaitMs = 10000;

    // Matches in-app hash routes such as "#/about" or "#!/about-us".
    // The bare root routes "#/" and "#!/" intentionally do not match.
    var HASH_ROUTE = /^#!?\/[a-zA-Z0-9\-]+/;

    // Escapes the five XML special characters so a URL is safe inside <loc>.
    function escapeXml(text) {
        return String(text)
            .replace(/&/g, '&amp;')
            .replace(/</g, '&lt;')
            .replace(/>/g, '&gt;')
            .replace(/"/g, '&quot;')
            .replace(/'/g, '&apos;');
    }

    // Maps a page URL below baseUrl to the relative file name it is saved as.
    function resourcePathFor(pageUrl) {
        var resource = pageUrl.substring(Crawler.baseUrl.length).replace(/^#[!]?/, '');
        if (resource.length === 0 || resource === '/') {
            return '/index.html';
        }
        // Only add the extension when it is missing ("a.html" stays "a.html").
        return /\.html$/.test(resource) ? resource : resource + '.html';
    }

    // Turns a raw href into the absolute URL to crawl, or null to skip it.
    function resolveLink(href) {
        if (typeof href !== 'string' || href.length === 0) {
            return null; // anchors without an href attribute yield null
        }
        if (HASH_ROUTE.test(href)) {
            var absolute = Crawler.baseUrl + href;
            return Crawler.isTopLevelURL(absolute) ? absolute : null;
        }
        // Absolute links are followed as-is, but only inside the crawled site.
        if (Crawler.isTopLevelURL(href) && href.indexOf(Crawler.baseUrl) === 0) {
            return href;
        }
        return null;
    }

    // Calls probe() every pollIntervalMs until it returns a truthy value
    // (-> onReady(value)) or maxWaitMs has elapsed (-> onTimeout()).
    function pollUntil(probe, onReady, onTimeout) {
        var startedAt = Date.now();
        var timer = setInterval(function () {
            var result = probe();
            if (result) {
                clearInterval(timer);
                onReady(result);
            } else if (Date.now() - startedAt >= Crawler.maxWaitMs) {
                clearInterval(timer);
                onTimeout();
            }
        }, Crawler.pollIntervalMs);
    }

    // Invokes an optional callback with the given payload.
    function notify(callback, payload) {
        if (typeof callback === 'function') {
            callback(payload);
        }
    }

    // Writes the rendered page (and optional snapshot) and updates sitemap.xml.
    // Pages whose final URL is outside baseUrl are not stored.
    function savePage(page, takeSnapshot) {
        var pageUrl = page.url;
        if (pageUrl.substring(0, Crawler.baseUrl.length) !== Crawler.baseUrl) {
            return;
        }
        var resource = resourcePathFor(pageUrl);
        Crawler.fs.write('content/static/' + resource, page.content, 'w');
        if (takeSnapshot) {
            page.render('tmp/' + resource + '.png');
        }
        if (all_urls.indexOf(pageUrl) === -1) {
            url_info.push("\t<url><loc>" + escapeXml(pageUrl) + "</loc><changefreq>monthly</changefreq></url>");
            all_urls.push(pageUrl);
        }
        // The sitemap is rewritten after every page so that an interrupted
        // crawl still leaves a complete, well-formed file behind.
        Crawler.fs.write("sitemap.xml", sitemap_header + url_info.join("\n") + sitemap_footer, 'w');
    }

    /**
     * Opens `url`, waits until `.main-wrapper` carries data-status="ready",
     * saves the page and then crawls the links found on it.
     *
     * @param {string} url - Absolute URL to open.
     * @param {number} depth - Remaining link depth; nothing happens when <= 0.
     * @param {*} takeSnapshot - Truthy to also render a PNG under tmp/.
     * @param {function} onSuccess - Called with {url, status, content}, where
     *     content is the HTML string of document.body.
     * @param {function} onFailure - Called with {url, status, reason,
     *     reason_url}; status is 'fail' when the page could not be opened and
     *     'timeout' when it never reported ready within Crawler.maxWaitMs.
     */
    Crawler.prototype.crawl = function (url, depth, takeSnapshot, onSuccess, onFailure) {
        if (depth <= 0 || this.visitedURLs[url]) {
            return;
        }
        // Mark before loading: sibling pages are opened concurrently and
        // would otherwise all start crawling the same shared link.
        this.visitedURLs[url] = true;

        var self = this;
        var page = Crawler.webpage.create();
        page.viewportSize = {
            width: 1280,
            height: 1024
        };
        // Remember the most recent resource error so failures can report it.
        page.onResourceError = function (resourceError) {
            page.reason = resourceError.errorString;
            page.reason_url = resourceError.url;
        };

        function fail(status) {
            var failure = {
                url: url,
                status: status,
                reason: page.reason,
                reason_url: page.reason_url
            };
            page.close(); // release the page's memory; it is not used again
            notify(onFailure, failure);
        }

        page.open(url, function (status) {
            if ('fail' === status) {
                fail(status);
                return;
            }
            console.log('Waiting for page load... ' + url);
            pollUntil(function () {
                return page.evaluate(function () {
                    var wrapper = document.querySelector('.main-wrapper');
                    if (wrapper && wrapper.getAttribute('data-status') === 'ready') {
                        // Return the HTML string, not the DOM node: values
                        // returned from evaluate() must be JSON-serializable.
                        return document.body ? document.body.innerHTML : '';
                    }
                    return null;
                });
            }, function (documentHTML) {
                savePage(page, takeSnapshot);
                if (depth - 1 > 0) {
                    self.getAllURLs(page, depth - 1, onSuccess, onFailure, takeSnapshot, function () {
                        page.close();
                    });
                } else {
                    // Links would be discarded at depth 0, so skip collecting them.
                    page.close();
                }
                notify(onSuccess, {
                    url: url,
                    status: status,
                    content: documentHTML
                });
            }, function () {
                fail('timeout');
            });
        });
    };

    /**
     * Waits for navigation links (a.ng-binding, a.nav-link) to appear on an
     * already loaded page and crawls them. If none show up within
     * Crawler.maxWaitMs the page is treated as having no links.
     *
     * @param {object} page - Loaded PhantomJS page.
     * @param {number} depth - Depth passed on to the linked pages.
     * @param {function} onSuccess - Forwarded to crawl().
     * @param {function} onFailure - Forwarded to crawl().
     * @param {*} [takeSnapshot] - Forwarded to crawl().
     * @param {function} [onLinksCollected] - Called once `page` is no longer
     *     needed by this method (before the links are crawled).
     */
    Crawler.prototype.getAllURLs = function (page, depth, onSuccess, onFailure, takeSnapshot, onLinksCollected) {
        var self = this;

        function finish(links) {
            notify(onLinksCollected);
            console.log('Found ' + links.length + ' links');
            self.crawlURLs(links, depth, onSuccess, onFailure, takeSnapshot);
        }

        console.log('Waiting for navigation links to load...');
        pollUntil(function () {
            return page.evaluate(function () {
                // Try to find some links in a couple of different menus on the site.
                var anchors = document.querySelectorAll('a.ng-binding, a.nav-link');
                if (anchors.length === 0) {
                    return null; // menus not rendered yet, keep waiting
                }
                return [].map.call(anchors, function (link) {
                    return link.getAttribute('href');
                });
            });
        }, finish, function () {
            finish([]);
        });
    };

    /**
     * Crawls every followable link in `urls`: hash routes are resolved
     * against Crawler.baseUrl, absolute links are followed only when they
     * stay below Crawler.baseUrl, everything else is skipped. Duplicates
     * within the list are crawled once.
     *
     * @param {Array} urls - Raw href values (entries may be null).
     * @param {number} depth - Depth passed to crawl().
     * @param {function} onSuccess - Forwarded to crawl().
     * @param {function} onFailure - Forwarded to crawl().
     * @param {*} [takeSnapshot] - Forwarded to crawl().
     */
    Crawler.prototype.crawlURLs = function (urls, depth, onSuccess, onFailure, takeSnapshot) {
        var queued = {};
        var hrefs = urls || [];
        for (var i = 0; i < hrefs.length; i += 1) {
            var target = resolveLink(hrefs[i]);
            if (target === null || queued[target] === true) {
                continue;
            }
            queued[target] = true;
            console.log('Found: ' + target);
            this.crawl(target, depth, takeSnapshot, onSuccess, onFailure);
        }
    };

    /**
     * @param {string} url
     * @returns {boolean} true when `url` starts with "http".
     */
    Crawler.isTopLevelURL = function (url) {
        return 0 === url.indexOf("http");
    };

    host.Crawler = Crawler;
})(phantom);
// Entry point: crawl the site given on the command line, four links deep.
if (!arg_url) {
    // Without a base URL there is nothing to open; say so and stop instead
    // of leaving the PhantomJS process running with nothing to do.
    console.log("Usage: phantomjs <script> <base-url> [snapshot]");
    phantom.exit(1);
} else {
    new phantom.Crawler().crawl(arg_url, 4, snapshot,
        // Called for every page that was loaded and stored.
        function onSuccess(page) {
            console.log("Loaded page. URL = " + page.url + " content length = " + page.content.length + " status = " + page.status);
        },
        // Called for every page that could not be loaded.
        function onFailure(page) {
            console.log("Could not load page. URL = " + page.url + " status = " + page.status);
        }
    );
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment