amoilanen/webcrawler.js

## webcrawler.js
//PhantomJS http://phantomjs.org/ based web crawler Anton Ivanov anton.al.ivanov@gmail.com 2012
//UPDATE: This gist has been made into a Node.js module and now can be installed with "npm install js-crawler"
//the Node.js version does not use Phantom.JS, but the API available to the client is similar to the present gist

(function(host) {

    function Crawler() {
        this.visitedURLs = {};
    };

    Crawler.webpage = require('webpage');

    Crawler.prototype.crawl = function (url, depth, onSuccess, onFailure) {
        if (0 == depth || this.visitedURLs[url]) {
            return;
        };
        var self = this;
        var page = Crawler.webpage.create();

        page.open(url, function (status) {
            if ('fail' === status) {
                onFailure({
                    url: url,
                    status: status
                });
            } else {
                var documentHTML = page.evaluate(function () {
                    return document.body && document.body.innerHTML ? document.body.innerHTML : "";
                });
                self.crawlURLs(self.getAllURLs(page), depth - 1, onSuccess, onFailure);
                self.visitedURLs[url] = true;
                onSuccess({
                    url: url,
                    status: status,
                    content: documentHTML
                });
            };
        });
    };

    Crawler.prototype.getAllURLs = function(page) {
        return page.evaluate(function () {
            return Array.prototype.slice.call(document.querySelectorAll("a"), 0)
                .map(function (link) {
                    return link.getAttribute("href");
                });
        });
    };

    Crawler.prototype.crawlURLs = function(urls, depth, onSuccess, onFailure) {
        var self = this;
        urls.filter(function (url) {
            return Crawler.isTopLevelURL(url);
        }).forEach(function (url) {
            self.crawl(url, depth, onSuccess, onFailure);
        });
    };

    Crawler.isTopLevelURL = function(url) {
        return 0 == url.indexOf("http");
    };

    host.Crawler = Crawler;
})(phantom);

new phantom.Crawler().crawl("https://github.com/ariya/phantomjs/wiki/Quick-Start", 2,
    function onSuccess(page) {
        console.log("Loaded page. URL = " + page.url + " content length = " + page.content.length + " status = " + page.status);
    },
    function onFailure(page) {
        console.log("Could not load page. URL = " +  page.url + " status = " + page.status);
    }
);
	//PhantomJS http://phantomjs.org/ based web crawler Anton Ivanov anton.al.ivanov@gmail.com 2012
	//UPDATE: This gist has been made into a Node.js module and now can be installed with "npm install js-crawler"
	//the Node.js version does not use Phantom.JS, but the API available to the client is similar to the present gist

	(function(host) {

	function Crawler() {
	this.visitedURLs = {};
	};

	Crawler.webpage = require('webpage');

	Crawler.prototype.crawl = function (url, depth, onSuccess, onFailure) {
	if (0 == depth \|\| this.visitedURLs[url]) {
	return;
	};
	var self = this;
	var page = Crawler.webpage.create();

	page.open(url, function (status) {
	if ('fail' === status) {
	onFailure({
	url: url,
	status: status
	});
	} else {
	var documentHTML = page.evaluate(function () {
	return document.body && document.body.innerHTML ? document.body.innerHTML : "";
	});
	self.crawlURLs(self.getAllURLs(page), depth - 1, onSuccess, onFailure);
	self.visitedURLs[url] = true;
	onSuccess({
	url: url,
	status: status,
	content: documentHTML
	});
	};
	});
	};

	Crawler.prototype.getAllURLs = function(page) {
	return page.evaluate(function () {
	return Array.prototype.slice.call(document.querySelectorAll("a"), 0)
	.map(function (link) {
	return link.getAttribute("href");
	});
	});
	};

	Crawler.prototype.crawlURLs = function(urls, depth, onSuccess, onFailure) {
	var self = this;
	urls.filter(function (url) {
	return Crawler.isTopLevelURL(url);
	}).forEach(function (url) {
	self.crawl(url, depth, onSuccess, onFailure);
	});
	};

	Crawler.isTopLevelURL = function(url) {
	return 0 == url.indexOf("http");
	};

	host.Crawler = Crawler;
	})(phantom);

	new phantom.Crawler().crawl("https://github.com/ariya/phantomjs/wiki/Quick-Start", 2,
	function onSuccess(page) {
	console.log("Loaded page. URL = " + page.url + " content length = " + page.content.length + " status = " + page.status);
	},
	function onFailure(page) {
	console.log("Could not load page. URL = " + page.url + " status = " + page.status);
	}
	);