Emencia Site Analyzer, a PhantomJS script to analyze webpages
/*
 * Emencia Site Analyzer, a PhantomJS script
 * (built with PhantomJS 1.9.7)
 *
 *
 * Crawls pages from a flat sitemap given as a JSON file; each page's network
 * traffic is analyzed for its resources, and any raised Javascript errors
 * are reported.
 *
 *
 * - Takes one required argument: the filepath to a JSON file (called the
 *   'JSON sitemap') containing the urls to analyze;
 * - Takes a second, optional argument: the filepath where the collected data
 *   will be written;
 *
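 * For instance, assuming the script was saved as 'analyzer.js' (the
 * filenames here are only illustrative):
 *
 *     phantomjs analyzer.js my_sitemap.json my_report.json
 *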
 * The JSON sitemap should be something like this:
 *
 * {
 *     "name": "Sample",
 *     "urls": [
 *         "http://192.168.0.103:8001/",
 *         "http://192.168.0.103:8001/contact/"
 *     ]
 * }
 *
 *
 * WARNING: PhantomJS, like a normal browser, shares the cache between all
 * page instances, and we cannot access the cached resources it uses. This
 * leads to a problem: after the first page, pages on the same domain will
 * not load the previously cached resources again, so the load analysis is
 * not really complete. Either we use this script for a single page only, or
 * we can try to rebuild a custom PhantomJS with something like this patch
 * https://github.com/ariya/phantomjs/pull/11511 and
 * https://github.com/ariya/phantomjs/commit/5768b705a0
 *
 * For now, we should wait for the eventual PhantomJS 2.0 release to have a
 * more stable tool.
 *
 * Ideas:
 *
 * - Manage chunked downloads;
 * - Scan the page for iframes and store them if any are found;
 * - A better CLI interface that validates arguments, files, etc., to avoid
 *   badly written files on human error;
 */
var AnalyzerMainProcess, AnalyzerPageFinished, AnalyzerProcessFinished,
    database, pages_urls, dump_filepath, sitemap_file, sitemap,
    program_name = "Emencia Site Analyzer",
    program_version = "0.1",
    fs = require('fs'),
    system = require("system"),
    phantom_version = phantom.version.major + '.' + phantom.version.minor + '.' + phantom.version.patch;
/*
 * Fake database to store the collected data
 */
database = {
    pages: {},
    /*
     * Register a new page
     */
    add_page: function(id, title, url, duration) {
        this.pages[id] = {
            'title': title,
            'url': url,
            'duration': duration,
            'ressources': {},
            'errors': []
        };
        return this.pages[id];
    },
    update_page_title: function(id, title) {
        this.pages[id].title = title;
        return this.pages[id];
    },
    /*
     * Register a new page error
     */
    add_error: function(page_id, content) {
        this.pages[page_id].errors.push(content);
        return this.pages[page_id];
    },
    /*
     * Register a new page resource
     */
    add_ressource: function(id, page_id, url, start, end, status, content_type, length) {
        this.pages[page_id].ressources[id] = {
            'url': url,
            'start': start,
            'end': end,
            'status': status,
            'content_type': content_type,
            'length': (length) ? length : 0
        };
        return this.pages[page_id].ressources[id];
    },
    /*
     * Edit a page resource
     */
    edit_ressource: function(id, page_id, url, start, end, status, content_type, length) {
        if (url) this.pages[page_id].ressources[id].url = url;
        if (start) this.pages[page_id].ressources[id].start = start;
        if (end) this.pages[page_id].ressources[id].end = end;
        if (status) this.pages[page_id].ressources[id].status = status;
        if (content_type) this.pages[page_id].ressources[id].content_type = content_type;
        if (length) this.pages[page_id].ressources[id].length = length;
        return this.pages[page_id].ressources[id];
    }
};
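/*
 * For reference, an entry of the resulting dump should look roughly like
 * this (the values below are only illustrative, not taken from a real run;
 * note that 'duration' is currently never filled, since add_page is called
 * without it):
 *
 * {
 *     "1": {
 *         "title": "Sample",
 *         "url": "http://192.168.0.103:8001/",
 *         "ressources": {
 *             "1": {
 *                 "url": "http://192.168.0.103:8001/",
 *                 "start": "2014-11-22T12:00:00.000Z",
 *                 "end": "2014-11-22T12:00:00.150Z",
 *                 "status": 200,
 *                 "content_type": "text/html; charset=utf-8",
 *                 "length": 5120
 *             }
 *         },
 *         "errors": []
 *     }
 * }
 */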
/*
 * Main process function
 *
 * Runs an analysis for each given url
 *
 * @param urls Array of URLs to open and analyze
 * @param callbackPerUrl Function called after finishing each URL, including the last URL
 * @param callbackFinal Function called after finishing everything
 */
AnalyzerMainProcess = function(urls, callbackPerUrl, callbackFinal) {
    var next, retrieve,
        page_id = 0,
        webpage = require("webpage"),
        page_instance = null;
    // Close the page instance then call the callback for the finished page
    // analysis (where some stats should be written from this analysis)
    next = function(status, url) {
        page_instance.close();
        callbackPerUrl(status, url);
        return retrieve();
    };
    retrieve = function() {
        var url;
        if (urls.length > 0) {
            url = urls.shift();
            page_id++;
            // Page instance init and customization
            page_instance = webpage.create();
            page_instance.viewportSize = {
                width: 1920,
                height: 1200
            };
            page_instance.settings.userAgent = program_name + " (v" + program_version + ") (" + phantom_version + ")";
            // Register the new page
            database.add_page(page_id, 'Unknown', url);
            // Network traffic analysis
            // Start of resource loading
            page_instance.onResourceRequested = function(request) {
                database.add_ressource(request.id, page_id, request.url, request.time);
            };
            // End of resource loading
            page_instance.onResourceReceived = function(response) {
                database.edit_ressource(response.id, page_id, null, null, response.time, response.status, response.contentType, response.bodySize);
            };
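            // Note: PhantomJS fires onResourceReceived twice per resource
            // (response.stage is "start" then "end"), so each entry simply
            // gets updated again with the final values on the second call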
            // Manage Javascript errors
            // TODO: distinguish msg from traces
            page_instance.onError = function(msg, trace) {
                database.add_error(page_id, msg);
                trace.forEach(function(item) {
                    database.add_error(page_id, item.file + ':' + item.line);
                });
            };
            // Open the url and do the stuff
            console.log("* Reading: " + url);
            return page_instance.open(url, function(status) {
                if (status === "success") {
                    return window.setTimeout((function() {
                        // Update the page title now that the document is loaded
                        database.update_page_title(page_id, page_instance.evaluate(function() { return document.title; }));
                        return next(status, url);
                    }), 200);
                } else {
                    // Call the function to cleanup and proceed to the next url
                    return next(status, url);
                }
            });
        } else {
            // All urls have been processed, call the final function
            return callbackFinal();
        }
    };
    return retrieve();
};
/*
 * Function called when a page's analysis has finished
 */
AnalyzerPageFinished = function(status, url) {
    if (status !== "success") {
        return console.log("└── Unable to read");
    } else {
        return console.log("└── Success");
    }
};
/*
 * Function called at the end of all analyses
 */
AnalyzerProcessFinished = function() {
    // Get the JSON dump of the analysis
    var content = JSON.stringify(database.pages, undefined, 4);
    // Write the JSON dump into a file
    if (dump_filepath) {
        fs.write(dump_filepath, content, 'w');
    // .. or just output it if no dump filepath was given
    } else {
        console.log(content);
    }
    return phantom.exit();
};
/*
 * Launch the main process if the required arguments are satisfied
 */
if (system.args.length < 2) {
    console.log("Usage: phantomjs " + system.args[0] + " your_sitemap.json [dump_filepath.json]");
    phantom.exit(1);
} else {
    // Sitemap filepath from the first commandline argument
    sitemap_file = system.args[1];
    // Optional JSON dump written to the filepath given as second argument
    if (system.args.length > 2) {
        dump_filepath = system.args[2];
    }
    // Open the JSON sitemap
    sitemap = JSON.parse(fs.read(sitemap_file));
    // Launch the main process to start the batch
    AnalyzerMainProcess(sitemap.urls, AnalyzerPageFinished, AnalyzerProcessFinished);
}
sveetch commented Nov 22, 2014

We need an XML parser to use sitemap.xml in addition to the JSON sitemap.
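
Until that is wired in, here is a rough, untested sketch of what the URL extraction could look like, using a naive regex instead of a real XML parser (the function name is just an idea) and assuming a standard sitemap.xml with <loc> entries:

function parse_xml_sitemap(filepath) {
    // Read the XML with the same fs module the script already uses, then
    // naively pick every <loc> element; a real XML parser would handle
    // namespaces, CDATA and entities properly, this only covers the simple case
    var content = fs.read(filepath),
        matches = content.match(/<loc>([^<]+)<\/loc>/g) || [];
    return matches.map(function(item) {
        return item.replace(/<\/?loc>/g, '');
    });
}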
