Emencia Site Analyzer, a PhantomJS script to analyze webpages
/*
 * Emencia Site Analyzer, a PhantomJS script
 * (built with PhantomJS 1.9.7)
 *
 *
 * Crawls pages from a flat sitemap given as a JSON file; each page's network
 * traffic is analyzed for its resources, and any raised Javascript errors
 * are reported.
 *
 *
 * - Takes one required argument: the filepath to a JSON file (called the
 *   'JSON sitemap') containing the urls to analyze;
 * - Takes a second, optional argument: the filepath where the collected data
 *   will be written;
 *
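 * For instance, assuming the script was saved as 'analyzer.js' (the
 * filenames here are only illustrative):
 *
 *     phantomjs analyzer.js my_sitemap.json my_report.json
 *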
 * The JSON sitemap should be something like this:
 *
 * {
 *     "name": "Sample",
 *     "urls": [
 *         "http://192.168.0.103:8001/",
 *         "http://192.168.0.103:8001/contact/"
 *     ]
 * }
 *
 *
 * WARNING: PhantomJS, like a normal browser, shares the cache between all
 * page instances, and we cannot access the cached resources it uses. This
 * leads to a problem: after the first page, pages on the same domain will
 * not load the previously cached resources again, so the load analysis is
 * not really complete. Either we use this script for a single page only, or
 * we can try to rebuild a custom PhantomJS with something like this patch
 * https://github.com/ariya/phantomjs/pull/11511 and
 * https://github.com/ariya/phantomjs/commit/5768b705a0
 *
 * For now, we should wait for the eventual PhantomJS 2.0 release to have a
 * more stable tool.
 *
 * Ideas:
 *
 * - Manage chunked downloads;
 * - Scan the page for iframes and store them if any are found;
 * - A better CLI interface that validates arguments, files, etc., to avoid
 *   badly written files on human error;
 */
var AnalyzerMainProcess, AnalyzerPageFinished, AnalyzerProcessFinished,
    database, pages_urls, dump_filepath, sitemap_file, sitemap,
    program_name = "Emencia Site Analyzer",
    program_version = "0.1",
    fs = require('fs'),
    system = require("system"),
    phantom_version = phantom.version.major + '.' + phantom.version.minor + '.' + phantom.version.patch;
/*
 * Fake database to store the collected data
 */
database = {
    pages: {},
    /*
     * Register a new page
     */
    add_page: function(id, title, url, duration) {
        this.pages[id] = {
            'title': title,
            'url': url,
            'duration': duration,
            'ressources': {},
            'errors': []
        };
        return this.pages[id];
    },
    update_page_title: function(id, title) {
        this.pages[id].title = title;
        return this.pages[id];
    },
    /*
     * Register a new page error
     */
    add_error: function(page_id, content) {
        this.pages[page_id].errors.push(content);
        return this.pages[page_id];
    },
    /*
     * Register a new page resource
     */
    add_ressource: function(id, page_id, url, start, end, status, content_type, length) {
        this.pages[page_id].ressources[id] = {
            'url': url,
            'start': start,
            'end': end,
            'status': status,
            'content_type': content_type,
            'length': (length) ? length : 0
        };
        return this.pages[page_id].ressources[id];
    },
    /*
     * Edit a page resource
     */
    edit_ressource: function(id, page_id, url, start, end, status, content_type, length) {
        if (url) this.pages[page_id].ressources[id].url = url;
        if (start) this.pages[page_id].ressources[id].start = start;
        if (end) this.pages[page_id].ressources[id].end = end;
        if (status) this.pages[page_id].ressources[id].status = status;
        if (content_type) this.pages[page_id].ressources[id].content_type = content_type;
        if (length) this.pages[page_id].ressources[id].length = length;
        return this.pages[page_id].ressources[id];
    }
};
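/*
 * For reference, an entry of the resulting dump should look roughly like
 * this (the values below are only illustrative, not taken from a real run;
 * note that 'duration' is currently never filled, since add_page is called
 * without it):
 *
 * {
 *     "1": {
 *         "title": "Sample",
 *         "url": "http://192.168.0.103:8001/",
 *         "ressources": {
 *             "1": {
 *                 "url": "http://192.168.0.103:8001/",
 *                 "start": "2014-11-22T12:00:00.000Z",
 *                 "end": "2014-11-22T12:00:00.150Z",
 *                 "status": 200,
 *                 "content_type": "text/html; charset=utf-8",
 *                 "length": 5120
 *             }
 *         },
 *         "errors": []
 *     }
 * }
 */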
/*
 * Main process function
 *
 * Runs an analysis for each given url
 *
 * @param urls Array of URLs to open and analyze
 * @param callbackPerUrl Function called after finishing each URL, including the last URL
 * @param callbackFinal Function called after finishing everything
 */
AnalyzerMainProcess = function(urls, callbackPerUrl, callbackFinal) {
    var next, retrieve,
        page_id = 0,
        webpage = require("webpage"),
        page_instance = null;
    // Close the page instance then call the callback for the finished page
    // analysis (where some stats should be written from this analysis)
    next = function(status, url) {
        page_instance.close();
        callbackPerUrl(status, url);
        return retrieve();
    };
    retrieve = function() {
        var url;
        if (urls.length > 0) {
            url = urls.shift();
            page_id++;
            // Page instance init and customization
            page_instance = webpage.create();
            page_instance.viewportSize = {
                width: 1920,
                height: 1200
            };
            page_instance.settings.userAgent = program_name + " (v" + program_version + ") (" + phantom_version + ")";
            // Register the new page
            database.add_page(page_id, 'Unknown', url);
            // Network traffic analysis
            // Start of resource loading
            page_instance.onResourceRequested = function(request) {
                database.add_ressource(request.id, page_id, request.url, request.time);
            };
            // End of resource loading
            page_instance.onResourceReceived = function(response) {
                database.edit_ressource(response.id, page_id, null, null, response.time, response.status, response.contentType, response.bodySize);
            };
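            // Note: PhantomJS fires onResourceReceived twice per resource
            // (response.stage is "start" then "end"), so each entry simply
            // gets updated again with the final values on the second call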
            // Manage Javascript errors
            // TODO: distinguish msg from traces
            page_instance.onError = function(msg, trace) {
                database.add_error(page_id, msg);
                trace.forEach(function(item) {
                    database.add_error(page_id, item.file + ':' + item.line);
                });
            };
            // Open the url and do the stuff
            console.log("* Reading: " + url);
            return page_instance.open(url, function(status) {
                if (status === "success") {
                    return window.setTimeout((function() {
                        // Update the page title now that the document is loaded
                        database.update_page_title(page_id, page_instance.evaluate(function() { return document.title; }));
                        return next(status, url);
                    }), 200);
                } else {
                    // Call the function to cleanup and proceed to the next url
                    return next(status, url);
                }
            });
        } else {
            // All urls have been processed, call the final function
            return callbackFinal();
        }
    };
    return retrieve();
};
/*
 * Function called when a page's analysis has finished
 */
AnalyzerPageFinished = function(status, url) {
    if (status !== "success") {
        return console.log("└── Unable to read");
    } else {
        return console.log("└── Success");
    }
};
/*
 * Function called at the end of all analyses
 */
AnalyzerProcessFinished = function() {
    // Get the JSON dump of the analysis
    var content = JSON.stringify(database.pages, undefined, 4);
    // Write the JSON dump into a file
    if (dump_filepath) {
        fs.write(dump_filepath, content, 'w');
    // .. or just output it if no dump filepath was given
    } else {
        console.log(content);
    }
    return phantom.exit();
};
/*
 * Launch the main process if the required arguments are satisfied
 */
if (system.args.length < 2) {
    console.log("Usage: phantomjs " + system.args[0] + " your_sitemap.json [dump_filepath.json]");
    phantom.exit(1);
} else {
    // Sitemap filepath from the first commandline argument
    sitemap_file = system.args[1];
    // Optional JSON dump written to the filepath given as second argument
    if (system.args.length > 2) {
        dump_filepath = system.args[2];
    }
    // Open the JSON sitemap
    sitemap = JSON.parse(fs.read(sitemap_file));
    // Launch the main process to start the batch
    AnalyzerMainProcess(sitemap.urls, AnalyzerPageFinished, AnalyzerProcessFinished);
}
sveetch commented Nov 22, 2014

We need an XML parser to use sitemap.xml in addition to the JSON sitemap.
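
Until that is wired in, here is a rough, untested sketch of what the URL extraction could look like, using a naive regex instead of a real XML parser (the function name is just an idea) and assuming a standard sitemap.xml with <loc> entries:

function parse_xml_sitemap(filepath) {
    // Read the XML with the same fs module the script already uses, then
    // naively pick every <loc> element; a real XML parser would handle
    // namespaces, CDATA and entities properly, this only covers the simple case
    var content = fs.read(filepath),
        matches = content.match(/<loc>([^<]+)<\/loc>/g) || [];
    return matches.map(function(item) {
        return item.replace(/<\/?loc>/g, '');
    });
}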
