Skip to content

Instantly share code, notes, and snippets.

@twoixter
Created September 22, 2016 19:20
Show Gist options
  • Save twoixter/56f1d67f89e22d1b5b03888d5927c655 to your computer and use it in GitHub Desktop.
Save twoixter/56f1d67f89e22d1b5b03888d5927c655 to your computer and use it in GitHub Desktop.
Proof of concept of a filter node.js script to convert PhantomJSCloud JSON events to HAR
#!/usr/bin/env node
/**
* Script to convert PhantomJSCloud JSON output to HAR.
* See: https://phantomjscloud.com
*
* Usage: pipe in the JSON from PhantomJSCloud and pipe out to a file.
* Be sure to use the following options when using PhantomJSCloud:
*
* + outputAsJson = true To force JSON output
* + suppressJson = [] To not filter out any JSON data
*
* Example:
*
* curl -X POST https://phantomjscloud.com/api/browser/v2/a-demo-key-with-low-quota-per-ip-address/ \
* -d '{url:"http://www.etsy.com/",renderType:"text",outputAsJson:true,suppressJson:[]}' \
* | node pjsc2har.js > etsy.har
*
* TODO:
* **1** Means to know the HTTP version used.
* **2** Means to know cookies of each request.
* **3** Possible duplicated "resourceReceived" events without data. (E.G: Etsy page)
*
*/
var fs = require("fs");
var path = require("path");
var url = require("url");
var qs = require("querystring");
/**
 * Slurp the whole of stdin as UTF-8 text, parse it as JSON and pass the
 * resulting value to the callback.
 *
 * @param {function(*)} cb - Called with the parsed value once stdin ends.
 *     A JSON syntax error is not caught here; it propagates out of the
 *     "end" listener.
 */
function fetchJSON(cb)
{
    var stdin = process.stdin;
    var pieces = [];

    stdin.setEncoding("utf8");

    // Accumulate every chunk; the document is only parseable once complete.
    stdin.on("data", function(chunk) {
        pieces.push(String(chunk));
    });

    stdin.on("end", function() {
        var parsed = JSON.parse(pieces.join(""));
        cb(parsed);
    });
}
/**
 * Debug helper. Writes to stderr instead of stdout so that the HAR output
 * can still be saved by piping/redirecting the node stdout.
 *
 * @param {string} message - Text printed after the green "[DEBUG]" tag.
 */
function _debug(message)
{
    // "\x1b" is the ESC character. The hexadecimal escape is used instead of
    // the legacy octal form "\033", which is a SyntaxError in strict mode
    // and in ES modules.
    process.stderr.write("\x1b[32m[DEBUG]\x1b[0m " + message + "\n");
}
/**
 * Error helper. Writes to stderr instead of stdout so that the HAR output
 * piped from stdout stays clean.
 *
 * @param {string} message - Text printed after the bright red "[ERROR]" tag.
 */
function _error(message)
{
    // "\x1b" is the ESC character; the legacy octal form "\033" is rejected
    // in strict mode and in ES modules. The tag reads "[ERROR]" so these
    // lines can be told apart from _debug() output without relying on color.
    process.stderr.write("\x1b[91m[ERROR]\x1b[0m " + message + "\n");
}
/**
 * Convert a parsed parameter map (E.G: the result of
 * `url.parse(..., true).query` or `querystring.parse(...)`) into a HAR
 * compatible array of `{name, value}` records.
 *
 * A key whose value is an array (a parameter repeated in the source string)
 * is expanded into one record per value, since a HAR record carries a single
 * value.
 *
 * @param {Object|null|undefined} params - Map of parameter name to value(s).
 * @returns {Array<{name: string, value: *}>} One record per name/value pair,
 *     in key order. Empty when `params` is null or undefined.
 */
function params_to_array(params)
{
    var _params = [];
    if (params == null) return _params;

    // Object.keys() with a local callback argument replaces the original
    // "for (param in params)", which assigned to an undeclared global.
    Object.keys(params).forEach(function(name) {
        var raw = params[name];
        var values = Array.isArray(raw) ? raw : [raw];
        values.forEach(function(value) {
            _params.push({name: name, value: value});
        });
    });
    return _params;
}
/**
 * Conversion state shared between the event handlers and processHAR().
 * Globals are not nice, but this is only a proof of concept. :-)
 */
// The HAR document being built; replaced wholesale by processHAR().
var _theHAR = {};
// The HAR "page" record; recreated on every main navigation event.
var _currentPage = {};
// In-flight and finished HAR entries, keyed by resource ID.
var _resources = {};
// Cookies reported for the page, reused for every request entry.
var _cookies = {};
// True while resource events should be attributed to the current page.
var _activeNavigation = false;
/**
 * Build the HAR "response" record for a PhantomJSCloud "resourceResponse"
 * payload. Used for both the "start" and the "end" stage so that the two
 * always produce the same shape and the same fallbacks.
 *
 * @param {Object} res - The "resourceResponse" object of the event.
 * @returns {Object} HAR response record. Missing sizes default to 0 and a
 *     missing content type / redirect URL defaults to "".
 */
function _buildHarResponse(res)
{
    var bodySize = res.bodySize || 0;
    return {
        status: res.status,
        statusText: res.statusText,
        httpVersion: "HTTP/1.1", // No means to know HTTP version. See **1**
        cookies: [],
        headers: res.headers,
        headersSize: -1,
        redirectURL: res.redirectUrl || "",
        bodySize: bodySize,
        content: {
            size: bodySize,
            mimeType: res.contentType || ""
        }
    };
}

/**
 * Event handlers that transform PhantomJSCloud events into individual HAR
 * entries. Every handler is called as `handler(value, time, idx)` with the
 * event payload, the event timestamp and the event index.
 *
 * The handlers read and write the shared state variables `_currentPage`,
 * `_resources`, `_cookies` and `_activeNavigation`.
 */
var eventHandlers = {
    // A navigation has been requested. A "main" navigation starts the
    // session over, since it establishes a new load (E.G: loading the
    // "HTTPS" page after a 301 redirect from "HTTP").
    navigationRequested: function(data, time) {
        // If this is not a main navigation event, do nothing...
        if (!data.main) return;

        _debug("New navigation event to '" + data.url + "'. Starting over...");
        _currentPage = {
            "id": "page_1",
            "title": data.url,
            "startedDateTime": time,
            "pageTimings": {
                onContentLoad: 0, // Will be filled by "domReady"
                onLoad: 0         // ...and "loadFinished" events
            }
        };
        _activeNavigation = true;
        _resources = {};
    },

    // Dummy event. Everything is done in "navigationRequested".
    loadStarted: function(data, time) { /** noop handler **/ },

    // Dummy event. Nothing to record for an URL change.
    urlChanged: function(data, time) { /** noop handler **/ },

    // Dummy event. Console messages are ignored.
    consoleMessage: function(data, time) { /** noop handler **/ },

    // Dummy event. Nothing is done with these yet.
    targetUrlReceived: function(data, time) { /** noop handler **/ },

    // Forward browser errors to the error console (stderr).
    browserError: function(data, time) { _error(data.message); },

    // A new resource has been requested. Log this request in "_resources"
    // only while a navigation is active. This prevents adding resources to
    // a different session (E.G: resources requested by a non "main"
    // navigation request).
    resourceRequested: function(data, time) {
        if (!_activeNavigation) return;

        var req = data.resourceRequest;

        // Without an ID the request cannot be matched to its response.
        if (!req.id) {
            _error("WAAAAAT: A resource without ID!!");
            return;
        }
        // A resource with the same ID is already tracked. Report it and
        // overwrite the previous entry anyway.
        if (_resources[req.id]) {
            _error("WAAAAAT: A resource with ID:'" + req.id + "' is already loading?");
        }

        var resource = _resources[req.id] = {
            startedDateTime: req.time,
            cache: {},
            time: -1, // Filled in when the "end" stage is received
            pageref: "page_1"
        };
        resource.request = {
            method: req.method,
            url: req.url,
            queryString: params_to_array(url.parse(req.url, true).query),
            httpVersion: "HTTP/1.1", // No means to know HTTP version. See **1**
            headers: req.headers,
            headersSize: -1,
            bodySize: -1,
            cookies: _cookies // Reuse cookies. See **2**
        };

        if (req.method === "POST") {
            // [[TODO]]
            // An application/x-www-form-urlencoded body is assumed. The
            // Content-Type request header should be inspected to fill
            // this in appropriately. For now the fields are filled so
            // that HAR schema checkers are pleased.
            resource.request.postData = {
                mimeType: "application/x-www-form-urlencoded; charset=UTF-8",
                params: params_to_array(qs.parse(req.postData))
            };
        }
    },

    // A resource has been received. Create the response record on the
    // "start" stage and calculate timings on the "end" stage.
    resourceReceived: function(data, time, idx) {
        if (!_activeNavigation) return;

        // Some "resourceReceived" events contain no "resourceResponse".
        // They might be duplicated events from PhantomJSCloud, since the
        // immediately previous event is for the same URL. See **3**
        if (!data.resourceResponse) return;

        var res = data.resourceResponse;

        // Without an ID the response cannot be matched to its request.
        if (!res.id) {
            _error("WAAAAAT: A resource received without ID!!");
            return;
        }
        // No request was recorded for this ID.
        if (!_resources[res.id]) {
            _error("WAAAAAT: A resource with ID:'" + res.id + "' does not exists!");
            return;
        }

        var resource = _resources[res.id];

        // Just bail out if this request was marked by "resourceError".
        if (resource.errorCode) return;

        if (res.stage === "start") {
            // Remember when the response started so that the "end" stage
            // can split the total time into "wait" and "receive".
            resource.responseStartedDateTime = res.time; // NOTE: Not part of HAR 1.2
            resource.response = _buildHarResponse(res);
        } else if (res.stage === "end") {
            // Special case for responses to entities for "data:*" URLs:
            // they carry no status at all, so drop the entry.
            if (!res.status && !res.statusText) {
                _debug("Deleted empty resource with Content-Type: " + res.contentType);
                delete _resources[res.id];
                return;
            }

            // Some events have no previous "start" stage (seen for 301
            // redirects and 204 No Content). Build the response now.
            if (!resource.response) {
                resource.responseStartedDateTime = res.time; // NOTE: Not part of HAR 1.2
                resource.response = _buildHarResponse(res);
            }

            // Time to calculate timings. Subtracting Dates yields ms.
            var startTime = new Date(resource.startedDateTime);
            var startReceivingTime = new Date(resource.responseStartedDateTime);
            var endTime = new Date(res.time);

            resource.time = endTime - startTime;
            resource.timings = {
                blocked: 0,
                dns: -1,
                connect: -1,
                send: 0,
                wait: startReceivingTime - startTime,
                receive: endTime - startReceivingTime,
                ssl: -1
            };

            // Remove "responseStartedDateTime" since it is not part of HAR 1.2
            delete resource.responseStartedDateTime;
        } else {
            _error("Unkown response stage '" + res.stage + "'");
        }
    },

    // Some events are reported as "resourceError", with a message like
    // "errorCode 5: Operation canceled". The cause is unknown (it might be
    // the QTWebKit cache inside PhantomJS cancelling parallel requests).
    // In any case the entry is replaced by an error marker, which makes
    // "resourceReceived" ignore it and processHAR() filter it out.
    resourceError: function(data, time) {
        if (!_activeNavigation) return;

        var err = data.resourceError;
        _resources[err.id] = {
            errorCode: err.errorCode,
            errorString: err.errorString,
            url: err.url
        };
        _error("Error " + err.errorCode + ": " + err.errorString + " for " + err.url);
    },

    // Record the "onContentLoad" timing on the current page.
    domReady: function(data, time) {
        // No main navigation created the page yet: there is nothing to
        // time against, and "pageTimings" does not exist.
        if (!_currentPage.pageTimings) {
            _error("Ignoring 'domReady' event: no main navigation has started.");
            return;
        }
        var startTime = new Date(_currentPage.startedDateTime);
        var domTime = new Date(time);
        _currentPage.pageTimings.onContentLoad = domTime - startTime;
    },

    // Record the "onLoad" timing and stop attributing events to the page.
    loadFinished: function(data, time) {
        if (_currentPage.pageTimings) {
            var startTime = new Date(_currentPage.startedDateTime);
            var endTime = new Date(time);
            _currentPage.pageTimings.onLoad = endTime - startTime;
        } else {
            _error("Ignoring 'loadFinished' timing: no main navigation has started.");
        }
        // Stop active navigation. This is meant to filter out events for
        // requests made after "loadFinished"; whether that is the right
        // thing to do still needs testing.
        _activeNavigation = false;
    }
};
/**
 * Process the PhantomJSCloud JSON object and create a HAR.
 *
 * Steps:
 *  1. If the response carries base64 encoded content, save it to a file in
 *     the current directory named "<hostname><extension>".
 *  2. Replay every page event through `eventHandlers`.
 *  3. Print the resulting HAR document as JSON to stdout.
 *
 * Exactly one page response is supported. With any other count an error is
 * reported on stderr and the process exits with status 1.
 *
 * @param {Object} obj - The parsed PhantomJSCloud JSON output.
 */
function processHAR(obj)
{
    // Extract the rendered content (E.G: a PNG) to its own file. The write
    // is synchronous: the asynchronous fs.writeFile() requires a callback
    // on current Node versions, and could be cut short by process.exit().
    var content = obj.content;
    if (content && content.data && content.encoding === "base64") {
        try {
            var filename = url.parse(content.url, true).hostname + path.extname(content.name);
            fs.writeFileSync(filename, content.data, content.encoding);
        } catch (err) {
            // Saving the rendered content is a side product; keep going
            // and still produce the HAR.
            _error("Could not save the rendered content: " + err.message);
        }
    }

    // We can only process one page request currently.
    if (!Array.isArray(obj.pageResponses) || obj.pageResponses.length !== 1) {
        _error("Multiple (or none) pages found.");
        _error("I can only process one page at a time!");
        process.exit(1);
    }

    var page = obj.pageResponses[0];
    // "platformVersion" is a JSON encoded string holding major/minor/patch.
    var version = JSON.parse(obj.meta.backend.platformVersion);

    _theHAR = {
        "log": {
            "version": "1.2",
            "creator": {
                "name": "PhantomJSCloud",
                "version": obj.meta.backend.id,
                "comment": obj.meta.about
            },
            "browser": {
                "name": obj.meta.backend.platform + " " + obj.meta.backend.os,
                "version": version.major + "." + version.minor + "." + version.patch
            },
            "pages": [],
            "entries": []
        }
    };

    _cookies = page.cookies;

    page.events.forEach(function(event, idx) {
        var handler = eventHandlers[event.key];
        if (handler) {
            handler(event.value, event.time, idx);
        } else {
            _error("No handler for event '" + event.key + "' (idx:" + idx + ")");
        }
    });

    // After all events have been processed, time to finish the HAR
    _theHAR.log.pages.push(_currentPage);
    Object.keys(_resources).forEach(function(id) {
        // Filter out cancelled requests/responses
        if (!_resources[id].errorCode) {
            _theHAR.log.entries.push(_resources[id]);
        }
    });

    // Outputs the final HAR. This will go to stdout so we can pipe it out
    console.log(JSON.stringify(_theHAR, null, 4));

    // Some statistics to stderr. Uncomment to test differences in timing.
    // _debug("Metrics as reported by PhantomJSCloud:");
    // _debug(" * Elapsed time ms: " + page.metrics.elapsedMs);
    // _debug(" * Starting time: " + page.metrics.startTime);
    // _debug(" * Ending time: " + page.metrics.endTime);
    //
    // var startHARTime = new Date(_theHAR.log.pages[0].startedDateTime);
    // var endHARTime = new Date(startHARTime.getTime() + _theHAR.log.pages[0].pageTimings.onLoad);
    // _debug("Calculated HAR metrics");
    // _debug(" * Elapsed time ms: " + _theHAR.log.pages[0].pageTimings.onLoad);
    // _debug(" * Starting time: " + _theHAR.log.pages[0].startedDateTime);
    // _debug(" * Ending time: " + endHARTime.toISOString());
}
// Entry point: read the PhantomJSCloud JSON from stdin and, once it has been
// parsed, hand it to processHAR() which prints the HAR to stdout.
fetchJSON(processHAR);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment