Skip to content

Instantly share code, notes, and snippets.

@alexey-bass
Last active October 30, 2018 13:19
Show Gist options
  • Save alexey-bass/b6f0524e3cab00816a97 to your computer and use it in GitHub Desktop.
Save alexey-bass/b6f0524e3cab00816a97 to your computer and use it in GitHub Desktop.
Phantom Crawler
phantomjs --disk-cache=no crawler.js "http://yad2.co.il"
/**
 * Writes a message to the console, prefixed with the current time.
 *
 * @param {*} msg - Value to print; it is string-concatenated after the
 *     timestamp and a ': ' separator.
 */
function log(msg) {
    var stamp = getTime();
    console.log(stamp + ': ' + msg);
}
/**
 * Formats the current local time as HH:MM:SS.mmm.
 *
 * Every field is zero-padded on the LEFT, including milliseconds, so that
 * 5 ms is rendered as ".005" (right-padding would turn it into ".500").
 *
 * @returns {string} The formatted timestamp.
 */
function getTime() {
    var d = new Date();
    return pad(d.getHours()) +':'+ pad(d.getMinutes()) +':'+ pad(d.getSeconds()) +'.'+ pad(d.getMilliseconds(), 3);
}
/**
 * Left-pads a value with '0' characters until it reaches the target width.
 *
 * @param {*} val - Value to pad; converted with String().
 * @param {number} [len] - Target width; any falsy value falls back to 2.
 * @returns {string} The padded string (unchanged if already wide enough).
 */
function pad(val, len) {
    var text = String(val);
    var width = len || 2;
    var missing;
    // Prepend one zero per missing character.
    for (missing = width - text.length; missing > 0; missing--) {
        text = '0' + text;
    }
    return text;
}
/**
 * Right-pads a value with '0' characters until it reaches the target width.
 *
 * @param {*} val - Value to pad; converted with String().
 * @param {number} [len] - Target width; any falsy value falls back to 2.
 * @returns {string} The padded string (unchanged if already wide enough).
 */
function rpad(val, len) {
    var text = String(val);
    var width = len || 2;
    var missing;
    // Append one zero per missing character.
    for (missing = width - text.length; missing > 0; missing--) {
        text += '0';
    }
    return text;
}
// Report which PhantomJS runtime is executing this script.
log('PhantomJS v' + phantom.version.major + '.' + phantom.version.minor + '.' + phantom.version.patch);

var page = require('webpage').create();
var system = require('system');

// Exactly one argument (the URL) is expected after the script name.
if (system.args.length !== 2) {
    console.log('ERROR: Bad arguments count');
    console.log('Usage: crawler.js URL');
    // NOTE(review): confirm phantom.exit(1) halts the script right here; if
    // execution continues past it, `url` below will be undefined.
    phantom.exit(1);
}

var url = system.args[1];

// Rendering configuration for the page.
page.paperSize = {
    format: 'A4',
    orientation: 'landscape',
    border: '5mm'
};
page.viewportSize = {
    width: 1280,
    height: 800
};
page.zoomFactor = 1;

// Allow each resource (ajax, etc.) up to 300000 ms, i.e. 5 minutes.
page.settings.resourceTimeout = 300000;
// Request hook, intentionally a no-op: nothing is recorded when a request is
// issued. Uncomment the log line below to trace every outgoing request
// (id, method and URL) while debugging.
page.onResourceRequested = function(request) {
// log('REQUEST (#'+ request.id +') '+ request.method +' '+ request.url);
};
// Per-response records, indexed by response id: [contentType, bodySize].
// Index 0 holds a placeholder (0) rather than a real response -- presumably
// because response ids start at 1 (TODO confirm).
var resources = [0];

/**
 * Records the content type and body size of a response when its 'start'
 * stage is reported; every other stage is ignored.
 *
 * A response without a content type is filed under 'unknown' instead of
 * throwing on `.replace`, and a missing body size is stored as 0 so that
 * later size totals stay numeric.
 *
 * @param {Object} response - Response descriptor; `id`, `stage`,
 *     `contentType` and `bodySize` are read.
 */
page.onResourceReceived = function(response) {
    // log('RESPONSE(#'+ response.id +', '+ response.stage + '): '+ JSON.stringify(response));
    if (response.stage !== 'start') {
        return;
    }

    // Drop parameters such as "; charset=utf-8" so stats group by media type.
    var contentType = response.contentType
        ? response.contentType.replace(/;.+/, '')
        : 'unknown';
    var bodySize = response.bodySize || 0;

    resources[response.id] = [contentType, bodySize];
};
/**
 * Logs the elapsed load time and status, then prints per-content-type
 * statistics (request count and total size in KB) for recorded responses.
 *
 * The shared `resources` array is only read, never modified, so the handler
 * reports the same numbers if it is invoked more than once. Index 0 is
 * skipped as a placeholder, array holes are ignored, and a missing size
 * counts as 0.
 *
 * @param {string} status - Load status passed in by the page; logged in
 *     upper case.
 */
page.onLoadFinished = function(status) {
    log('Load finished in '+ (Date.now() - timeLoadStart) +' ms. Status: '+ status.toUpperCase());

    var stats = {},       // contentType -> [requestCount, totalBytes]
        totalSize = 0,
        requestCount = 0,
        i, entry, type, size;

    // Start at 1: index 0 is the placeholder, not a response.
    for (i = 1; i < resources.length; i++) {
        entry = resources[i];
        if (!entry) {
            continue; // no record was stored for this id
        }
        type = entry[0];
        size = entry[1] || 0;

        if (!Object.prototype.hasOwnProperty.call(stats, type)) {
            stats[type] = [0, 0];
        }
        stats[type][0]++;
        stats[type][1] += size;
        totalSize += size;
        requestCount++;
    }

    log('Resources stats ('+ requestCount +' requests of '+ Math.round(totalSize / 1024) +' KB):');
    for (type in stats) {
        if (Object.prototype.hasOwnProperty.call(stats, type)) {
            console.log('- '+ type +': '+ stats[type][0] +' req of '+ Math.round(stats[type][1] / 1024) +' KB');
        }
    }
};
// Start the load and remember when it began; the load-finished handler
// reads timeLoadStart to report the elapsed time.
log('Loading URL '+ url);
var timeLoadStart = Date.now();
page.open(url, function (status) {
    log('DONE');
    // Surface a failed load through the process exit code instead of always
    // exiting with 0, so calling scripts can detect the failure.
    phantom.exit(status === 'success' ? 0 : 1);
});
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment