Skip to content

Instantly share code, notes, and snippets.

@kaizhu256
Created April 16, 2015 03:23
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save kaizhu256/cbbf499fb7d6918060cf to your computer and use it in GitHub Desktop.
Save kaizhu256/cbbf499fb7d6918060cf to your computer and use it in GitHub Desktop.
phantomjs web crawler
/*jslint
bitwise: true,
browser: true,
maxerr: 8,
maxlen: 96,
node: true,
nomen: true,
stupid: true
*/
(function (self) {
'use strict';
var local;
// run shared js-env code
(function () {
    /*
    this function will init the shared local object:
    the detected js-env name, the debug_print helper, and the default timeout
    */
    var detectBrowser, detectNode, detectPhantom;
    detectPhantom = function () {
        /*
        this function will return 'phantom' (or a falsy value) in the phantomjs js-env,
        and throw if self.phantom or require does not exist
        */
        return self.phantom.version &&
            typeof require('webpage').create === 'function' &&
            'phantom';
    };
    detectNode = function () {
        /*
        this function will return 'node' (or a falsy value) in the node js-env,
        and throw if module, process, or require does not exist
        */
        return module.exports &&
            typeof process.versions.node === 'string' &&
            typeof require('http').createServer === 'function' &&
            'node';
    };
    detectBrowser = function () {
        /*
        this function will return 'browser' (or false) in the browser js-env,
        and throw if navigator or document does not exist
        */
        return typeof navigator.userAgent === 'string' &&
            typeof document.querySelector('body') === 'object' &&
            'browser';
    };
    // init local
    local = {};
    // init modeJs - probe phantom first, then node, then browser,
    // falling through to the next probe only when the current one throws
    try {
        local.modeJs = detectPhantom();
    } catch (errorPhantom) {
        try {
            local.modeJs = detectNode();
        } catch (errorNode) {
            local.modeJs = detectBrowser();
        }
    }
    // init debug_print
    // (the helper's name is assembled at runtime, so its literal name
    // never appears in this file)
    local['debug_print'.replace('_p', 'P')] = function (arg) {
        /*
        this function will both print the arg to stderr and return it
        */
        var argList;
        argList = arguments;
        // save the arguments of the latest call for inspection
        local['_debug_printArguments'.replace('_p', 'P')] = argList;
        // print a banner, then all arguments, then a trailing blank line
        console.error('\n\n\ndebug_print'.replace('_p', 'P'));
        console.error.apply(console, argList);
        console.error();
        // return arg for inspection
        return arg;
    };
    // init timeoutDefault in milliseconds
    local.timeoutDefault = 30000;
}());
switch (local.modeJs) {
// run node js-env code
case 'node':
// require modules
local.child_process = require('child_process');
local.fs = require('fs');
local.path = require('path');
local.repl = require('repl');
local.url = require('url');
// init local properties
local.onError = function (error) {
    /*
    this function will print the error (if any) to stderr,
    then start queued tasks from local.taskDict in sorted-url order
    until npm_config_tasks_running_max tasks are running,
    and exit the process when no task is running
    */
    var keyList, tasksRunningMax, url;
    if (error) {
        // a non-Error value (e.g. a child-process exit-code) has no stack,
        // so print the value itself instead of 'undefined'
        console.error(error.stack || error);
    }
    // init npm_config_tasks_running_max
    process.env.npm_config_tasks_running_max =
        process.env.npm_config_tasks_running_max || 4;
    tasksRunningMax = Number(process.env.npm_config_tasks_running_max) || 0;
    while (true) {
        // re-read taskDict on every pass, so an entry removed while a task
        // was being started is not visited again
        keyList = Object.keys(local.taskDict || {}).sort();
        if (local.tasksRunning >= tasksRunningMax || keyList.length === 0) {
            break;
        }
        url = keyList[0];
        // init urlVisitedDict
        local.urlVisitedDict = local.urlVisitedDict || {};
        // skip a url already started in this run,
        // or whose .json result file already exists on disk
        if (!local.urlVisitedDict[url] &&
                !local.fs.existsSync(encodeURIComponent(url) + '.json')) {
            local.urlVisitedDict[url] = true;
            local.taskDict[url]();
        }
        // the url has been handled either way, so remove it from the queue
        delete local.taskDict[url];
    }
    if (local.tasksRunning === 0) {
        console.log('tasks finished');
        process.exit();
    }
};
// start repl
global.local = local;
local.repl.start({ useGlobal: true });
local.processSpawn = function (arg0, argList) {
    /*
    this function will spawn the command arg0 with the arguments
    [__filename].concat(argList) to crawl the url argList[0],
    and after the child exits, read the .json file it saved
    and queue its same-host links in local.taskDict
    (arg0 and argList default to process.argv[2] and process.argv.slice(3))
    */
    var childProcess, modeNext, onNext, timerTimeout, url0;
    modeNext = 0;
    onNext = function (error, data) {
        /* jslint-indent-begin 16 */
        /*jslint maxlen: 112*/
        var host0, urlDepthMax;
        // an Error argument routes to the default case, anything else advances one step
        modeNext = error instanceof Error
            ? NaN
            : modeNext + 1;
        try {
            switch (modeNext) {
            // step 1 - spawn the child-process and its watchdog
            case 1:
                arg0 = arg0 || process.argv[2];
                argList = argList || process.argv.slice(3);
                url0 = local._debugUrl0 = argList[0];
                // init and increment tasksRunning
                local.tasksRunning = local.tasksRunning || 0;
                local.tasksRunning += 1;
                childProcess = local.child_process.spawn(
                    arg0,
                    [__filename].concat(argList),
                    { stdio: ['ignore', 1, 2] }
                )
                    .on('error', local.onError)
                    .on('exit', function (exitCode, signal) {
                        // the child is gone, so stop the watchdog,
                        // or it could later kill -9 an unrelated process reusing the pid
                        if (timerTimeout) {
                            timerTimeout.kill();
                        }
                        onNext(exitCode, signal);
                    });
                // init timerTimeout - a shell watchdog that will kill -9 the child
                // after local.timeoutDefault milliseconds
                timerTimeout = local.child_process.spawn('/bin/sh', [
                    '-c',
                    'sleep ' +
                        // coerce to finite integer
                        ((0.001 * local.timeoutDefault) | 0) +
                        '; kill -9 ' + childProcess.pid + ' 2>/dev/null'
                ], { stdio: 'ignore' });
                // unref has no documented return value,
                // so keep the handle itself instead of the result of unref
                timerTimeout.unref();
                childProcess.timerTimeout = timerTimeout;
                break;
            // step 2 - the child exited, so read its result and queue its links
            case 2:
                local.tasksRunning -= 1;
                // this will throw if the child was killed before it saved the file
                data = local._debugData = JSON.parse(
                    local.fs.readFileSync(encodeURIComponent(url0) + '.json', 'utf8')
                );
                // init taskDict
                local.taskDict = local.taskDict || {};
                // init npm_config_url_depth
                process.env.npm_config_url_depth = process.env.npm_config_url_depth || 1;
                // a url with depth n splits on '/' into at most n + 3 parts
                urlDepthMax = Number(process.env.npm_config_url_depth) + 3;
                host0 = local.url.parse(url0).host;
                (data.aHrefList || []).forEach(function (url) {
                    // queue only unvisited, same-host urls without query or fragment,
                    // that are within the depth limit
                    if (!(/[\?\#]/).test(url) &&
                            !(local.urlVisitedDict || {})[url] &&
                            url.split('/').length <= urlDepthMax &&
                            local.url.parse(url).host === host0) {
                        local.taskDict[url] = local.taskDict[url] || function () {
                            local.processSpawn(arg0, [url].concat(argList.slice(1)));
                        };
                    }
                });
                onNext();
                break;
            // step 3 or Error - hand over to onError, which also starts the queued tasks
            default:
                local.onError(error);
            }
        } catch (errorCaught) {
            // report the exception that was actually thrown,
            // not the argument this step was called with
            local.onError(errorCaught);
        }
        /* jslint-indent-end */
    };
    onNext();
};
local.processSpawn();
break;
// run phantom js-env code
case 'phantom':
// require modules
local.fs = require('fs');
local.system = require('system');
local.webpage = require('webpage');
// init url from the command-line
local.url = local.system.args[1];
// init file - the output filenames are derived from the uri-encoded url
local.fileScreenCapture = encodeURIComponent(local.url) + '.png';
local.fileJson = encodeURIComponent(local.url) + '.json';
// init webpage
local.page = local.webpage.create();
// init webpage clipRect
local.page.clipRect = { height: 768, left: 0, top: 0, width: 1024 };
// init webpage viewportSize
local.page.viewportSize = { height: 768, width: 1024 };
// init webpage error handling - http://phantomjs.org/api/webpage/handler/on-error.html
// local.onError is only defined in the node js-env and is undefined here,
// so define a handler that prints the page's javascript errors to stderr
local.page.onError = function (message, trace) {
    console.error(local.url, message);
    (trace || []).forEach(function (frame) {
        console.error('    at ' + frame.file + ':' + frame.line);
    });
};
// pipe webpage console.log to stderr
local.page.onConsoleMessage = function () {
    console.error.apply(
        console,
        [local.url].concat(Array.prototype.slice.call(arguments))
    );
};
// init userAgent
// (the two halves are joined by a space, as in a real safari user-agent)
local.page.settings.userAgent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) ' +
    'AppleWebKit/537.78.2 (KHTML, like Gecko) Version/7.0.6 Safari/537.78.2';
// init time
local.timeBegin = Date.now();
// open requested webpage
// NOTE(review): phantomjs documents this callback as receiving a single status string
// ('success' or 'fail'), so error presumably holds that status
// and trace is presumably undefined - confirm
local.page.open(local.url, function (error, trace) {
    local.timeEnd = Date.now();
    local.timeElapsed = local.timeEnd - local.timeBegin;
    // wait 5000 ms before saving - presumably to let the page's own scripts finish
    setTimeout(function () {
        // save data
        local.fs.write(local.fileJson, JSON.stringify({
            args: local.system.args,
            // evaluate runs inside the page and returns the href of every <a> element
            aHrefList: local.page.evaluate(function () {
                return Array.prototype.slice
                    .call(document.querySelectorAll('a'))
                    .map(function (element) {
                        return element.href;
                    });
            }),
            fileJson: local.fileJson,
            fileScreenCapture: local.fileScreenCapture,
            page: local.page,
            timeBegin: local.timeBegin,
            timeCreated: new Date(local.timeBegin).toISOString(),
            timeEnd: local.timeEnd,
            timeElapsed: local.timeElapsed,
            error: error,
            trace: trace,
            url: local.url
        }, null, 4));
        // render page
        local.page.render(local.fileScreenCapture);
        console.log('created ' + local.fileJson);
        console.log('created ' + local.fileScreenCapture);
        self.phantom.exit();
    }, 5000);
});
break;
}
}(this));
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment