Created
April 16, 2015 03:23
-
-
Save kaizhu256/cbbf499fb7d6918060cf to your computer and use it in GitHub Desktop.
phantomjs web crawler
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/*jslint | |
bitwise: true, | |
browser: true, | |
maxerr: 8, | |
maxlen: 96, | |
node: true, | |
nomen: true, | |
stupid: true | |
*/ | |
(function (self) { | |
'use strict'; | |
var local; | |
// run shared js-env code | |
(function () { | |
// init local | |
local = {}; | |
local.modeJs = (function () {
    /*
        this function will detect which js-env is running this script.
        the probes are tried in the order phantom -> node -> browser,
        and the result of the first probe that does not throw is returned:
        the env name ('phantom', 'node' or 'browser'),
        or a falsy value if one of that probe's checks evaluated falsy
    */
    var probeBrowser, probeNode, probePhantom;
    probePhantom = function () {
        return self.phantom.version &&
            typeof require('webpage').create === 'function' &&
            'phantom';
    };
    probeNode = function () {
        return module.exports &&
            typeof process.versions.node === 'string' &&
            typeof require('http').createServer === 'function' &&
            'node';
    };
    probeBrowser = function () {
        return typeof navigator.userAgent === 'string' &&
            typeof document.querySelector('body') === 'object' &&
            'browser';
    };
    // a probe throws when the globals it touches are missing - try the next probe
    try {
        return probePhantom();
    } catch (errorCaughtPhantom) {
        try {
            return probeNode();
        } catch (errorCaughtNode) {
            // the browser probe is the last resort, so its exceptions are not caught
            return probeBrowser();
        }
    }
}());
// init debug_print
// NOTE(review): the property names are assembled with .replace(), presumably so that
// the final camel-cased name never appears literally in this source - confirm
local['debug_print'.replace('_p', 'P')] = function (arg) {
    /*
        this function will print all of its arguments to stderr,
        and return the first argument unchanged so a call can wrap an expression
    */
    var argList;
    argList = arguments;
    // save the arguments of the most recent call on local
    local['_debug_printArguments'.replace('_p', 'P')] = argList;
    // print a header line, then the arguments, then call console.error with no arguments
    console.error('\n\n\ndebug_print'.replace('_p', 'P'));
    console.error.apply(console, argList);
    console.error();
    // return arg for inspection
    return arg;
};
local.timeoutDefault = 30000; | |
}()); | |
switch (local.modeJs) { | |
// run node js-env code | |
case 'node': | |
// require modules - each builtin is saved on local under its own module name
['child_process', 'fs', 'path', 'repl', 'url'].forEach(function (moduleName) {
    local[moduleName] = require(moduleName);
});
// init local properties
local.onError = function (error) {
    /*
        this function will
        1. print the optional error to stderr
        2. start queued tasks from local.taskDict (in sorted key order),
           until $npm_config_tasks_running_max (default 4) tasks are running
        3. exit the process when no task is running

        a queued task is skipped (but still removed from the queue),
        if its url was already visited or if its <url>.json file already exists
    */
    var ii, key, keyList, task, tasksRunningMax;
    if (error) {
        // values without a stack (e.g. strings or numbers) are printed as-is,
        // instead of printing 'undefined'
        console.error(error.stack || error);
    }
    // init npm_config_tasks_running_max
    process.env.npm_config_tasks_running_max =
        process.env.npm_config_tasks_running_max || 4;
    tasksRunningMax = Number(process.env.npm_config_tasks_running_max) || 0;
    // init urlVisitedDict
    local.urlVisitedDict = local.urlVisitedDict || {};
    // the queue is only appended to asynchronously, so one sorted snapshot is enough
    keyList = Object.keys(local.taskDict || {}).sort();
    for (ii = 0; ii < keyList.length; ii += 1) {
        if (local.tasksRunning >= tasksRunningMax) {
            // leave the remaining tasks queued for the next call
            break;
        }
        key = keyList[ii];
        // a re-entrant call may already have removed this task from the queue
        task = local.taskDict[key];
        if (typeof task === 'function' &&
                !local.urlVisitedDict[key] &&
                !local.fs.existsSync(encodeURIComponent(key) + '.json')) {
            local.urlVisitedDict[key] = true;
            task();
        }
        delete local.taskDict[key];
    }
    if (local.tasksRunning === 0) {
        console.log('tasks finished');
        process.exit();
    }
};
// expose local as a global, so it can be inspected from the repl
global.local = local;
// start repl that evaluates its input in the global scope
local.repl.start({ useGlobal: true });
local.processSpawn = function (arg0, argList) {
    /*
        this function will
        1. spawn the executable <arg0> with the arguments [__filename].concat(argList),
           where argList[0] is the url to crawl
        2. spawn a /bin/sh watchdog that will kill -9 the child
           after local.timeoutDefault milliseconds
        3. after the child exits, read the <url>.json file named after argList[0],
           and queue a new task in local.taskDict for every link in its aHrefList that
           has no query / fragment, is not yet visited, is within $npm_config_url_depth
           (default 1), and has the same host as argList[0]
        4. call local.onError to log any error and to start the queued tasks

        arg0 defaults to process.argv[2], and argList defaults to process.argv.slice(3)
    */
    var childProcess, isSettled, modeNext, onNext, onSettle, timerTimeout, url0;
    modeNext = 0;
    onSettle = function (error) {
        /*
            this function will handle the end of the spawned child exactly once,
            because node may emit 'error' only, 'exit' only, or both for one child
        */
        if (isSettled) {
            return;
        }
        isSettled = true;
        local.tasksRunning -= 1;
        // stop the watchdog shell, so it cannot later kill an unrelated process
        // that happens to reuse the pid of the finished child
        if (timerTimeout) {
            timerTimeout.kill();
        }
        // forward only real errors - the exit-code passed by 'exit' is not an error
        onNext(error instanceof Error
            ? error
            : undefined);
    };
    onNext = function (error, data) {
        /* jslint-indent-begin 16 */
        /*jslint maxlen: 112*/
        var host0, urlLengthMax;
        // an Error jumps to the default case, otherwise advance to the next step
        modeNext = error instanceof Error
            ? NaN
            : modeNext + 1;
        try {
            switch (modeNext) {
            case 1:
                arg0 = arg0 || process.argv[2];
                argList = argList || process.argv.slice(3);
                url0 = local._debugUrl0 = argList[0];
                // init and increment tasksRunning
                local.tasksRunning = local.tasksRunning || 0;
                local.tasksRunning += 1;
                childProcess = local.child_process.spawn(
                    arg0,
                    [__filename].concat(argList),
                    { stdio: ['ignore', 1, 2] }
                )
                    .on('error', onSettle)
                    .on('exit', onSettle);
                // init timerTimeout
                timerTimeout = local.child_process.spawn('/bin/sh', [
                    '-c',
                    'sleep ' +
                        // coerce to finite integer
                        ((0.001 * local.timeoutDefault) | 0) +
                        '; kill -9 ' + childProcess.pid + ' 2>/dev/null'
                ], { stdio: 'ignore' });
                // unref returns undefined, so it must not be chained into the assignment
                timerTimeout.unref();
                childProcess.timerTimeout = timerTimeout;
                break;
            case 2:
                // this throws if the child was killed before it could write the file
                data = local._debugData = JSON.parse(
                    local.fs.readFileSync(encodeURIComponent(url0) + '.json', 'utf8')
                );
                // init taskDict
                local.taskDict = local.taskDict || {};
                // init npm_config_url_depth
                process.env.npm_config_url_depth = process.env.npm_config_url_depth || 1;
                // a url like 'http://host/a' splits on '/' into 4 parts, hence the + 3
                urlLengthMax = Number(process.env.npm_config_url_depth) + 3;
                host0 = local.url.parse(url0).host;
                // aHrefList may be null if the page could not be evaluated
                (data.aHrefList || []).forEach(function (url) {
                    if (!(/[\?\#]/).test(url) &&
                            !(local.urlVisitedDict || {})[url] &&
                            url.split('/').length <= urlLengthMax &&
                            local.url.parse(url).host === host0) {
                        local.taskDict[url] = local.taskDict[url] || function () {
                            local.processSpawn(arg0, [url].concat(argList.slice(1)));
                        };
                    }
                });
                onNext();
                break;
            default:
                local.onError(error);
            }
        } catch (errorCaught) {
            // report the exception that was actually thrown by the current step
            local.onError(errorCaught);
        }
        /* jslint-indent-end */
    };
    onNext();
};
local.processSpawn(); | |
break; | |
// run phantom js-env code | |
case 'phantom': | |
// require modules
local.fs = require('fs');
local.system = require('system');
local.webpage = require('webpage');
// init url from the first command-line argument after the script name
local.url = local.system.args[1];
// init file - the output filenames are derived from the uri-encoded url
local.fileScreenCapture = encodeURIComponent(local.url) + '.png';
local.fileJson = encodeURIComponent(local.url) + '.json';
// init webpage
local.page = local.webpage.create();
// init webpage clipRect
local.page.clipRect = { height: 768, left: 0, top: 0, width: 1024 };
// init webpage viewportSize
local.page.viewportSize = { height: 768, width: 1024 };
// init webpage error handling - http://phantomjs.org/api/webpage/handler/on-error.html
// local.onError only exists in the node js-env, so a handler is defined here
// that prints the error message and its stack-trace to stderr
local.page.onError = function (message, trace) {
    console.error(local.url, message);
    (trace || []).forEach(function (frame) {
        console.error('    ' + frame.file + ':' + frame.line + (frame['function']
            ? ' (in function ' + frame['function'] + ')'
            : ''));
    });
};
// pipe webpage console.log to stderr
local.page.onConsoleMessage = function () {
    /*
        this function will print the arguments it receives to stderr,
        prefixed with the url of the crawled page
    */
    var argList;
    argList = [local.url];
    argList.push.apply(argList, arguments);
    console.error.apply(console, argList);
};
// init userAgent
// the first literal ends with a space, so that the platform comment
// and the AppleWebKit token are not fused together in the header
local.page.settings.userAgent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) ' +
    'AppleWebKit/537.78.2 (KHTML, like Gecko) Version/7.0.6 Safari/537.78.2';
// init time
local.timeBegin = Date.now();
// open requested webpage
local.page.open(local.url, function (error, trace) {
    /*
        this function will record the elapsed load time,
        and then 5000 ms later, save the json report and the screen capture, and exit
    */
    var saveAndExit;
    local.timeEnd = Date.now();
    local.timeElapsed = local.timeEnd - local.timeBegin;
    saveAndExit = function () {
        var report;
        // the key order below determines the key order in the saved json file
        report = {
            args: local.system.args,
            // collect the href of every <a> element, evaluated inside the webpage
            aHrefList: local.page.evaluate(function () {
                var hrefList;
                hrefList = [];
                Array.prototype.forEach.call(
                    document.querySelectorAll('a'),
                    function (element) {
                        hrefList.push(element.href);
                    }
                );
                return hrefList;
            }),
            fileJson: local.fileJson,
            fileScreenCapture: local.fileScreenCapture,
            page: local.page,
            timeBegin: local.timeBegin,
            timeCreated: new Date(local.timeBegin).toISOString(),
            timeEnd: local.timeEnd,
            timeElapsed: local.timeElapsed,
            error: error,
            trace: trace,
            url: local.url
        };
        // save data
        local.fs.write(local.fileJson, JSON.stringify(report, null, 4));
        // render page
        local.page.render(local.fileScreenCapture);
        console.log('created ' + local.fileJson);
        console.log('created ' + local.fileScreenCapture);
        self.phantom.exit();
    };
    setTimeout(saveAndExit, 5000);
});
break; | |
} | |
}(this)); |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment