Skip to content

Instantly share code, notes, and snippets.

@Je55eah
Last active June 7, 2017 21:40
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save Je55eah/ccaad8fbca4525b4c360 to your computer and use it in GitHub Desktop.
Save Je55eah/ccaad8fbca4525b4c360 to your computer and use it in GitHub Desktop.
Website Crawler using PhantomJS
// Crawl configuration and shared state.
var targetAddress = "http://www.autohotkey.com/board/", // first page fetched; its hostname defines the "home" site
    fileCount = 0; // running number used both as the PDF file name and in the console log line
/**
 * Global PhantomJS error hook: prints the error message followed by one
 * line per stack frame, then terminates the process with exit code 1.
 *
 * @param {string} msg   Error message supplied by PhantomJS.
 * @param {Array}  trace Stack frames; each may carry `file` or `sourceURL`,
 *                       `line`, and optionally `function`.
 */
phantom.onError = function (msg, trace) {
  var report = ['PHANTOM ERROR: ' + msg];
  var hasFrames = trace && trace.length;

  if (hasFrames) {
    report.push('TRACE:');
    trace.forEach(function (frame) {
      var origin = frame.file || frame.sourceURL;
      var where = '';
      if (frame.function) {
        where = ' (in function ' + frame.function + ')';
      }
      report.push(' -> ' + origin + ': ' + frame.line + where);
    });
  }

  // Reported through console.log (not console.error), one message for the whole report.
  console.log(report.join('\n'));
  phantom.exit(1);
};
// URL parsing helper, loaded from a local copy of https://gist.github.com/Yaffle/1088850
phantom.injectJs('includes/URLutils.js');
var URL = URLUtils; // the rest of the script refers to the helper as `URL`

// Browser object: owns the current web page and knows how to load and process one address.
var browser = { name: 'browser' };
browser.initPage = function () {
var webPage = require('webpage');
var page = webPage.create();
page.onConsoleMessage = function(msg) {
console.log(msg);
};
// page.onError = function (msg, trace) {
// var msgStack = ['PAGE ERROR: ' + msg];
// if (trace && trace.length) {
// msgStack.push('TRACE:');
// trace.forEach(function(t) {
// msgStack.push(' -> ' + (t.file || t.sourceURL) + ': ' + t.line + (t.function ? ' (in function ' + t.function +')' : ''));
// });
// }
// //console.error(msgStack.join('\n'));
// //console.log(msgStack.join('\n'));
// throw new Error(msgStack.join('\n'));
// //phantom.exit(1);
// };
page.viewportSize = { width: 1024, height: 1024 };
// dimensions = ""
// size = dimensions.split('*');
// page.paperSize = size.length === 2 ? { width: size[0], height: size[1], margin: '0px' }
// : { format: dimensions, orientation: 'portrait', margin: '1cm' };
//console.log('1. page initialized');
return(page);
};
// Placeholder until the first refreshPage() call installs a real web page.
browser.page = {};

/**
 * Replaces this.page with a brand-new page from initPage().
 *
 * The previous page is closed first when it exposes close(), so that a long
 * crawl (one new page per URL) does not keep every old page alive. The
 * initial placeholder object has no close() and is simply dropped.
 */
browser.refreshPage = function () {
  var previous = this.page;
  if (previous && typeof previous.close === 'function') {
    previous.close();
  }
  this.page = this.initPage();
};
browser.processPage = function (address, mainHostname, callback) {
var page = this.page;
//console.log('processPage has started');
page.open(address, function (status) {
if (status !== 'success') {
console.log('Unable to load '+ address);
phantom.exit(1);
} else {
//var self = this;
window.setTimeout(function() {
var hrefs, title, output;
//console.log(page);
title = page.evaluate(function() {
//console.log(document.title);
return document.title;
});
title = fileCount++;
output = mainHostname + '\\' + title + ".pdf";
//console.log('5. rendering ' + output + '\n');
page.render(output); // save page to pdf using data from page for filename
page.injectJs('includes/jquery.js');
//console.log(page.injectJs('includes/jquery.js') ? "6. jQuery injected successfully." : "jQuery was not injected");
//console.log('7. collecting links');
hrefs = page.evaluate(function() {
var links = [];
$('a').each(function(index, value) {
//console.log($(this).attr('href'));
links.push(btoa($(this).attr('href')));
});
//console.log(links);
return links;
});
//this.refreshPage();
//console.log(hrefs);
callback(hrefs);
}, 5000);
}
});
//console.log('3. processPage has completed');
};
/**
 * Loads one address in a fresh page.
 *
 * @param {string}   address      URL to open.
 * @param {string}   mainHostname Passed through to processPage.
 * @param {Function} callback     Receives the collected links.
 * @param {Object}   cbObject     Value used as `this` when callback runs.
 */
browser.load = function (address, mainHostname, callback, cbObject) {
  var boundCallback;

  // Start from a new page first, then hand processPage a callback bound to its owner.
  this.refreshPage();
  boundCallback = callback.bind(cbObject);
  this.processPage(address, mainHostname, boundCallback);
};
// Controller object: tracks the crawl position, the queue of links still to
// visit (linkList) and the pages already saved (pdfList).
var linkProcessor = {
  name: 'linkProcessor',
  mainURL: new URL(targetAddress.toLowerCase()),
  linkList: [],
  pdfList: [],
  newLinkHandler: {}
};
linkProcessor.mainHostname = linkProcessor.mainURL.hostname;
linkProcessor.url = linkProcessor.mainURL; // the crawl starts at the main URL

// Mailbox object: receives the links found on the page currently being processed.
var newLinkHandler = linkProcessor.newLinkHandler;
newLinkHandler.name = 'newLinkHandler';
/**
 * @returns {Array|undefined} The stored links (this.hrefs), whatever it currently holds.
 */
newLinkHandler.get = function () {
  var stored = this.hrefs;
  return stored;
};
/**
 * @returns {boolean|undefined} The current value of this.ready; set() makes
 *                              it true and reset() makes it false.
 */
newLinkHandler.isReady = function () {
  var readyFlag = this.ready;
  return readyFlag;
};
/**
 * Stores the links found on a page and marks the handler as ready.
 *
 * The incoming entries are base64-encoded (browser.processPage encodes them
 * with btoa); they are decoded into a NEW array so that this.hrefs holds
 * only decoded links and the caller's array is left untouched.
 *
 * @param {Array<string>|null|undefined} hrefsArray Base64-encoded hrefs; a
 *        missing value is treated as an empty list.
 */
newLinkHandler.set = function (hrefsArray) {
  var encoded = hrefsArray || [];
  this.hrefs = encoded.map(function (entry) {
    return atob(entry);
  });
  this.ready = true;
};
/**
 * Clears the handler before a new page is loaded: not ready, no links.
 */
newLinkHandler.reset = function () {
  this.ready = false;
  this.hrefs = [];
};
/**
 * Polls `testFx` until it returns a truthy value, then runs `onReady` once.
 * Adapted from https://github.com/ariya/phantomjs/blob/master/examples/waitfor.js
 *
 * The polling interval is cleared BEFORE onReady runs (so a throwing or
 * re-entrant onReady cannot leave it firing), and onReady runs on the same
 * tick that first sees the condition fulfilled. If the condition is still
 * false once the timeout has elapsed, "time out error" is logged and
 * PhantomJS exits with code 1.
 *
 * @param {Function|string} testFx   Condition to poll. A string is eval'd.
 * @param {Function|string} onReady  Action to run once. A string is eval'd.
 * @param {number} [timeOutMillis]      Maximum wait; defaults to 30000 ms (30 s).
 * @param {number} [pollIntervalMillis] Delay between checks; defaults to 3000 ms (3 s).
 */
linkProcessor.waitFor = function (testFx, onReady, timeOutMillis, pollIntervalMillis) {
  var maxtimeOutMillis = timeOutMillis ? timeOutMillis : 30000;
  var pollMillis = pollIntervalMillis ? pollIntervalMillis : 3000;
  var start = new Date().getTime();
  var interval = setInterval(function () {
    var timedOut = (new Date().getTime() - start) >= maxtimeOutMillis;
    var condition = false;

    if (!timedOut) {
      // NOTE(review): string arguments are eval'd for compatibility with the
      // upstream example; never pass untrusted text here.
      condition = (typeof testFx === 'string' ? eval(testFx) : testFx());
    }

    if (condition) {
      clearInterval(interval);
      if (typeof onReady === 'string') {
        eval(onReady);
      } else {
        onReady();
      }
    } else if (timedOut) {
      clearInterval(interval);
      console.log("time out error");
      phantom.exit(1);
    }
  }, pollMillis);
};
/**
 * Continues the crawl after the current page has been processed.
 *
 * If the current page belongs to the main hostname, each link collected by
 * newLinkHandler is normalised and queued in linkList unless it is already
 * queued, already saved (pdfList) or is the current page itself. The current
 * page is then recorded in pdfList and the next queued link (taken from the
 * end of linkList) is started; when the queue is empty, 'finished' is logged
 * and PhantomJS exits.
 *
 * Link handling:
 *  - links starting with "/" are resolved against the current page URL
 *    (a bare hostname is not a valid base URL);
 *  - anything that is not http:// or https:// afterwards is discarded
 *    (other relative forms, javascript:, mailto:, ... are not handled yet);
 *  - only the scheme and authority are lowercased, because the path and
 *    query of a URL are case-sensitive.
 *
 * @param {Object} browser Browser object handed on to start().
 */
linkProcessor.resume = function (browser) {
  // Returns a crawlable absolute http(s) URL for rawHref, or null to discard it.
  function toCrawlableLink(rawHref, baseHref) {
    var link = rawHref;
    if (typeof link !== 'string') {
      return null;
    }
    if (link.charAt(0) === '/') {
      link = new URL(link, baseHref).href;
    }
    if (!/^https?:\/\//i.test(link)) {
      return null;
    }
    return link.replace(/^https?:\/\/[^\/?#]*/i, function (authority) {
      return authority.toLowerCase();
    });
  }

  var hrefs, link, n, count;

  if (this.url.hostname === this.mainHostname) { // only follow links found on pages of the main site
    hrefs = this.newLinkHandler.hrefs || [];
    for (n = 0, count = hrefs.length; n < count; ++n) {
      link = toCrawlableLink(hrefs[n], this.url.href);
      if (link !== null &&
          this.linkList.indexOf(link) === -1 &&
          this.pdfList.indexOf(link) === -1 &&
          this.url.href !== link) {
        this.linkList.push(link);
      }
    }
  }

  this.pdfList.push(this.url.href);

  if (this.linkList.length > 0) {
    this.url = new URL(this.linkList.pop());
    this.start(browser);
  } else {
    console.log('finished');
    phantom.exit();
  }
};
/**
 * Begins processing of the current URL (this.url).
 *
 * Logs "<fileCount>.pdf, <url>", clears newLinkHandler, asks the browser to
 * load the page (links are delivered to newLinkHandler.set with
 * newLinkHandler as `this`), then waits until the handler reports ready
 * before calling resume(browser).
 *
 * @param {Object} browser Object providing load(address, hostname, callback, cbObject).
 */
linkProcessor.start = function (browser) {
  var handler = this.newLinkHandler;
  var address = this.url.href;

  console.log(fileCount + '.pdf, ' + address);
  handler.reset();

  browser.load(address, this.mainHostname, handler.set, handler);

  // Both callbacks are bound so they keep their owning object as `this`.
  var linksAreReady = handler.isReady.bind(handler);
  var continueCrawl = this.resume.bind(this, browser);
  this.waitFor(linksAreReady, continueCrawl);
};
(function initFilesystem(hostname) {
//console.log('preparing the filesystem');
var fs = require('fs');
var path = hostname + '/';
if(fs.isDirectory(path) == false) {
if(fs.makeDirectory(path)) {
//console.log( path + ' directory was created.');
} else {
console.log('error: ' + path + ' directory could not be created.');
}
} else {
//console.log( path + " directory already exists.");
};
})(linkProcessor.mainHostname);
//console.log('beginning');
// Script entry point. First print the column header for the console output;
// every page started afterwards logs one "<number>.pdf, <url>" line beneath it.
console.log('file name, url address');
// Start the crawl at linkProcessor.url (initially the main URL).
linkProcessor.start(browser);
@Je55eah
Copy link
Author

Je55eah commented May 17, 2015

As of revision 8, I make no guarantees, but the basic system is working and it will populate a directory with PDFs. Save the console output to a CSV file to create a lookup table. Fixes and improvements are welcome. Thanks for your interest and time.

@Je55eah
Copy link
Author

Je55eah commented May 19, 2015

--========: Some notes :========--
http://www.crmarsh.com/phantomjs/

@Je55eah
Copy link
Author

Je55eah commented May 19, 2015

Next on the agenda:
save targeted data rather than pdf page images.
clean up the code, optimize the convoluted callbacks and waits if possible.
make it more fault tolerant
handle the non-standard links, including javascript links
your ideas

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment