Skip to content

Instantly share code, notes, and snippets.

@josephwegner
Created July 27, 2012 22:03
Show Gist options
  • Save josephwegner/3190686 to your computer and use it in GitHub Desktop.
Save josephwegner/3190686 to your computer and use it in GitHub Desktop.
Scrape with PhantomJS
/**
 * PhantomJS scraper.
 *
 * Usage: phantomjs <this-script> <url>
 *
 * Loads <url>, rewrites every <img> so that its rendered height/width and
 * its resolved `src` are written back as explicit attributes, then prints a
 * single JSON object to stdout:
 *
 *   success: { "<final location.href>": "<page HTML>" }
 *   failure: { "error": "<reason>" }
 *
 * Written in ES5 on purpose: PhantomJS does not support modern syntax.
 */
var page = require('webpage').create();

// Pages whose serialized content is not longer than this are treated as
// failed loads.
var MIN_CONTENT_LENGTH = 500;

// Give up on the page load after this many milliseconds.
var LOAD_TIMEOUT_MS = 15000;

/**
 * Returns the command-line arguments passed to the script (script path
 * excluded).
 *
 * `phantom.args` only exists on PhantomJS 1.x; it was removed in 2.0, where
 * the `system` module must be used instead. `system.args[0]` is the script
 * path, so it is dropped to match the shape of the legacy `phantom.args`.
 */
function getScriptArgs() {
  if (phantom.args) {
    return phantom.args;
  }
  return Array.prototype.slice.call(require('system').args, 1);
}

/**
 * Prints `payload` as JSON on stdout and terminates PhantomJS.
 * The exit status is deliberately left at its default so callers that
 * inspect it see the same value as before.
 */
function finish(payload) {
  console.log(JSON.stringify(payload));
  phantom.exit();
}

/**
 * Runs inside the page context: writes each image's current height, width
 * and `src` property values back as attributes, so the serialized HTML
 * carries them explicitly. Returns the document's current URL.
 */
function freezeImages() {
  var imgs = document.getElementsByTagName('img');
  for (var i = 0, max = imgs.length; i < max; i++) {
    var img = imgs[i];
    img.setAttribute('height', img.height);
    img.setAttribute('width', img.width);
    img.setAttribute('src', img.src);
  }
  return window.location.href;
}

var args = getScriptArgs();

if (args.length < 1) {
  finish({error: "No address given"});
} else {
  var address = args[0];

  // Deliberately ignore JavaScript errors raised by the page itself.
  // NOTE(review): presumably this keeps stdout limited to the JSON result - confirm.
  page.onError = function() { };

  // Watchdog: page.open() is asynchronous, so this fires only if the load
  // callback below has not already exited the process.
  setTimeout(function() {
    finish({error: "Timed out"});
  }, LOAD_TIMEOUT_MS);

  page.open(address, function(status) {
    if (status !== "success") {
      finish({error: "Failed to load page"});
      return;
    }
    if (page.content.length <= MIN_CONTENT_LENGTH) {
      finish({error: "Page content too short"});
      return;
    }

    var href = page.evaluate(freezeImages);

    // Read page.content again *after* evaluate() so the attribute changes
    // made by freezeImages are part of the emitted HTML.
    var content = {};
    content[href] = page.content;
    finish(content);
  });
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment