Skip to content

Instantly share code, notes, and snippets.

@blahah
Created May 9, 2014 09:29
Show Gist options
  • Save blahah/70cf789e5f37c06ae1bf to your computer and use it in GitHub Desktop.
Save blahah/70cf789e5f37c06ae1bf to your computer and use it in GitHub Desktop.
Basic proof of principle: scraping academic sites by emulated browsing in Node.js with Phantom, Casper, Spooky and JSDom
#!/usr/bin/env node
var Spooky = require('spooky');
var dom = require('jsdom').jsdom;
var xpath = require('jsdom-xpath');
// Set up the Spooky agent, load the article page, and hand the page's HTML
// back to this script through a custom 'pagedownload' event.
var spooky = new Spooky({
  child: {
    transport: 'http'
  },
  casper: {
    logLevel: 'debug',
    verbose: true
  }
}, function (err) {
  if (err) {
    // Declared with `var` so the error object does not leak as an implicit
    // global (which would also be a ReferenceError under strict mode).
    var e = new Error('Failed to initialize SpookyJS');
    e.details = err;
    throw e;
  }

  spooky.start("https://peerj.com/articles/384");
  spooky.then(function () {
    // NOTE(review): `this` is presumably the Casper instance here, since the
    // step function is handed to Spooky rather than called locally — confirm
    // against the SpookyJS documentation.
    this.emit('pagedownload', this.evaluate(function () {
      // Serialize the whole document starting at its root element.
      return document.documentElement.outerHTML;
    }));
  });
  spooky.run();
});
// Handle the 'pagedownload' event: parse the received HTML string into a DOM
// and print the content of the page's `citation_doi` <meta> tag.
spooky.on('pagedownload', function (html) {
  var doc = new dom(html);
  var matches = xpath("//meta[@name='citation_doi']", doc);
  var doi = matches && matches[0];
  if (!doi) {
    // Without this guard a page lacking the tag fails with an uninformative
    // "Cannot read property 'content' of undefined" TypeError.
    console.error('No citation_doi meta tag found in the downloaded page');
    return;
  }
  console.log(doi.content);
});
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment