Skip to content

Instantly share code, notes, and snippets.

@franperezlopez
Last active December 4, 2018 05:19
Show Gist options
  • Save franperezlopez/a5d2d6c9b99d501ea4195a36928f7c0c to your computer and use it in GitHub Desktop.
Save franperezlopez/a5d2d6c9b99d501ea4195a36928f7c0c to your computer and use it in GitHub Desktop.
nightmare + vo
const Nightmare = require ("nightmare");
const vo = require("vo");
/**
 * Loads `url` in a hidden Nightmare (Electron) instance, optionally injects
 * jQuery, evaluates `dataScraper` inside the page, runs the `urlScraper`
 * generator workflow through `vo`, and always shuts the instance down.
 *
 * @param url          Address handed to `nightmare.goto`.
 * @param dataScraper  Function evaluated in the loaded page (client side) via
 *                     `nightmare.evaluate`; skipped when null.
 * @param urlScraper   Generator function that receives the Nightmare instance and
 *                     is driven by `vo`; skipped when null.
 * @param injectJQuery When true (the default), injects `client/jquery.js` into the page.
 * @param validate     Optional callback invoked with the scraped item when that item is non-null.
 * @returns The promise chain started by `vo`. Workflow errors are logged, not
 *          re-thrown; the chain resolves with `{ item, urls }` (either may be
 *          null when its step was skipped or did not complete).
 */
function scrape (url: string, dataScraper: Function | null, urlScraper: Function | null,
injectJQuery: boolean = true, validate: Function | null = null)
{
  let item: any = null;
  let urls: any = null;
  // do not show window, do not load images (reduce time to ready())
  const nightmare = Nightmare({show: false, pollInterval: 800, webPreferences: {images: false}});

  function * workflow () {
    try {
      // load url in Electron instance
      yield nightmare.goto(url);
      if (injectJQuery) {
        // injects jQuery library into loaded page (inserts script tag inside DOM);
        // forward slash so the relative path resolves on every platform
        yield nightmare.inject("js", "client/jquery.js");
      }
      if (dataScraper != null) {
        // evaluate javascript function into loaded page (client side)
        item = yield nightmare.evaluate(dataScraper);
      }
      if (urlScraper != null) {
        // executes url workflow and returns result
        urls = yield vo(urlScraper(nightmare));
      }
    } finally {
      // destroys Electron instance, also when one of the steps above failed,
      // so a failing scrape does not leave the instance running
      yield nightmare.end();
    }
  }

  return vo(workflow)
    .catch((error) => {console.log(error);})
    .then(() => {
      if (item != null && validate != null) {
        validate(item);
      }
      // hand the collected data back instead of discarding it
      return { item, urls };
    });
}
/**
 * Generator workflow that collects link targets from a paginated listing.
 *
 * Each round it yields a page-side evaluation that maps every `.listing a`
 * element to its `href`, appends the result to the running list, then yields
 * a second evaluation that reports whether a `.pagination a.next` element
 * exists. While one exists it yields a click on that element and repeats.
 *
 * @param nightmare Object exposing `evaluate` and `click`; whatever drives the
 *                  generator is expected to resolve each yielded value and
 *                  send the result back in.
 * @returns The accumulated list of values sent back for the link evaluations.
 */
function * urlScraper (nightmare) {
  let collected = [];
  for (;;) {
    // runs inside the loaded page (client side)
    // tip: do not return jQuery objects to the node.js side
    const pageLinks = yield nightmare
      .evaluate (function () { return jQuery.map($(".listing a"), (d) => d.href); });
    collected = collected.concat(pageLinks);

    // runs inside the loaded page (client side): is there a "next" link?
    const hasNext: boolean = yield nightmare.evaluate(function () {return $(".pagination a.next").length > 0;});
    if (!hasNext) {
      break;
    }
    // triggers click event into loaded page to load next page
    yield nightmare.click(".pagination a.next");
  }
  return collected;
}
// Example run: read the text of the ".email" element(s) and collect the listing URLs.
// The data scraper is a function literal with a quoted selector; it is evaluated
// inside the loaded page, where the injected jQuery supplies `$`.
scrape("http://anywebyouwanttoscrape.com", function () { return $(".email").text(); }, urlScraper, true, null);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment