Skip to content

Instantly share code, notes, and snippets.

@sqrtsanta
Created February 27, 2017 13:03
Show Gist options
  • Save sqrtsanta/d13d7a93bf909b8ba32c8e3b28ae58f1 to your computer and use it in GitHub Desktop.
Save sqrtsanta/d13d7a93bf909b8ba32c8e3b28ae58f1 to your computer and use it in GitHub Desktop.
Scraper based on Horseman, written in a functional style with Ramda and Folktale's Task
/* global $ document */
const Horseman = require('node-horseman');
const cheerio = require('cheerio');
const R = require('ramda');
const phantomjs = require('phantomjs-prebuilt');
const { Task, mapf } = require('../utils/f.js');
// fetch :: (string, { click: string }?) -> Task Error HTML
//
// Loads `url` in a Horseman-driven PhantomJS page and settles the Task with the
// page's `document.documentElement.outerHTML`.
//
// url      - address to open.
// commands - optional; when `commands.click` holds a selector, the matching
//            element is clicked after the page opens, and the fetch then waits
//            until that selector either matches nothing or is `:hidden`.
//
// The Task is rejected with whatever error the Horseman chain raises.
const fetch = (url, commands = {}) => {
  return new Task((reject, result) => {
    const horseman = new Horseman({ phantomPath: phantomjs.path });

    horseman
      .viewport(1366, 768)
      .userAgent('Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36') // eslint-disable-line
      .open(url)
      // NOTE(review): the callbacks handed to `evaluate`/`waitFor` execute inside
      // the PhantomJS page, not in Node. They are kept as plain ES5 function
      // expressions because PhantomJS 2.x is understood not to parse arrow
      // functions — confirm against the PhantomJS version in use. They also
      // presume `$` (jQuery) is available in the page.
      .evaluate(function (selector) { // eslint-disable-line
        if (selector) {
          $(selector).click();
        }
      }, commands.click)
      .waitFor(function (selector) { // eslint-disable-line
        if (!selector) {
          return true;
        }
        return $(selector).length ? $(selector).is(':hidden') : true;
      }, commands.click, true)
      .evaluate(function () { // eslint-disable-line
        return document.documentElement.outerHTML;
      })
      // Both handlers sit on a single `then` so exactly one of them runs.
      // Chaining `.catch(reject).then(result)` would recover the chain after a
      // failure and additionally call `result(undefined)`.
      .then(body => result(body), error => reject(error))
      .close();
  });
};
// window :: HTML -> $
// Parses an HTML string with cheerio and returns the loaded document function.
// The value is returned directly; it is not wrapped in a Task.
const window = (markup) => {
  return cheerio.load(markup);
};
// dom :: string -> Task Error $
// Runs `fetch` first, then feeds its result through `mapf(window)`.
// Arguments are forwarded to `fetch` unchanged.
const dom = R.pipe(fetch, mapf(window));
// sel :: string -> $ -> Selection
// Curried helper: applies the document function to `query` and returns
// whatever it yields for that selector.
const sel = R.curry((query, load) => {
  const matched = load(query);
  return matched;
});
// absolutize :: string -> string -> string
//
// Curried. Turns `href` into an absolute URL relative to `root`:
//   - an href that already carries a scheme (`http://`, `https://`, `ftp://`,
//     matched case-insensitively) is returned untouched;
//   - a protocol-relative href (`//host/path`) takes the scheme of `root`, or
//     is returned as-is when `root` has no scheme;
//   - anything else is appended to `root` verbatim.
//
// A bare `startsWith('http')` check is deliberately avoided: it would treat
// relative paths such as `httpd/index.html` as absolute.
const absolutize = R.curry((root, href) => {
  const hasScheme = /^[a-z][a-z\d+.-]*:\/\//i.test(href);
  if (hasScheme) {
    return href;
  }

  if (href.startsWith('//')) {
    const rootScheme = /^[a-z][a-z\d+.-]*:/i.exec(root);
    return rootScheme ? `${rootScheme[0]}${href}` : href;
  }

  return `${root}${href}`;
});
// toArray :: { toArray: () -> [a] } -> [a]
// Delegates to the collection's own `toArray` method and returns its result.
const toArray = (collection) => {
  return collection.toArray();
};
// text :: $ -> DOMNode -> string
// Given the document function, returns a reader that wraps a node with it
// and yields the result of calling `.text()` on the wrapper.
const text = (load) => {
  return (element) => {
    const wrapped = load(element);
    return wrapped.text();
  };
};
// attr :: string -> $ -> DOMNode -> string
// Fixes the attribute name first, then the document function, and finally
// reads that attribute from the wrapped node via `.attr(name)`.
const attr = (name) => {
  return (load) => {
    return (element) => {
      const wrapped = load(element);
      return wrapped.attr(name);
    };
  };
};
// selattr :: (string, string) -> ($ -> string)
// Builds a reader that, given the document function, selects `selector` and
// returns the value of `attribute` read from that selection.
const selattr = (selector, attribute) => {
  const readAttr = attr(attribute);
  const select = sel(selector);

  return ($) => {
    const readFrom = readAttr($);
    return readFrom(select($));
  };
};
// selarrattr :: (string, string) -> ($ -> [string])
// Builds a reader that, given the document function, selects `selector`,
// converts the selection to an array and reads `attribute` from every entry.
const selarrattr = (selector, attribute) => {
  const readAttr = attr(attribute);
  const select = sel(selector);

  return ($) => {
    const readFrom = readAttr($);
    const nodes = toArray(select($));
    return R.map(readFrom, nodes);
  };
};
// seltext :: string -> ($ -> string)
// Builds a reader that, given the document function, selects `selector` and
// returns the text of that selection.
const seltext = (selector) => {
  const select = sel(selector);

  return ($) => {
    const readText = text($);
    return readText(select($));
  };
};
// selarrtext :: string -> ($ -> [string])
// Builds a reader that, given the document function, selects `selector`,
// converts the selection to an array and returns the text of every entry.
const selarrtext = (selector) => {
  const select = sel(selector);

  return ($) => {
    const readText = text($);
    const nodes = toArray(select($));
    return R.map(readText, nodes);
  };
};
// Public API of this module. The lower-level helpers defined above (`sel`,
// `toArray`, `text`, `attr`) are not exported.
module.exports = {
fetch,
window,
dom,
selattr,
selarrattr,
seltext,
selarrtext,
absolutize,
};
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment