Skip to content

Instantly share code, notes, and snippets.

@mattpowell
Last active June 2, 2016 22:31
Show Gist options
  • Save mattpowell/533c90e43808ffc0756b to your computer and use it in GitHub Desktop.
Save mattpowell/533c90e43808ffc0756b to your computer and use it in GitHub Desktop.
Extracts info from a Cheerio-based DOM using a JSON query object.

Usage

Code:

// Usage example: build an extractor from an HTML string and pull out a
// structured result described by a query object.
var DomExtractor = require('./index.js');

// Sample markup. Every <span> and the <ul> are explicitly closed so the
// example does not depend on the parser's recovery from malformed HTML.
var html = '<h3 data-universe="marvel">List of Superheros:</h3><ul class="heros"><li data-id="1"><span>Spiderman</span></li><li data-id="2"><span>Captain America</span></li></ul>';

var query = {
  // Trailing [attr] => read that attribute instead of the node's text.
  universe: 'h3[data-universe]',
  heros: {
    // `_` selects the repeated nodes; the other keys are evaluated per match.
    _: 'ul.heros li',
    name: 'span',
    // Leading `@` => read from the matched <li> itself, not a descendant.
    id: '@[data-id]'
  }
};

var results = (new DomExtractor(html)).extract(query);
console.log(results);

Output:

{ universe: 'marvel',
  heros:
   [ { name: 'Spiderman', id: '1' },
     { name: 'Captain America', id: '2' } ] }
var cheerio = require('cheerio');
/**
 * Loads an HTML string with Cheerio and extracts data from it according to a
 * declarative query object.
 *
 * Query format (each own key of the query becomes a key of the result):
 *   - String value: a selector resolved relative to the current root.
 *       - A trailing `[attr]` makes the result that attribute's value instead
 *         of the matched node's text. Operators and values are allowed
 *         (e.g. `a[href^="http"]` yields the `href` value).
 *       - A leading `@` reads from the current root node itself instead of
 *         querying for a descendant.
 *       - If the selector matches nothing, the current root node is used.
 *   - Object value: a nested query. Its `_` key is the selector for the
 *     repeated nodes; the nested query is evaluated against each match and the
 *     result is an array with one object per match.
 *   - A string-valued `_` key is only a selector and never appears in the
 *     result.
 *   - `null` / `undefined` values are ignored.
 *
 * @constructor
 * @param {string} html - Markup handed to `cheerio.load`.
 */
var DomExtractor = module.exports = function(html) {
  var $ = cheerio.load(html);
  var hasOwn = Object.prototype.hasOwnProperty;

  // Contents of a trailing `[...]` at the very end of a selector.
  var TRAILING_ATTR = /\[([^\]]+)\]$/;
  // Attribute name inside the brackets, stopping before any operator/value.
  var ATTR_NAME = /^\s*([^\s=~|^$*!]+)/;

  /**
   * Returns the attribute name requested by a selector's trailing `[...]`,
   * or undefined when the selector does not end with an attribute clause.
   */
  function attributeOf(selector) {
    var bracket = selector.match(TRAILING_ATTR);
    if (!bracket) {
      return undefined;
    }
    var nameMatch = bracket[1].match(ATTR_NAME);
    // Fall back to the raw bracket contents if no name could be isolated.
    return nameMatch ? nameMatch[1] : bracket[1];
  }

  /**
   * Evaluates a query object against the document.
   *
   * @param {Object} q - Query object as described on the constructor.
   * @param {*} [root] - Context node that selectors are resolved against;
   *   omitted for a top-level query and set to each matched element when
   *   recursing into a nested query.
   * @returns {Object} One entry per extracted query key.
   */
  var extract = this.extract = function(q, root) {
    var results = {};
    for (var name in q) {
      // Ignore enumerable keys inherited through the prototype chain.
      if (!hasOwn.call(q, name)) {
        continue;
      }
      var selector = q[name];
      if (typeof selector === 'string') {
        if (name !== '_') {
          var attr = attributeOf(selector),
              useRoot = /^@/.test(selector),
              node = useRoot ? $(root) : $(selector, root);
          // Nothing matched: fall back to the context node itself.
          if (!node.length) {
            node = $(root);
          }
          results[name] = attr ? node.attr(attr) : node.text();
        }
      } else if (selector != null) {
        // Nested query: one result object per node matched by `selector._`.
        // Wrapped in an IIFE so the callback closes over this iteration's
        // nested query rather than the shared `var`.
        results[name] = (function(nested) {
          return $(nested._, root).map(function(i, el) {
            return extract(nested, el);
          }).get();
        })(selector);
      }
      // null / undefined query values are skipped instead of throwing.
    }
    return results;
  };

  /**
   * Runs a selector against the loaded document.
   *
   * @param {string} selector - Selector passed straight to Cheerio.
   * @returns {Object} The Cheerio selection.
   */
  this.find = function(selector) {
    return $(selector);
  };
};
{
"name": "DomExtractor",
"version": "1.0.0",
"description": "Extracts info from a Cheerio-based DOM using a json object.",
"author": "Matt Powell",
"dependencies": {
"cheerio": "0.18.0"
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment