Skip to content

Instantly share code, notes, and snippets.

@KiaraGrouwstra
Last active October 11, 2020 16:49
Show Gist options
  • Save KiaraGrouwstra/0c46da6ad58c80c97e1ba7c3da2a73e9 to your computer and use it in GitHub Desktop.
Save KiaraGrouwstra/0c46da6ad58c80c97e1ba7c3da2a73e9 to your computer and use it in GitHub Desktop.
browser-dom-scraper
// scrape DOM elements into plain JS objects!
// browser-friendly no-dep version of https://github.com/dijs/parsz
// usage: parsley({title: 'title'})
// Grammar of a parselet KEY: name[?][(selector)][~(linkSelector)]
// Capture groups: 1 = name, 2 = "?" marker (optional key), 3 = selector,
// 4 = link selector. Each parenthesis is individually optional in the pattern,
// and groups 3 and 4 are empty strings (not undefined) when absent.
const keyPattern = /^([\w-]+)(\?)?\(?([^)~]*)\)?~?\(?([^)]*)\)?$/;
// Grammar of a parselet VALUE: [selector][@attr][ | fn]
// Capture groups: 1 = CSS selector, 2 = attribute name, 3 = transform
// expression. Every part is optional; an absent part is undefined.
const selectorPattern = /^([.-\s\w[\]=>#:()]+)?@?([\w-]+)?\s*\|?\s*(.*)?$/;
// NOTE(review): presumably the selector token meaning "the current element
// itself", as in parsz — confirm against the upstream project.
const IDENTITY_SELECTOR = ".";
// Key name marking a "void" entry: parseKey flags it as isVoid, and
// parseObject merges the nested object's entries into the parent result.
const VOID_NAME = "--";
// Evaluate the JavaScript expression `expr` with every own enumerable key of
// `o` available as a local variable, and return the expression's value.
// Technique: http://2ality.com/2012/04/eval-variables.html
// NOTE: this compiles and runs arbitrary code, so `expr` must be trusted.
const evalExpr = (expr, o) => {
  const names = Object.keys(o);
  // The keys become the parameter names; the expression becomes the body.
  const compiled = new Function(...names, `return ${expr}`);
  const values = Object.values(o);
  return compiled(...values);
};
// Match `selector` against `pattern` and return the match array.
// Throws an Error naming the offending string when it does not match.
function getMatch(selector, pattern) {
  const result = selector.match(pattern);
  if (result) {
    return result;
  }
  throw new Error(`Could not match pattern: ${selector}`);
}
// Split a parselet key into its parts.
// Key grammar (see keyPattern): name[?][(selector)][~(linkSelector)]
// Returns { name, selector, linkSelector, isOptional, isRemote, isVoid }:
//   isOptional - the key carried a "?" marker
//   isRemote   - a non-empty link selector was given
//   isVoid     - the name equals VOID_NAME
// Throws (via getMatch) when the key does not fit the grammar.
function parseKey(key) {
  const parts = getMatch(key, keyPattern);
  const name = parts[1];
  const optionalMarker = parts[2];
  const selector = parts[3];
  const linkSelector = parts[4];
  const isVoid = name.trim() === VOID_NAME;
  return {
    isOptional: Boolean(optionalMarker),
    isRemote: Boolean(linkSelector),
    isVoid,
    linkSelector,
    name,
    selector,
  };
}
// Split a parselet value string into its parts.
// Value grammar (see selectorPattern): [selector][@attr][ | fn]
// Returns { selector, attr, fn }; any part that is absent is undefined.
function parseSelector(str) {
  const parts = getMatch(str, selectorPattern);
  return {
    attr: parts[2],
    fn: parts[3],
    selector: parts[1],
  };
}
// Handle a parselet leaf (string): extract one value from the DOM.
//
// Options:
//   el         - node to search in (anything with querySelector)
//   transforms - names made available to the `| fn` expression (default: {})
//   isOptional - when truthy, missing data is returned instead of thrown
//   parselet   - value string: [selector][@attr][ | fn]
//
// Returns the attribute value (when @attr is given) or the trimmed innerText
// of the selected node, passed through `fn` when both data and fn are present.
// Empty or missing data is returned as-is for optional parselets.
//
// Throws an Error carrying { selector, attr, fn, data } when no data is found
// and the parselet is not optional.
function parseString({ el, transforms = {}, isOptional, parselet }) {
  const { selector, attr, fn } = parseSelector(parselet);

  // A missing, blank or identity (".") selector refers to the scoped node
  // itself; passing any of these to querySelector would throw or match nothing.
  const trimmedSelector = (selector || '').trim();
  const isIdentity = trimmedSelector === '' || trimmedSelector === IDENTITY_SELECTOR;

  // A Document has neither attributes nor innerText, so use its root element.
  const item = isIdentity ? (el.documentElement || el) : el.querySelector(selector);

  let data = '';
  if (item) {
    data = attr ? item.getAttribute(attr) : item.innerText.trim();
  }

  if (!data && !isOptional) {
    // A real Error gives a stack trace; the original fields stay attached
    // as own properties for callers that inspect them.
    throw Object.assign(
      new Error(`Could not extract required data for parselet: ${parselet}`),
      { selector, attr, fn, data },
    );
  }

  // NOTE: `fn` is compiled by evalExpr, so parselets must come from a trusted source.
  return data && fn ? evalExpr(fn, transforms)(data) : data;
}
// Dispatch on the parselet's type: strings are leaves handled by parseString,
// everything else is treated as an object parselet and goes to parseObject.
const parseItem = (opts) => {
  if (typeof opts.parselet === 'string') {
    return parseString(opts);
  }
  return parseObject(opts);
};
// Handle an object parselet: parse every key/value pair and assemble the
// results into a plain object.
//
// For each key (see parseKey):
//   - a void key ("--") with an object value is parsed against the node
//     selected by the key's selector, and its entries are merged into this
//     level instead of being nested under a name;
//   - an array value is parsed as a list over the key's selector (parseList);
//   - anything else is parsed as a single item (parseItem). Note that the
//     key's selector is not applied in this case.
// A key marked optional makes everything beneath it optional as well.
const parseObject = (opts) => {
  const pairs = [];
  for (const [key, value] of Object.entries(opts.parselet)) {
    const { name, selector: scope, isOptional, isVoid } = parseKey(key);
    const childOpts = Object.assign({}, opts, {
      parselet: value,
      isOptional: isOptional || opts.isOptional,
    });

    if (isVoid && value instanceof Object) {
      const scopedOpts = Object.assign({}, childOpts, {
        el: childOpts.el.querySelector(scope),
      });
      for (const pair of Object.entries(parseObject(scopedOpts))) {
        pairs.push(pair);
      }
      continue;
    }

    let data;
    if (value instanceof Array) {
      data = parseList(scope, childOpts);
    } else {
      data = parseItem(childOpts);
    }
    pairs.push([name, data]);
  }
  return Object.fromEntries(pairs);
};
// Handle a list parselet: select every node matching `sel` under opts.el and
// parse each one with the list's first (template) parselet.
// Returns an array with one parsed result per matched node.
function parseList(sel, opts) {
  const template = opts.parselet[0];
  const nodes = Array.from(opts.el.querySelectorAll(sel));
  const results = [];
  for (const node of nodes) {
    // Each matched node becomes the search root for its own item.
    results.push(parseItem(Object.assign({}, opts, { parselet: template, el: node })));
  }
  return results;
}
// Entry point: scrape the DOM into a plain object described by `parselet`.
//
//   parselet - object parselet, e.g. {title: 'title'}
//   opts     - optional settings forwarded to the parsers:
//                el         - root node to scrape (default: the global `document`)
//                transforms - names made available to `| fn` expressions
//
// Returns the object built by parseObject. The `parselet` argument always
// wins over any `parselet` property present in `opts`.
const parsley = (parselet, opts = {}) => {
  // Honour a caller-supplied root (a sub-tree or a separately parsed
  // document) instead of always overriding it with the page's document.
  const el = opts.el || document;
  return parseObject(Object.assign({}, opts, { parselet, el }));
};
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment