Last active
October 11, 2020 16:49
-
-
Save KiaraGrouwstra/0c46da6ad58c80c97e1ba7c3da2a73e9 to your computer and use it in GitHub Desktop.
browser-dom-scraper
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// scrape DOM elements into plain JS objects!
// browser-friendly no-dep version of https://github.com/dijs/parsz
// usage: parsley({title: 'title'})
// Grammar of a parselet KEY: name, optional "?" flag, optional "(selector)",
// optional "~(linkSelector)".
// Capture groups: 1 = name, 2 = "?", 3 = selector, 4 = link selector.
const keyPattern = /^([\w-]+)(\?)?\(?([^)~]*)\)?~?\(?([^)]*)\)?$/;
// Grammar of a parselet VALUE: optional CSS selector, optional "@attr",
// optional "| transform".
// Capture groups: 1 = selector, 2 = attribute name, 3 = transform expression.
// The literal "-" is listed first in the class so it can never be read as a range.
const selectorPattern = /^([-.\s\w[\]=>#:()]+)?@?([\w-]+)?\s*\|?\s*(.*)?$/;
// Selector string reserved for "the current element"
// (presumed from the name — confirm where it is consumed).
const IDENTITY_SELECTOR = ".";
// Key name that parseKey reports as a "void" key.
const VOID_NAME = "--";
/**
 * Evaluate a JavaScript expression with the entries of `o` exposed as local
 * variables (technique from http://2ality.com/2012/04/eval-variables.html).
 *
 * SECURITY: `expr` is compiled with the Function constructor, i.e. it is
 * executed as code. Only pass expressions from trusted parselets.
 *
 * @param {string} expr - expression source; it becomes the body `return <expr>`.
 * @param {Object} [o] - names to bring into scope; a missing value means "none".
 * @returns {*} the value the expression evaluates to.
 */
const evalExpr = (expr, o) => {
  // Tolerate a missing scope object instead of crashing in Object.keys.
  const scope = o || {};
  const compiled = new Function(...Object.keys(scope), `return ${expr}`);
  return compiled(...Object.values(scope));
};
/**
 * Match `selector` against `pattern` and insist on a result.
 *
 * @param {string} selector - text to match.
 * @param {RegExp} pattern - pattern to apply.
 * @returns {RegExpMatchArray} the match array (never null).
 * @throws {TypeError} when `selector` is not a string.
 * @throws {Error} when the pattern does not match.
 */
function getMatch(selector, pattern) {
  if (typeof selector !== 'string') {
    throw new TypeError(`Expected a string to match, got: ${typeof selector}`);
  }
  const matched = selector.match(pattern);
  if (!matched) {
    throw new Error(`Could not match pattern: ${selector}`);
  }
  return matched;
}
/**
 * Parse a parselet key such as `title`, `title?`, `items(li)` or `--(.box)`.
 *
 * @param {string} key - key text, matched against keyPattern.
 * @returns {{isOptional: boolean, isRemote: boolean, isVoid: boolean,
 *   linkSelector: string, name: string, selector: string}}
 *   `isOptional` is set by a "?" after the name, `isRemote` by a non-empty
 *   link selector, `isVoid` when the name equals VOID_NAME.
 * @throws {Error} when the key does not match keyPattern (raised by getMatch).
 */
function parseKey(key) {
  const [, name, optional, selector, linkSelector] = getMatch(key, keyPattern);
  return {
    isOptional: Boolean(optional),
    isRemote: Boolean(linkSelector),
    // keyPattern only admits [\w-] in the name, so it never needs trimming.
    isVoid: name === VOID_NAME,
    linkSelector,
    name,
    selector,
  };
}
/**
 * Parse a parselet value string of the form `selector@attr | transform`,
 * where every part is optional.
 *
 * @param {string} str - value text, matched against selectorPattern.
 * @returns {{attr: (string|undefined), fn: (string|undefined),
 *   selector: (string|undefined)}} the parts that were present.
 * @throws {Error} when the text does not match selectorPattern (raised by getMatch).
 */
function parseSelector(str) {
  const [, selector, attr, fn] = getMatch(str, selectorPattern);
  // The selector capture also swallows the whitespace before "@" or "|";
  // strip it so "h1 | fn" yields "h1" rather than "h1 ".
  return { attr, fn, selector: selector && selector.trim() };
}
/**
 * Handle a parselet leaf (a string such as `h1`, `a@href` or `. | fn`).
 *
 * @param {Object} opts
 * @param {Element|Document} opts.el - node the selector is resolved against.
 * @param {Object} [opts.transforms] - names made available to the transform expression.
 * @param {boolean} [opts.isOptional] - when truthy, missing data is returned instead of thrown.
 * @param {string} opts.parselet - the leaf text.
 * @returns {*} the attribute value or trimmed text, passed through the
 *   transform when one is given and data was found.
 * @throws {Error} when no data is found and the leaf is not optional; the error
 *   carries `selector`, `attr`, `fn` and `data` properties.
 */
function parseString({ el, transforms, isOptional, parselet }) {
  const { selector, attr, fn } = parseSelector(parselet);
  const query = selector ? selector.trim() : '';
  // "." (or no selector at all, e.g. "@href") addresses the current element;
  // handing either to querySelector would throw or look for an <undefined> tag.
  const item = query === '' || query === IDENTITY_SELECTOR ? el : el.querySelector(query);
  let data = '';
  if (item) {
    if (attr) {
      data = item.getAttribute(attr);
    } else {
      // Nodes without innerText (e.g. the document itself) fall back to textContent.
      const text = typeof item.innerText === 'string' ? item.innerText : (item.textContent || '');
      data = text.trim();
    }
  }
  if (!data && !isOptional) {
    // A real Error keeps the stack trace; the detail fields stay on it for callers.
    throw Object.assign(
      new Error(`No data found for required parselet: ${parselet}`),
      { selector, attr, fn, data },
    );
  }
  // `fn` is evaluated as code by evalExpr — parselets must come from a trusted source.
  return data && fn ? evalExpr(fn, transforms || {})(data) : data;
}
/**
 * Handle a parselet item: a string goes to parseString, anything else to parseObject.
 *
 * @param {Object} opts - parse options; `opts.parselet` selects the handler.
 * @returns {*} whatever the chosen handler returns.
 * @throws {TypeError} when `opts.parselet` is null or undefined.
 */
const parseItem = (opts) => {
  if (opts.parselet === null || opts.parselet === undefined) {
    throw new TypeError('parselet must be a string or an object');
  }
  const handler = typeof opts.parselet === 'string' ? parseString : parseObject;
  return handler(opts);
};
/**
 * Handle a parselet object: parse every key/value pair against `opts.el` and
 * collect the results into a plain object.
 *
 * - An array value is parsed as a list over the key's selector (parseList).
 * - A void key ("--") with an object value is parsed against the element found
 *   by the key's selector, and its entries are merged into the result.
 * - Anything else is parsed by parseItem under the key's name.
 * NOTE(review): for a non-void key with an object value the key's selector is
 * not applied — the nested object is parsed against the same element. Confirm
 * whether that is intended.
 *
 * @param {Object} opts - parse options; reads `parselet`, `el` and `isOptional`.
 * @returns {Object} the scraped data.
 * @throws {Error} when a required void key's selector matches no element.
 */
const parseObject = (opts) => Object.fromEntries(
  Object.entries(opts.parselet).flatMap(([key, map]) => {
    const { name, selector: sel, isOptional, isVoid } = parseKey(key);
    // Optionality is inherited: an optional parent makes its children optional too.
    const opt = Object.assign({}, opts, {
      parselet: map,
      isOptional: isOptional || opts.isOptional,
    });
    if (isVoid && map instanceof Object) {
      // A void key without a selector stays on the current element.
      const scope = sel ? opt.el.querySelector(sel) : opt.el;
      if (!scope) {
        if (opt.isOptional) {
          return [];
        }
        throw Object.assign(
          new Error(`Could not find element for void key: ${key}`),
          { selector: sel },
        );
      }
      return Object.entries(parseObject(Object.assign({}, opt, { el: scope })));
    }
    const data = Array.isArray(map) ? parseList(sel, opt) : parseItem(opt);
    return [[name, data]];
  }),
);
/**
 * Handle a parselet list: parse every node matched by `sel` with the first
 * element of the list parselet.
 *
 * @param {string} sel - selector passed to `querySelectorAll` on `opts.el`.
 * @param {Object} opts - parse options; `opts.parselet` is the array whose
 *   first element describes each item, `opts.el` is the search root.
 * @returns {Array} one parsed result per matched node, in match order.
 */
function parseList(sel, opts) {
  const { el: root } = opts;
  const itemParselet = opts.parselet[0];
  return Array.from(root.querySelectorAll(sel), (node) => parseItem(
    Object.assign({}, opts, { parselet: itemParselet, el: node }),
  ));
}
/**
 * Entry point: scrape a DOM tree into a plain object described by `parselet`.
 *
 * @param {Object} parselet - map of keys to selector strings, nested objects or
 *   single-element arrays.
 * @param {Object} [opts] - extra options handed on to the parsers.
 * @param {Element|Document} [opts.el] - root to scrape; defaults to the global `document`.
 * @param {Object} [opts.transforms] - names usable in "| transform" expressions;
 *   defaults to an empty object.
 * @returns {Object} the scraped data.
 */
const parsley = (parselet, opts = {}) => {
  const options = opts || {};
  // `document` is only touched when no root was supplied.
  const root = options.el || document;
  const transforms = options.transforms || {};
  return parseObject(Object.assign({}, options, { parselet, el: root, transforms }));
};
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment