// Last active November 6, 2022 09:18
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
/*
Pull structured content out of the DOM.

- Hero images
- Title
- Summary
- Site name
- Article content

Things we can use:

- `<title>`
- meta description
- Twitter card meta tags
- Facebook Open Graph tags
- Win8 Tile meta tags
- meta description
- Search snippet markup (for example schema.org structured data)
- microformats
*/
// Transducers
// -----------------------------------------------------------------------------
// Wrap `value` in a box that signals "stop reducing now". The box is tagged
// with the `reduced` function itself, so only boxes made here carry the tag.
const reduced = (value) => {
  return {value, reduced};
};
// Is `x` a box tagged by `reduced`? Falsy inputs are handed back unchanged
// (still falsy); anything else yields a strict boolean.
const isReduced = (x) => (x ? x.reduced === reduced : x);
// Pull the `value` out of a box. No tag check happens here; callers are
// expected to have tested the box with `isReduced` first.
const unbox = ({value}) => value;
// A reduce that works with any indexed collection (anything with `length` and
// numeric keys, e.g. arrays or NodeLists) and supports early exit.
//
// `step(result, item)` is called for each item in order. If `step` returns a
// value boxed with `reduced`, iteration stops immediately and the unboxed
// value is returned. Otherwise the final accumulated result is returned.
// An empty collection returns `result` untouched.
const reduce = (indexed, step, result) => {
  let accumulated = result;
  for (let i = 0; i < indexed.length; i++) {
    accumulated = step(accumulated, indexed[i]);
    // If value was reduced early, take the fast path and end reduction.
    if (isReduced(accumulated)) return unbox(accumulated);
  }
  return accumulated;
};
// Reduce `indexed` with `step`, after first passing `step` through the
// transducer transform `xf`.
// Because `xf` rewrites the step function itself, items are transformed as
// they stream through the reduction -- no intermediate array is allocated.
const transduce = (xf, indexed, step, result) => {
  const transformedStep = xf(step);
  return reduce(indexed, transformedStep, result);
};
// Push `x` onto the end of `array` (mutating it) unless `x` is `null` or
// `undefined`, then return the same array.
// Returning the array makes this usable directly as a `reduce` step function.
const append = (array, x) => {
  // Loose `!= null` deliberately skips both `null` and `undefined`, while
  // keeping other falsy values such as `0` and `''`.
  if (x != null) array.push(x);
  return array;
};
// Run every item of `indexed` through transducer `xf` and gather whatever
// survives into a brand-new array.
const into = (xf, indexed) => {
  const collected = [];
  return transduce(xf, indexed, append, collected);
};
// Mapping transducer.
// Given `a2b`, returns a transform that wraps a `step` function so each
// `input` is converted with `a2b` before `step` sees it.
const map = (a2b) => (step) => {
  return (result, input) => {
    const output = a2b(input);
    return step(result, output);
  };
};
// Filtering transducer.
// Wraps a `step` function so that inputs failing `predicate` are skipped and
// the running `result` is passed along untouched.
const filter = (predicate) => (step) => (result, input) => {
  if (predicate(input)) {
    return step(result, input);
  }
  return result;
};
// The inverse of `filter`: inputs that PASS `predicate` are skipped, inputs
// that fail it are forwarded to `step`.
const reject = (predicate) => (step) => (result, input) => {
  if (predicate(input)) {
    return result;
  }
  return step(result, input);
};
// Transducer function to take the first `n` values, then stop the reduction.
// Returns an `xform` function.
//
// The countdown is created each time the transform is applied to a `step`
// function, so one `take(n)` transform can safely be reused across several
// reductions. (Keeping the counter on `n` itself would leave the transform
// permanently exhausted after its first use.)
const take = (n) => (step) => {
  let remaining = n;
  return (result, input) => {
    if (remaining > 0) {
      remaining -= 1;
      return step(result, input);
    }
    // Once the counter reaches 0, box the result to stop the reduction.
    return reduced(result);
  };
};
// Compose two functions right-to-left: the result applies `y` first, then
// feeds its output to `x`.
const comp2 = (x, y) => (v) => {
  const inner = y(v);
  return x(inner);
};
// Compose any number of functions right-to-left by folding `comp2` over them,
// starting from `f`. With a single argument, `f` itself is returned.
const comp = (f, ...fns) => {
  const composeNext = (composed, next) => comp2(composed, next);
  return reduce(fns, composeNext, f);
};
// Utils
// -----------------------------------------------------------------------------
// Read the `textContent` property of an element.
const getText = ({textContent}) => textContent;
// Read the `content` property of a meta element.
const getContent = ({content}) => content;
// Read the `src` property of an image element.
const getSrc = ({src}) => src;
// Build a chain of fallbacks. The returned function calls each function in
// `fns` in turn with `x`; the first result that is not `null`/`undefined`
// wins and later functions are never called.
// If every function comes back empty, `fallback` is returned instead.
// @TODO determine if I should instead build functions that collect all possible
// matches as an array. In this case, `queries` would return an array of 0 or
// more. I guess that would let me score the list, rather than picking a "best"
// one by order.
const any = (...fns) => (x, fallback) => {
  const tryNext = (soFar, f) => {
    const found = f(x);
    if (found == null) return soFar;
    // Box the hit so the reduction stops here.
    return reduced(found);
  };
  return reduce(fns, tryNext, fallback);
};
// Identity: hand the argument straight back.
const id = (x) => {
  return x;
};
// Create a function that looks up the first match for `selector` inside
// `pageEl` (via `querySelector`) and passes it through `a2b` on the way out.
// When `a2b` is omitted the matched element itself is returned.
// The created function returns `null` when nothing matches, which makes it a
// natural building block for the fallback chains built with `any`, below.
const queries = (selector, a2b) => {
  // Resolve the transform once, rather than reassigning the parameter on
  // every lookup.
  const transform = a2b || id;
  return (pageEl) => {
    const match = pageEl.querySelector(selector);
    return match != null ? transform(match) : null;
  };
};
// Does the element's tag name match `pattern`?
// `pattern` is handed to `String.prototype.search`, so it may be a RegExp or
// a string.
// NOTE(review): the left-hand side of this comparison was missing from the
// source; it has been reconstructed as a search over `el.tagName` -- confirm.
const matchesTag = (el, pattern) => el.tagName.search(pattern) !== -1;
// Does the element's `className` string match `pattern`?
// `pattern` is handed to `String.prototype.search`, so it may be a RegExp or
// a string.
// NOTE(review): the left-hand side of this comparison was missing from the
// source; it has been reconstructed as a search over `el.className` --
// confirm. Assumes `className` is a plain string (HTML elements).
const matchesClass = (el, pattern) => el.className.search(pattern) !== -1;
// Scraping and content scoring helpers
// -----------------------------------------------------------------------------
// Score the content-y-ness of a string. Note that this is an imperfect score
// and you'll be better off if you combine it with other heuristics like
// element classname, etc.
//
// Scoring:
// - text shorter than 25 characters scores 0;
// - otherwise start at 1;
// - add the number of comma-separated segments (i.e. commas + 1);
// - add 1 point per 100 characters, capped at 3.
const scoreContentyness = (text) => {
  // If paragraph is less than 25 characters, don't count it.
  if (text.length < 25) return 0;

  const basePoints = 1;
  const commaPoints = text.split(',').length;
  const lengthPoints = Math.min(Math.floor(text.length / 100), 3);

  return basePoints + commaPoints + lengthPoints;
};
// Score an element to find out how "content-y" it is, by scoring its text
// content with `scoreContentyness`.
// Maybe eventually this should also take link density into account.
const scoreElContentyness = (el) => scoreContentyness(getText(el));
// Does the element's content score (see `scoreElContentyness`) exceed 3?
const isSufficientlyContenty = (el) => scoreElContentyness(el) > 3;
// Case-insensitive pattern of class-name fragments that suggest an element
// is page chrome (navigation, sharing widgets, comments, ads, ...) rather
// than article content. Each alternative is listed once.
const UNLIKELY_CONTENT_CLASSNAMES = /date|social|community|remark|discuss|disqus|e[\-]?mail|rss|print|extra|share|login|sign|reply|combx|comment|com-|contact|header|menu|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget|ad-break|agegate|pagination|pager|popup|tweet|twitter/i;
// Does the element's class name look like non-content page chrome?
const isUnlikelyCandidate = (el) => {
  return matchesClass(el, UNLIKELY_CONTENT_CLASSNAMES);
};
// Count whitespace-separated words in `text`.
// Leading/trailing whitespace is ignored and runs of whitespace count as one
// separator, so empty or blank text has 0 words.
const countWords = (text) => {
  const trimmed = text.trim();
  return trimmed === '' ? 0 : trimmed.split(/\s+/).length;
};
// Is the text longer than 25 characters, i.e. long enough to be content?
const isSufficientlyLong = (text) => {
  const {length} = text;
  return length > 25;
};
// Is the element's text content long enough to be content?
const isTextSufficientlyLong = (el) => isSufficientlyLong(getText(el));
// Length of the element's text content.
const getElTextLength = (el) => getText(el).length;
// Add two values with `+`. Usable as a `reduce` step function.
const sum = (a, b) => {
  return a + b;
};
// Calculate the density of links in content: the combined text length of all
// `<a>` descendants divided by the element's total text length.
// An element with no text at all has a density of 0 (rather than the NaN
// that 0 / 0 would produce).
const calcLinkDensity = (el) => {
  const textLength = getElTextLength(el);
  if (textLength === 0) return 0;

  const linkLength =
    transduce(map(getElTextLength), el.querySelectorAll('a'), sum, 0);
  return linkLength / textLength;
};
// Is more than half of this element's text made up of link text?
const isHighLinkDensity = (el) => {
  const density = calcLinkDensity(el);
  return density > 0.5;
};
// Extract a clean title from text that has been littered with separator
// garbage (e.g. "Article title | Site name").
//
// When the text contains a ` | `, ` - ` or ` : ` separator:
// 1. keep what comes before the last separator;
// 2. if that leaves fewer than 3 words, keep what follows the first `|`/`-`
//    instead;
// 3. if the candidate still has fewer than 5 words, give up and use the
//    original text.
// The result is always trimmed.
const cleanTitle = (text) => {
  let title = text;

  if (text.match(/\s[\|\-:]\s/)) {
    title = text.replace(/(.*)[\|\-:] .*/gi, '$1');

    if (countWords(title) < 3) {
      title = text.replace(/[^\|\-]*[\|\-](.*)/gi, '$1');
    }

    // Fall back to the original text if the word count is too short.
    if (countWords(title) < 5) {
      title = text;
    }
  }

  // Trim spaces.
  return title.trim();
};
// Content scrapers
// -----------------------------------------------------------------------------
// Scrape a microformats title (`.entry-title` or `.p-name`).
// Returns the text of the title element, or `null` unless exactly one such
// element exists.
const scrapeMicroformatsTitle = (pageEl) => {
  const titles = pageEl.querySelectorAll('.entry-title, .p-name');
  // If we found more than one .entry-title, return null. We'll assume that this
  // is a blog listing page.
  return (titles.length === 1) ? getText(titles[0]) : null;
};
// Find a good title within page.
// Usage: `scrapeTitle(htmlEl, 'Untitled')`.
//
// Sources are tried in order; the first one that yields a value wins:
// social meta tags, microformats, the first heading, then a cleaned-up
// `<title>`.
// NOTE(review): the `queries(..., getContent)` wrapper around the meta
// selector and the `scrapeMicroformatsTitle` entry were missing from the
// source and have been reconstructed -- confirm the intended order.
const scrapeTitle = any(
  queries(
    'meta[property="og:title"], meta[name="twitter:title"]',
    getContent
  ),
  // @TODO look at `[itemprop=headline]`
  scrapeMicroformatsTitle,
  queries('h1, h2, h3', getText),
  queries('title', comp(cleanTitle, getText))
);
// @TODO look at .entry-summary
// Scrape a microformats summary (`.entry-summary` or `.p-summary`).
// Returns the text of the summary element, or `null` unless exactly one such
// element exists.
const scrapeMicroformatsDescription = (pageEl) => {
  const summaries = pageEl.querySelectorAll('.entry-summary, .p-summary');
  // If we found more than one .entry-summary, return null. We'll assume that
  // this is a blog listing page.
  return (summaries.length === 1) ? getText(summaries[0]) : null;
};
// Reduce step that builds a description out of paragraph texts.
// Appends `s` to `summary`, separated by a single space; an empty `summary`
// just becomes `s`, so the result never starts with a stray space.
// Once `summary` is longer than 250 characters it is boxed with `reduced` to
// stop the reduction.
const _concatToDescription = (summary, s) => {
  if (summary.length > 250) return reduced(summary);
  return summary === '' ? s : summary + ' ' + s;
};
// Build a description from the page's own paragraphs.
// Returns `null` when no paragraph looks like content; otherwise the
// paragraph texts joined by `_concatToDescription`.
// NOTE(review): the three transducers passed to `comp` were missing from the
// source and have been reconstructed from the surrounding comments and the
// helpers defined in this file -- confirm.
const scrapeDescriptionFromContent = (pageEl) => {
  // Query for all paragraphs on the page.
  // Trim down paragraphs to the ones we deem likely to be content.
  // Then map to `textContent`.
  const texts = into(comp(
    // First, reject things that we know to be unlikely.
    reject(isUnlikelyCandidate),
    // Text content is long enough to be content.
    filter(isTextSufficientlyLong),
    map(getText)
  ), pageEl.querySelectorAll('p'));

  // Return early if we haven't found anything good.
  if (texts.length === 0) return null;

  // Concat paragraph text together until we get more than 250 letters.
  return reduce(texts, _concatToDescription, '');
};
// Find a good description for the page.
// Usage: `scrapeDescription(htmlEl, '')`.
//
// Sources are tried in order; the first one that yields a value wins.
// NOTE(review): the `queries(..., getContent)` wrapper around the social meta
// selector and the last two entries were missing from the source and have
// been reconstructed from the helpers defined in this file -- confirm.
const scrapeDescription = any(
  // Prefer social media descriptions to `meta[name=description]` because they
  // are curated for readers, not search bots.
  queries(
    'meta[property="og:description"], meta[name="twitter:description"]',
    getContent
  ),
  // @TODO process description to remove garbage from descriptions.
  queries('meta[name=description]', getContent),
  // @TODO look at `[itemprop=description]`
  scrapeMicroformatsDescription,
  scrapeDescriptionFromContent
);
// Find the site name from meta tags.
// Usage: `scrapeSiteName(htmlEl, fallback)`.
// You probably want to use the base URL as fallback.
const scrapeSiteName = any(
  // Prefer the standard meta tag.
  queries('meta[name="application-name"]', getContent),
  queries('meta[property="og:site_name"]', getContent),
  // Note that this one is an `@name`.
  queries('meta[name="twitter:site"]', getContent)
);
// Is the image's natural size at least `w` wide and `h` tall?
// Both bounds are inclusive, so an image of exactly `w` x `h` qualifies.
const isImgSizeAtLeast = (imgEl, w, h) =>
  imgEl.naturalWidth >= w && imgEl.naturalHeight >= h;
// Is the image big enough to be a hero image (checked against 600 x 300)?
const isImgHeroSize = (imgEl) => {
  return isImgSizeAtLeast(imgEl, 600, 300);
};
// Collect Twitter image urls from meta tags.
// Returns an array of 1 or more Twitter img urls, or null.
// NOTE(review): the selector list was missing from the source. It has been
// reconstructed from the Twitter Cards image meta names (single image, the
// older `:src` form, and the numbered gallery images) -- confirm.
const queryTwitterImgUrls = (pageEl) => {
  const selector = [
    'meta[name="twitter:image"]',
    'meta[name="twitter:image:src"]',
    'meta[name="twitter:image0"]',
    'meta[name="twitter:image1"]',
    'meta[name="twitter:image2"]',
    'meta[name="twitter:image3"]',
  ].join(', ');
  const metas = pageEl.querySelectorAll(selector);
  // Returning different types is rather bad form, but `null` is what lets a
  // fallback chain built with `any` move on to the next source.
  return metas.length > 0 ? into(map(getContent), metas) : null;
};
// Collect the Facebook Open Graph image url from meta tags.
// Returns a one-element array holding the `content` of the first matching
// meta element, or `null` when there is none.
// These 2 meta tags are equivalent. If the first doesn't exist, look for
// the second.
// NOTE(review): the selector list was missing from the source; reconstructed
// as `og:image` / `og:image:url` -- confirm.
const queryOpenGraphImgUrls = queries(
  'meta[property="og:image"], meta[property="og:image:url"]',
  (meta) => [getContent(meta)]
);
// Dig through the page's own `<img>` elements for hero-sized images.
// Returns an array with the `src` of up to 4 images that pass
// `isImgHeroSize` (possibly empty).
// NOTE(review): the collection argument was missing from the source;
// reconstructed as every `img` under `pageEl` -- confirm.
const findHeroImgUrls = (pageEl) => into(
  comp(filter(isImgHeroSize), take(4), map(getSrc)),
  pageEl.querySelectorAll('img')
);
// Scrape up to 4 featured images.
// We favor meta tags like `twitter:image` and `og:image` because those are
// hand-curated. If we don't find them, we'll dig through the content
// ourselves.
// Returns an array of image urls.
// @TODO it might be better just to grab everything, then de-dupe URLs.
// NOTE(review): the argument list was missing from the source; the order
// below is reconstructed from the comments inside the call -- confirm.
const scrapeHeroImgUrls = any(
  // Note that Facebook OpenGraph image queries are kept separate from Twitter
  // image queries. This is to prevent duplicates when sites include both.
  // If we find Twitter first, we'll return it and never look for Facebook.
  // We'll favor Twitter image URLs, since there can be more than one.
  queryTwitterImgUrls,
  queryOpenGraphImgUrls,
  findHeroImgUrls
);
// With more than 3 image urls (i.e. 4 or more) we show 4 images in
// combination. Otherwise, only the first featured image is used.
const isImgCombo = (imgUrls) => {
  const {length} = imgUrls;
  return length > 3;
};
// @TODO need some methods for scaling and cropping images.
