// This counts up all the text (innerText characters) at each level of the DOM tree.
// Procedure: walk the whole page, starting at <body>.
// The returned tree is discarded here; each visited element also receives
// its label in its `title` attribute.
countWords(document.body);
/**
 * Count the text (in characters of trimmed innerText) inside each element.
 * Parents include all the text of their children.
 * Additionally, notes how much of the text comes "before" this node in
 * depth-first DOM order. Only text inside counted child elements contributes
 * to "before"; text sitting directly in a parent between its children does not.
 *
 * The purpose of this was to find which subtrees of the DOM have most of the text.
 *
 * Side effect: every visited element gets its label written to its `title`
 * attribute.
 *
 * The output looks like (closing brackets omitted; texts shorter than 42
 * characters are shown quoted in place of the count):
 *
 *   "div: 8264 (22.55%) inside. 433 (1.19%) before)": [
 *     "p: 292 (0.8%) inside. 433 (1.19%) before)": [
 *       "span: 72 (0.2%) inside. 433 (1.19%) before)": []
 *       "span: 119 (0.33%) inside. 505 (1.38%) before)": []
 *       "a: 21 (0.06%) inside. 624 (1.71%) before)": [
 *         "span: 21 (0.06%) inside. 624 (1.71%) before)": []
 *       "span: 77 (0.22%) inside. 645 (1.76%) before)": []
 *
 * @param {Element} el - Root of the subtree to measure.
 * @param {number} [max] - Character total that percentages are relative to.
 *   Defaults to the (untrimmed) innerText length of `el`.
 * @param {number} [charsSeen=0] - Characters already counted before `el`.
 * @returns {Object|undefined} `{ [label]: childStats[] }`, or undefined when
 *   `shouldStop(el)` says the element is to be skipped.
 */
function countWords(el, max, charsSeen = 0) {
  // Guard first: the default for `max` reads el.innerText, which would throw
  // for a null element if it were evaluated in the parameter list.
  if (shouldStop(el)) {
    return undefined;
  }

  const total = max === undefined ? el.innerText.length : max;
  // Avoid NaN/Infinity in the label when the caller passes a zero total.
  const share = (count) => toPercent(total > 0 ? count / total : 0);

  const innerText = cleanText(el);
  const len = innerText.length;

  // If there aren't many letters, show them; otherwise show a count
  const desc = (len > 0 && len < 42)
    ? `'${innerText}'`
    : len;

  // What % of the total text does this node have under it / before it?
  const label = `${nameOf(el)}: ${desc} (${share(len)}%) inside. ${charsSeen} (${share(charsSeen)}%) before)`;
  el.setAttribute('title', label);

  const kidStats = [];
  let seen = charsSeen; // running "before" offset for the next child
  for (const kid of Array.from(el.children)) {
    const stats = countWords(kid, total, seen); // Recurse
    if (!stats) { continue; } // skipped children contribute nothing
    kidStats.push(stats);
    seen += cleanText(kid).length;
  }

  return { [label]: kidStats };
}
/**
 * Short display name for an element: its tag name in lower case.
 *
 * @param {Element} el - Element to name; must have a string `tagName`.
 * @returns {string} Lower-cased tag name, e.g. 'div'.
 */
function nameOf(el) {
  return el.tagName.toLowerCase();
}
/**
 * Decide whether an element (and therefore its subtree) is to be skipped.
 *
 * An element is skipped when it is null/undefined, when it is one of the
 * non-content tags, or when it has no text once surrounding whitespace is
 * removed (counting elsewhere in this file works on trimmed text, so
 * whitespace-only elements would only show up as zero-length nodes).
 *
 * @param {Element|null|undefined} el - Element to test.
 * @returns {boolean} true when the element is to be skipped.
 */
function shouldStop(el) {
  if (el == null) { return true; }

  // Is one of the non-content tags
  const tagsWithNoContent = ['SCRIPT', 'IMG']; // TODO: save image paths
  if (tagsWithNoContent.includes(el.tagName)) { return true; }

  // Contains no text (missing, empty, or whitespace only)
  const text = el.innerText;
  return text == null || String(text).trim().length === 0;
}
/**
 * Text of an element with leading and trailing whitespace removed.
 *
 * @param {Element|null|undefined} el - Element to read; null/undefined or an
 *   element without innerText yields ''.
 * @returns {string} Trimmed innerText, or '' when there is none.
 */
function cleanText(el = {}) {
  // The parameter default only covers undefined; null needs its own guard.
  if (el === null) { return ''; }
  return (el.innerText || '').trim();
}
/**
 * Show a fraction as a percentage, rounded UP to two decimal places
 * (so any non-zero share is displayed as at least 0.01).
 *
 * @param {number} decimal - Fraction to convert, e.g. 0.25 for 25%.
 * @returns {number} Percentage with at most two decimal places.
 */
function toPercent(decimal) {
  // Strip floating-point noise before taking the ceiling: 0.07 * 10000 is
  // 700.0000000000001, which would otherwise be bumped up to 7.01.
  const scaled = Number((decimal * 10000).toPrecision(12));
  return Math.ceil(scaled) / 100;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment