Skip to content

Instantly share code, notes, and snippets.

@mwunsch
Last active September 13, 2023 09:09
Show Gist options
  • Star 3 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save mwunsch/4693383 to your computer and use it in GitHub Desktop.
Save mwunsch/4693383 to your computer and use it in GitHub Desktop.
Get the text nodes out of a document, ignoring those inside elements whose text value isn't likely to be valuable (like <script> tags) as well as nodes that contain only whitespace.
/**
 * Collect the meaningful text nodes beneath a root node.
 *
 * A text node is left out when its parent is an element whose text is not
 * useful content (script, style, noscript, option, textarea) or when it
 * contains nothing but whitespace.
 *
 * @param {Node} [root] - Subtree to search; defaults to `document.body`.
 * @returns {Array} The accepted text nodes in the order the TreeWalker visits
 *   them. Empty when `document.createTreeWalker` is unavailable or when there
 *   is no root to walk.
 */
function getLegitTextNodes(root) {
  if (!document.createTreeWalker) return [];

  // document.body is null while the <head> is still being parsed, and
  // createTreeWalker throws on a null root, so bail out early instead.
  var scope = root || document.body;
  if (!scope) return [];

  var blacklist = ['SCRIPT', 'STYLE', 'NOSCRIPT', 'OPTION', 'TEXTAREA'];
  var textNodes = [];
  var walker = document.createTreeWalker(
    scope,
    NodeFilter.SHOW_TEXT,
    function excludeBlacklistedNodes(node) {
      // parentNode instead of parentElement: it is never undefined on text
      // nodes and is also set when the parent is not an element.
      var parent = node.parentNode;
      if (parent && blacklist.indexOf(String(parent.nodeName).toUpperCase()) >= 0) {
        return NodeFilter.FILTER_REJECT;
      }
      // \s covers the same characters String.prototype.trim strips, without
      // depending on trim being available.
      if (!/\S/.test(node.nodeValue)) return NodeFilter.FILTER_SKIP;
      return NodeFilter.FILTER_ACCEPT;
    },
    // Legacy entityReferenceExpansion argument: ignored by current browsers,
    // kept because older engines require four arguments.
    false
  );

  while (walker.nextNode()) textNodes.push(walker.currentNode);
  return textNodes;
}
// usage:
// getLegitTextNodes().forEach(function (node, i) { console.log(node.nodeValue) })
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment