Skip to content

Instantly share code, notes, and snippets.

@shariq
Last active August 18, 2020 00:44
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save shariq/f74307c76b4b7bcb6ce6a58f2e09e861 to your computer and use it in GitHub Desktop.
Save shariq/f74307c76b4b7bcb6ce6a58f2e09e861 to your computer and use it in GitHub Desktop.
getArticleText
/**
 * Heuristically extracts the main body text of the current HTML page.
 *
 * Every text node under `document` is inspected. Nodes whose parent is a
 * <style> or <script> element, or whose parent's computed `visibility` is not
 * `visible`, are ignored. The remaining text is grouped by the parent's
 * computed font-size, font-family and font-weight, and the group with the most
 * characters is returned (the first such group in document order wins ties).
 *
 * Relies on the browser globals `document` and `window.getComputedStyle`.
 *
 * @returns {string} The text of the largest style group: the trimmed contents
 *   of its text nodes joined by single spaces, with no leading or trailing
 *   whitespace. Returns '' when the page has no qualifying text.
 */
function getArticleText() {
  const TEXT_NODE = 3; // numeric value of Node.TEXT_NODE
  const SKIPPED_TAGS = new Set(['STYLE', 'SCRIPT']);

  // Collects all text nodes below `root` in document order. A single shared
  // array is filled in place so nested levels do not copy their results.
  function textNodesUnder(root) {
    const found = [];
    const visit = (parent) => {
      for (let child = parent.firstChild; child; child = child.nextSibling) {
        if (child.nodeType === TEXT_NODE) {
          found.push(child);
        } else {
          visit(child);
        }
      }
    };
    visit(root);
    return found;
  }

  // Style key -> trimmed, non-empty text pieces in document order.
  const piecesByStyle = new Map();
  for (const textNode of textNodesUnder(document)) {
    const parent = textNode.parentElement;
    if (!parent || SKIPPED_TAGS.has(parent.tagName)) {
      continue;
    }

    // Whitespace-only nodes contribute nothing; skipping them here keeps stray
    // separators out of the result and avoids a needless style lookup.
    const text = textNode.textContent.trim();
    if (text === '') {
      continue;
    }

    // One computed-style lookup per node serves both the visibility check and
    // the grouping key.
    const computed = window.getComputedStyle(parent, null);
    if (computed.getPropertyValue('visibility') !== 'visible') {
      continue;
    }

    const styleKey = [
      computed.getPropertyValue('font-size'),
      computed.getPropertyValue('font-family'),
      computed.getPropertyValue('font-weight'),
    ].join('|');

    const pieces = piecesByStyle.get(styleKey);
    if (pieces) {
      pieces.push(text);
    } else {
      piecesByStyle.set(styleKey, [text]);
    }
  }

  // Map iteration follows insertion order, so with the strict comparison the
  // earliest style group wins when lengths are equal.
  let bestText = '';
  for (const pieces of piecesByStyle.values()) {
    const joined = pieces.join(' ');
    if (joined.length > bestText.length) {
      bestText = joined;
    }
  }
  return bestText;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment