-
-
Save shuiRong/fed71e6ee5439aec16ba17634ab6b4e4 to your computer and use it in GitHub Desktop.
Extract all meaningful text content from page/html/document
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Extracts the text a user can actually see on a page and returns it as a
 * single string with every whitespace character removed.
 *
 * `innerText` / `textContent` of the whole document are deliberately not used:
 * some content lives inside `<script>`, `<noscript>` and `<style>` elements,
 * which the user never sees and which must not count as meaningful text.
 *
 * A text node is kept only when:
 *  1. it has a parent element and that parent is not script/style/noscript;
 *  2. its content is not empty once whitespace has been stripped.
 *
 * Note that whitespace *between* words is stripped as well, so
 * "Hello world" contributes "Helloworld".
 *
 * @param window - A window-like object exposing `document.body`,
 *   `document.createTreeWalker` and `NodeFilter.SHOW_TEXT`. Typed as `any` on
 *   purpose so that both browser windows and DOM emulations can be passed.
 * @returns The concatenated visible text, or `''` when `window`, its document
 *   or its body is missing, or when walking the DOM throws.
 */
export const extractTextFromHTML = (window?: any): string => {
  if (!window) return '';

  // Parents whose text content is never rendered to the user.
  const hiddenParents = ['script', 'style', 'noscript'];
  const whitespace = /\s/g;

  try {
    // Without a body there is nothing to walk; bail out quietly instead of
    // letting createTreeWalker throw and logging a spurious error.
    const root = window.document?.body;
    if (!root) return '';

    const walker = window.document.createTreeWalker(
      root,
      window.NodeFilter.SHOW_TEXT
    );

    // Single pass: filter and strip each text node as it is visited.
    const pieces: string[] = [];
    for (let node = walker.nextNode(); node; node = walker.nextNode()) {
      const parent = node.parentElement;
      // Detached text nodes and invisible containers are skipped.
      if (!parent || hiddenParents.includes(parent.tagName.toLowerCase())) {
        continue;
      }
      // Null/empty/whitespace-only content collapses to a falsy value here.
      const stripped = node.textContent?.replace(whitespace, '');
      if (stripped) {
        pieces.push(stripped);
      }
    }
    return pieces.join('');
  } catch (error) {
    console.error(`extractTextFromHTML error: ${error}`);
    return '';
  }
};
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment