Skip to content

Instantly share code, notes, and snippets.

@sunfkny
Created January 17, 2024 06:39
Show Gist options
  • Save sunfkny/f43dc3b6cbd22b85aebff026b33ebed9 to your computer and use it in GitHub Desktop.
Save sunfkny/f43dc3b6cbd22b85aebff026b33ebed9 to your computer and use it in GitHub Desktop.
Get HTML textContent with img alt text
import io
from bs4 import BeautifulSoup, PageElement, Tag
def traverse(
    content: io.StringIO,
    node: Tag | PageElement | None,
    alt_replacement: str | None,
) -> None:
    """Write the text found under *node* to *content*, in document order.

    Each ``<img>`` tag contributes ``[<text>]``, where ``<text>`` is
    ``alt_replacement`` when that is a non-empty string and the tag's own
    ``alt`` attribute otherwise; an image with no usable text contributes
    nothing. Any other tag contributes only what its children contribute,
    and every non-tag node contributes its ``.text``.

    The walk keeps its own stack instead of recursing, so the nesting depth
    of the markup is not limited by the interpreter's recursion limit.

    Args:
        content: Buffer that receives the extracted text.
        node: Root of the subtree to walk; ``None`` writes nothing.
        alt_replacement: Text used for every image instead of its ``alt``
            attribute. ``None`` or ``""`` keeps each image's own ``alt``.
    """
    if node is None:
        return

    pending = [node]
    while pending:
        current = pending.pop()

        if not isinstance(current, Tag):
            content.write(current.text)
            continue

        if current.name.lower() == "img":
            alt_text = alt_replacement or current.get("alt", "")
            if alt_text:
                content.write(f"[{alt_text}]")
            continue

        # Push the children reversed so the first child is popped (and
        # therefore written) first, preserving document order.
        pending.extend(reversed(current.contents))
def get_content_with_alt_text(
    html_string: str,
    alt_replacement: str | None = None,
) -> str:
    """Return the text of *html_string* with images shown as bracketed text.

    The markup is parsed with Beautiful Soup's ``html.parser`` backend and
    every top-level node is handed to ``traverse``, which appends its output
    to one shared buffer.

    Args:
        html_string: HTML markup to extract text from.
        alt_replacement: Forwarded unchanged to ``traverse`` for every node.

    Returns:
        Everything ``traverse`` wrote to the buffer, as a single string.

    Example:
        >>> html_string = '<div><p>This is<!-- Comment --><img src="image.jpg" alt="a test image">a test.</p></div>'
        >>> get_content_with_alt_text(html_string=html_string, alt_replacement='IMAGE')
        'This is[IMAGE]a test.'
    """
    buffer = io.StringIO()
    document = BeautifulSoup(html_string, "html.parser")
    for top_level_node in document.contents:
        traverse(
            content=buffer,
            node=top_level_node,
            alt_replacement=alt_replacement,
        )
    return buffer.getvalue()
/**
 * Collect the text beneath `node` into `content`, in document order.
 *
 * Text nodes contribute their `textContent`. An `<img>` element contributes
 * `[text]`, where `text` is `altReplacement` unless that is `null` or
 * `undefined`, in which case the element's `alt` attribute is used; nothing
 * is added when the resulting text is empty. `<script>` elements are skipped
 * together with their children. Nodes that are neither text nor element
 * nodes are ignored.
 *
 * @param {string[]} content Accumulator that receives the text pieces.
 * @param {Element|ChildNode} node Root of the subtree to walk.
 * @param {string|undefined} altReplacement Text used for every image in place of its `alt` attribute.
 * @returns {void}
 */
function traverse(content, node, altReplacement) {
  if (node.nodeType === Node.TEXT_NODE) {
    content.push(node.textContent ?? '');
    return;
  }
  if (node.nodeType !== Node.ELEMENT_NODE) {
    return;
  }

  // The `in` check narrows the `Element|ChildNode` union before `tagName` is read.
  const tag = 'tagName' in node ? node.tagName.toLowerCase() : undefined;

  if (tag === 'img') {
    const label = altReplacement ?? node.getAttribute('alt');
    if (label) {
      content.push(`[${label}]`);
    }
    return;
  }

  if (tag === 'script') {
    // Script source is not wanted in the extracted text.
    return;
  }

  for (const child of node.childNodes) {
    traverse(content, child, altReplacement);
  }
}
/**
 * Extract the text of an HTML string, rendering images as bracketed text.
 *
 * The string is parsed as `text/html` with `DOMParser`, the resulting
 * document's `<body>` is walked with `traverse`, and the collected pieces
 * are joined without a separator.
 *
 * @param {string} htmlString HTML markup to extract text from.
 * @param {string|undefined} altReplacement Forwarded unchanged to `traverse`.
 * @returns {string} The concatenated pieces collected by `traverse`.
 * @example
 * ```
 * const htmlString = '<div><p>This is<!-- Comment --><img src="image.jpg" alt="a test image">a test.</p></div>'
 * console.log(getContentWithAltText(htmlString, 'IMAGE')) // 'This is[IMAGE]a test.'
 * ```
 */
function getContentWithAltText(htmlString, altReplacement) {
  const { body } = new DOMParser().parseFromString(htmlString, 'text/html');
  const pieces = [];
  traverse(pieces, body, altReplacement);
  return pieces.join('');
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment