Create a gist now

Instantly share code, notes, and snippets.

What would you like to do?
BeautifulSoup4 TreeNode traverse and fix NavigableString
from bs4 import BeautifulSoup as bs
from bs4.element import Tag, NavigableString, Comment
# Reference: https://www.crummy.com/software/BeautifulSoup/bs4/doc/#navigablestring
# Text values that traverse() leaves untouched: the empty string, a single
# space, and a single newline (the whitespace-only nodes found between tags).
ignore_list = ["", " ", "\n"]
def traverse(node):
    """Walk a BeautifulSoup tree and append " wawan!" to each text node.

    Container nodes (``Tag`` and the ``BeautifulSoup`` document object) are
    descended into through ``into_child``. Text nodes whose value is not
    listed in ``ignore_list`` are replaced in place with the same text plus
    the suffix. Comments and any other node type are left untouched.

    Args:
        node: The element to process; normally the parsed document or one
            of its descendants.

    Returns:
        None. The tree is modified in place.
    """
    if isinstance(node, (Tag, bs)):
        into_child(node)
    elif isinstance(node, Comment):
        # Comment is a NavigableString subclass, so it has to be tested
        # before the generic text branch; otherwise this case is never
        # reached and comments get rewritten like ordinary text.
        pass
    elif isinstance(node, NavigableString):
        # ``.string`` on a NavigableString yields the node's own text.
        text = node.string
        if text not in ignore_list:
            node.replace_with(text + " wawan!")
def into_child(node):
    """Run ``traverse`` on every direct child of ``node``.

    Args:
        node: An element exposing a ``children`` iterable.

    Returns:
        None.
    """
    # Iterate over a snapshot: traverse() may swap a child for a new node
    # via replace_with(), which mutates the parent's child list while the
    # loop is still running.
    for child in list(node.children):
        traverse(child)
# Sample document used to demonstrate the traversal.
my_html = """<html>
<head><title>Sample Title</title></head>
<body>
<h1>Test h1</h1>
<p>My favorite animal is dog.</p>
<p>Wanwan love.</p>
<p>That is all.</p>
</body></html>
"""

# Parse the sample, rewrite its text nodes in place, then print the
# modified document.
soup = bs(my_html, features="html.parser")
traverse(soup)
print(soup)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment