Skip to content

Instantly share code, notes, and snippets.

@uni8inu
Last active December 27, 2016 11:02
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save uni8inu/67fd5c7360b818c97a01020d6f52d427 to your computer and use it in GitHub Desktop.
Save uni8inu/67fd5c7360b818c97a01020d6f52d427 to your computer and use it in GitHub Desktop.
Traverse a BeautifulSoup4 tree and modify its NavigableString nodes
from bs4 import BeautifulSoup as bs
from bs4.element import Tag, NavigableString, Comment
# Text values that traverse() leaves unmodified: the empty string, a single
# space, and a single newline (e.g. the layout whitespace between tags).
# NavigableString reference:
# https://www.crummy.com/software/BeautifulSoup/bs4/doc/#navigablestring
ignore_list = ["", " ", "\n"]
def traverse(node):
    """Walk a BeautifulSoup tree and append " wawan!" to its text nodes.

    Container nodes (the BeautifulSoup root object and Tag elements) are
    descended into via into_child(). Text nodes whose value is not listed in
    ignore_list are replaced in place with the same text plus " wawan!".
    Comments and any other node types are left untouched.

    Args:
        node: A BeautifulSoup object, Tag, NavigableString, or Comment.

    Returns:
        None. The tree that contains ``node`` is modified in place.
    """
    # Comment is a subclass of NavigableString, so it has to be tested before
    # the generic text branch; otherwise comments would be rewritten as
    # visible text.
    if isinstance(node, Comment):
        return

    if isinstance(node, (Tag, bs)):
        # The document root and ordinary tags are both containers.
        into_child(node)
    elif isinstance(node, NavigableString):
        # For a NavigableString, ``.string`` is the node itself.
        text = node.string
        if text not in ignore_list:
            node.replace_with(text + " wawan!")
    # Any other node type is intentionally ignored.
def into_child(node):
    """Call traverse() on every direct child of ``node``.

    Args:
        node: A container node exposing a ``children`` iterable
            (a BeautifulSoup object or Tag).

    Returns:
        None.
    """
    # traverse() may replace text children via replace_with(), which edits the
    # parent's child list. Iterate over a snapshot so the walk never depends
    # on how the live iterator reacts to that mutation.
    for child in list(node.children):
        traverse(child)
# Demo driver: a small sample document used to exercise traverse().
my_html = """<html>
<head><title>Sample Title</title></head>
<body>
<h1>Test h1</h1>
<p>My favorite animal is dog.</p>
<p>Wanwan love.</p>
<p>That is all.</p>
</body></html>
"""
# Parse the sample using the 'html.parser' backend.
soup = bs(my_html, 'html.parser')
# traverse() rewrites the text nodes of ``soup`` in place; it returns nothing.
traverse(soup)
# Print the modified document.
print(soup)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment