Last active
December 27, 2016 11:02
-
-
Save uni8inu/67fd5c7360b818c97a01020d6f52d427 to your computer and use it in GitHub Desktop.
BeautifulSoup4 TreeNode traverse and fix NavigableString
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from bs4 import BeautifulSoup as bs | |
from bs4.element import Tag, NavigableString, Comment | |
# Background on NavigableString:
# https://www.crummy.com/software/BeautifulSoup/bs4/doc/#navigablestring
#
# Text nodes whose content is exactly one of these values are treated as
# layout-only filler and are left untouched by traverse().
ignore_list = ["", " ", "\n"]
def traverse(node):
    """Walk a BeautifulSoup tree depth-first, appending " wawan!" to its text.

    Args:
        node: Any element of a parsed document: the BeautifulSoup object
            itself, a Tag, or a string-like leaf (NavigableString, Comment).

    Returns:
        None. The tree is modified in place: every plain text node whose
        content is not listed in ``ignore_list`` is replaced by the same
        text with " wawan!" appended.
    """
    # Comment is a subclass of NavigableString, so it must be tested first;
    # otherwise the text branch below would also rewrite comments.
    if isinstance(node, Comment):
        return

    if isinstance(node, NavigableString):
        text = node.string
        if text not in ignore_list:
            node.replace_with(text + " wawan!")
        return

    # BeautifulSoup is itself a Tag subclass; both are containers whose
    # children need to be visited. Anything else is silently ignored.
    if isinstance(node, (Tag, bs)):
        into_child(node)
def into_child(node):
    """Run traverse() on every direct child of *node*.

    Args:
        node: A container element exposing a ``children`` iterable.

    Returns:
        None.
    """
    # traverse() may replace text children in place, which mutates the
    # container being walked; iterate over a snapshot so that replacements
    # cannot disturb the iteration.
    for child in list(node.children):
        traverse(child)
# Small sample document used to demonstrate the traversal.
my_html = """<html>
<head><title>Sample Title</title></head>
<body>
<h1>Test h1</h1>
<p>My favorite animal is dog.</p>
<p>Wanwan love.</p>
<p>That is all.</p>
</body></html>
"""

# Parse with the standard-library backed parser, rewrite every text node in
# place, then print the resulting markup.
soup = bs(my_html, 'html.parser')
traverse(soup)
print(soup)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment