# ram0973/bs4_parse_html_tree.py

## bs4_parse_html_tree.py
from bs4 import BeautifulSoup, Comment
# Tag names whose text content parse_HTML leaves out of its printed output.
SKIP_TAGS = ['style', 'script', 'meta', 'code', 'pre']

# Sample markup handed to parse_HTML by the __main__ demo. It mixes top-level
# text, nested tags, an HTML comment and a <pre> block; the <span> opened on
# the first line is never closed.
html = """
"Hello, world!"<span class="black">
<div class="c1">division
    <p>"Hello - this is me.
    (c) passage in division"
    <b>"bold in passage "</b>
    </p>
        My phone:
    (+7) 999-999-99-99
</div>
<!-- Comment -->
<pre>It's a pre.</pre>
"""


def parse_HTML(html_text):
    """Print the text nodes of an HTML document and return it prettified.

    The markup is parsed with BeautifulSoup's built-in ``html.parser``.
    Every text node that is not an HTML comment is printed with its own
    ``print`` call, except:

    * empty strings and strings equal to a single newline;
    * strings nested, at any depth, inside a tag listed in ``SKIP_TAGS``.

    Args:
        html_text: HTML markup to parse.

    Returns:
        The markup re-serialised by ``soup.prettify()``, or ``None`` when
        ``html_text`` is empty or ``None``.
    """
    if not html_text:
        return None
    soup = BeautifulSoup(html_text, 'html.parser')
    # find_all is the current spelling of the deprecated findAll alias.
    text_nodes = soup.find_all(
        string=lambda node: not isinstance(node, Comment)
    )
    for node in text_nodes:
        if not node or node == '\n':
            continue
        # Walk every ancestor, not just the direct parent, so text wrapped in
        # extra markup inside a skipped tag (e.g. <pre><span>x</span></pre>)
        # is skipped as well.
        if any(ancestor.name in SKIP_TAGS for ancestor in node.parents):
            continue
        print(node)
    return soup.prettify()


if __name__ == "__main__":
    # Demo run: parse the sample markup, then show the prettified result.
    prettified = parse_HTML(html)
    print(prettified)
	# NOTE(review): everything from here to the end of the file repeats the
	# script above with each line's indentation collapsed to a single tab.
	# The nested bodies (def / if / for) therefore no longer parse, and the
	# tab indent clashes with the space-indented line before it. This looks
	# like a paste or extraction artifact -- confirm and remove.
	from bs4 import BeautifulSoup, Comment
	SKIP_TAGS = ['style', 'script', 'meta', 'code', 'pre']

	html = """
	"Hello, world!"<span class="black">
	<div class="c1">division
	<p>"Hello - this is me.
	(c) passage in division"
	<b>"bold in passage "</b>
	</p>
	My phone:
	(+7) 999-999-99-99
	</div>
	<!-- Comment -->
	<pre>It's a pre.</pre>
	"""


	def parse_HTML(html_text):
	if not html_text:
	return None
	soup = BeautifulSoup(html_text, 'html.parser')
	tags_texts = soup.findAll(string=lambda txt: not isinstance(txt, Comment))
	for tag_text in tags_texts:
	text = tag_text
	if not text or text == '\n' or tag_text.findParent().name in SKIP_TAGS:
	continue
	print(text)
	return soup.prettify()


	if __name__ == "__main__":
	print(parse_HTML(html))