ram0973/lxml_parse_html_tree.py

## lxml_parse_html_tree.py
from lxml import etree
from io import StringIO

# Sample markup passed to parse_HTML() by the script entry point below.
# It is not well-formed: quoted text precedes the first tag, the opening
# <span> is never closed, and there is no <html>/<body> wrapper. It also
# contains an HTML comment and a <pre> element.
# NOTE(review): presumably malformed on purpose to exercise the lenient
# HTML parser -- confirm.
html = """"Hello, world!"<span class="black">
<div class="c1">division
    <p>"Hello - this is me.
    (c) passage in division"
    <b>"bold in passage "</b>
    </p>
        My phone:
    (+7) 999-999-99-99
</div>
<!-- Comment -->
<pre>It's a pre.</pre>
"""

def parse_HTML(html):
    """Parse an HTML string, print each element, and append '!' to its text.

    The markup is parsed with ``etree.HTMLParser`` and the nodes yielded by
    ``tree.iter()`` are visited in turn. Nodes whose ``tag`` is not a string
    (e.g. comments) are skipped. For every remaining element:

    * its ``tail`` is printed first, when it is not ``None``;
    * a ``"<tag> => <text>"`` line is printed, using the element's text as
      it was before modification (an empty string when it has no text);
    * ``'!'`` is appended to the element's text in the tree, when it has
      text.

    Args:
        html: HTML source as a ``str``.

    Returns:
        The parsed lxml tree, with the ``'!'`` suffixes applied in place.
    """
    parser = etree.HTMLParser()
    tree = etree.parse(StringIO(html), parser)

    # iter() is the supported spelling; getiterator() is deprecated in lxml.
    for elem in tree.iter():
        # A comment node's ``tag`` is a function object rather than a string,
        # so a non-str tag identifies a node that must be skipped.
        if not isinstance(elem.tag, str):
            continue
        if elem.text is None:
            text = ''
        else:
            # Keep the unmodified text for printing, then mark the tree.
            text = elem.text
            elem.text = text + '!'
        if elem.tail is not None:
            print(elem.tail)
        print(elem.tag + " => " + text)
    return tree

if __name__ == "__main__":
    # Script entry point: run the demo parser over the sample markup, then
    # print the text-only serialization of the resulting tree.
    parsed = parse_HTML(html)
    text_only = etree.tostring(parsed, method='text', encoding=str)
    print(text_only)
	from lxml import etree
	from io import StringIO

	# Sample markup passed to parse_HTML() by the entry-point code below.
	# It is not well-formed: quoted text precedes the first tag and the
	# opening <span> is never closed. Every line of the literal after the
	# first begins with a tab character that is part of the string value.
	# NOTE(review): this looks like a whitespace-mangled duplicate of an
	# earlier definition -- confirm whether it is still needed.
	html = """"Hello, world!"<span class="black">
	<div class="c1">division
	<p>"Hello - this is me.
	(c) passage in division"
	<b>"bold in passage "</b>
	</p>
	My phone:
	(+7) 999-999-99-99
	</div>
	<!-- Comment -->
	<pre>It's a pre.</pre>
	"""

	def parse_HTML(html):
		"""Parse an HTML string, print each element, and append '!' to its text.

		The markup is parsed with ``etree.HTMLParser`` and the nodes yielded
		by ``tree.iter()`` are visited in turn. Nodes whose ``tag`` is not a
		string (e.g. comments) are skipped. For every remaining element:

		* its ``tail`` is printed first, when it is not ``None``;
		* a ``"<tag> => <text>"`` line is printed, using the element's text
		  as it was before modification (an empty string when it has none);
		* ``'!'`` is appended to the element's text in the tree, when it has
		  text.

		Args:
			html: HTML source as a ``str``.

		Returns:
			The parsed lxml tree, with the ``'!'`` suffixes applied in place.
		"""
		parser = etree.HTMLParser()
		tree = etree.parse(StringIO(html), parser)

		# iter() is the supported spelling; getiterator() is deprecated in lxml.
		for elem in tree.iter():
			# A comment node's ``tag`` is a function object rather than a
			# string, so a non-str tag identifies a node that must be skipped.
			if not isinstance(elem.tag, str):
				continue
			if elem.text is None:
				text = ''
			else:
				# Keep the unmodified text for printing, then mark the tree.
				text = elem.text
				elem.text = text + '!'
			if elem.tail is not None:
				print(elem.tail)
			print(elem.tag + " => " + text)
		return tree

	if __name__ == "__main__":
		# Script entry point: parse the sample markup, then print the
		# text-only serialization of the resulting tree.
		html_tree = parse_HTML(html)
		print(etree.tostring(html_tree, encoding=str, method='text'))