Skip to content

Instantly share code, notes, and snippets.

@ram0973
Created May 2, 2018 07:30
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ram0973/37d9a9013e1eacfdbd395aced19a6380 to your computer and use it in GitHub Desktop.
Save ram0973/37d9a9013e1eacfdbd395aced19a6380 to your computer and use it in GitHub Desktop.
LXML tree iteration example
from lxml import etree
from io import StringIO
# Sample markup fed to parse_HTML() by the __main__ block below. It mixes
# bare text, an unclosed <span>, a <div> holding a nested <p>/<b>, an HTML
# comment and a <pre>, so the walk sees elements with text, with tail text,
# and a comment node.
html = """"Hello, world!"<span class="black">
<div class="c1">division
<p>"Hello - this is me.
(c) passage in division"
<b>"bold in passage "</b>
</p>
My phone:
(+7) 999-999-99-99
</div>
<!-- Comment -->
<pre>It's a pre.</pre>
"""
def parse_HTML(html):
    """Parse an HTML string, print every element, and mark its text.

    Each node yielded by ``tree.iter()`` whose tag is a string is visited.
    For such a node the tail text (when present) is printed first, then a
    ``tag => text`` line. Nodes that have text get ``'!'`` appended to it
    in the tree itself, so the returned tree is a modified copy of the
    input markup; the printed line shows the text before that change.

    Args:
        html: HTML markup as a ``str``.

    Returns:
        The parsed lxml element tree with the ``'!'`` suffixes applied.
    """
    parser = etree.HTMLParser()
    tree = etree.parse(StringIO(html), parser)
    # iter() is the supported replacement for the deprecated getiterator().
    for elem in tree.iter():
        # Comment nodes do not carry a string tag (lxml exposes a factory
        # function there instead), so they are skipped.
        if not isinstance(elem.tag, str):
            continue
        text = elem.text
        if text is None:
            text = ''
        else:
            # Mutate the tree, but keep the original text for printing.
            elem.text = text + '!'
        if elem.tail is not None:
            print(elem.tail)
        print(f"{elem.tag} => {text}")
    return tree
if __name__ == "__main__":
    # Walk the sample markup (parse_HTML prints as it goes), then dump the
    # text content of the resulting tree as a str.
    html_tree = parse_HTML(html)
    plain_text = etree.tostring(html_tree, method='text', encoding=str)
    print(plain_text)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment