Skip to content

Instantly share code, notes, and snippets.

@junjiah
Last active December 22, 2015 23:49
Show Gist options
  • Save junjiah/6549375 to your computer and use it in GitHub Desktop.
Save junjiah/6549375 to your computer and use it in GitHub Desktop.
A brief tutorial on the lxml module
from lxml import etree
# parse
parser = etree.XMLParser(ns_clean=True)  # XMLParser supports other arguments too
# The source could be a file name/path, a file-like object, or an http/ftp
# url, but name/path and url are faster. The parser must be passed
# explicitly; otherwise its options (ns_clean here) are not applied.
tree = etree.parse(some_file_like_obj, parser)
# or parse from a string as follows
# NOTE: fromstring()/XML() return the root Element, not an ElementTree.
tree = etree.fromstring(string, parser)
tree = etree.XML('<root><a><b/></a></root>')
# build an XML tree
root = etree.Element("root")
print(root.tag)  # => root
root.append(etree.Element("child1"))  # attach an already-created element
child2 = etree.SubElement(root, "child2")  # create and attach in one step
child3 = etree.SubElement(root, "child3")
# tostring() returns bytes on Python 3; decode so print shows the markup
# itself instead of a b'...' literal (harmless on Python 2).
print(etree.tostring(root, pretty_print=True).decode())
# => <root>
# =>   <child1/>
# =>   <child2/>
# =>   <child3/>
# => </root>
# traverse the tree
print(root[0].tag)  # => child1
print(len(root))  # => 3
root.insert(0, etree.Element("child0"))  # children are now child0..child3
for child in root:  # traverse the children of root
    pass  # ... handle each child here
root[0].getparent()  # => root
root[1].getnext()  # => root[2]
for element in root.iter("child1", "child2"):  # traverse specified tags
    pass  # ...
for element in root.iter(tag=etree.Entity):  # and more: etree.Element..
    pass  # ...
# attributes
# Keyword arguments given to Element() become attributes of the new element.
root = etree.Element("root", interesting="totally")
etree.tostring(root) # => '<root interesting="totally"/>'
root.get("interesting") # => 'totally' (value of the named attribute)
root.set("hello", "Huhu") # add an attribute
# keys() gives the attribute names, items() the (name, value) pairs, and
# attrib the dict-like mapping holding all attributes.
root.keys(), root.items(), root.attrib
# => (['interesting', 'hello'],
# =>  [('interesting', 'totally'), ('hello', 'Huhu')],
# =>  {'interesting': 'totally', 'hello': 'Huhu'})
# text, and tostring
root = etree.Element("root")
# .text is the content inside the element's tags.
root.text = "TEXT" # => '<root>TEXT</root>'
# .tail is the text that follows the element's closing tag.
root.tail = "TAIL" # => '<root>TEXT</root>TAIL'
# tostring() options: pick the serialisation method, add an XML declaration,
# or choose the output encoding.
etree.tostring(root, method="text") # no tags; other methods are 'html', 'xml'
etree.tostring(root, xml_declaration=True) # with declaration
etree.tostring(root, encoding="UTF-8") # modify encoding
# event-driven
for event, element in etree.iterparse(
        'filepath', events=('start', 'end', 'start-ns', 'end-ns')):
    if event in ('start-ns', 'end-ns'):
        # Namespace events do not carry an Element: 'start-ns' yields a
        # (prefix, uri) tuple and 'end-ns' yields None, so skip them before
        # touching element.tag.
        continue
    if element.tag == 'xxx':
        pass  # ... handle the element of interest
    if event == 'end':
        pass  # ... the element is fully parsed at this point
# ElementPath
# Every method takes a path expression as its first argument; "child1" is
# only a placeholder. Calling them with no argument raises TypeError.
root.iterfind("child1")  # iterates over all Elements that match the path expression
root.find("child1")  # efficiently returns only the first match
root.findall("child1")  # returns a list of matching Elements
root.findtext("child1")  # returns the .text content of the first match
# an ugly way to remove namespace
# Renaming the default-namespace declaration turns it into an ordinary
# attribute, so tags are no longer namespace-qualified.
# NOTE: despite its name, some_file_like_obj must be a path here for open().
# Read bytes: lxml rejects unicode strings that carry an XML encoding
# declaration. The with-block also closes the file.
with open(some_file_like_obj, 'rb') as f:
    tree = etree.XML(f.read().replace(b' xmlns=', b' xmlnamespace='))
# map an XML tree into a dict of dicts (of text)
def recursive_dict(element):
    """Convert *element* into a ``(tag, value)`` pair, recursing into children.

    ``value`` is a dict mapping each child's tag to that child's own value
    when the element has children; otherwise it is the element's ``.text``
    (``None`` for an empty element).

    Children sharing the same tag overwrite one another in the dict, so only
    the last one survives. Attributes and tail text are ignored.
    """
    children = dict(recursive_dict(child) for child in element)
    # An empty dict is falsy, so leaf elements fall back to their text.
    return element.tag, children or element.text
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment