junjiah/lxml_tutorial.py

## lxml_tutorial.py
from lxml import etree

# Parsing: create a parser, then load XML from a source or from a string.
parser = etree.XMLParser(ns_clean=True) # XMLParser supports other keyword arguments too
tree = etree.parse(some_file_like_obj)
# The source could be a file name/path, a file-like object, or an http/ftp
# URL, but a name/path or URL is faster.
# XML held in a string is parsed as follows (note that only the
# fromstring() call is handed the custom parser):
tree = etree.fromstring(string, parser)
tree = etree.XML('<root><a><b/></a></root>')

# Build an XML tree element by element.
root = etree.Element("root")
print(root.tag) # => root
root.append(etree.Element("child1")) # attach an already-created element
child2 = etree.SubElement(root, "child2") # create and attach in one call
child3 = etree.SubElement(root, "child3")
# encoding="unicode" makes tostring() return text rather than bytes, so the
# printed result matches the listing below.
print(etree.tostring(root, pretty_print=True, encoding="unicode"))
# => <root>
# =>   <child1/>
# =>   <child2/>
# =>   <child3/>
# => </root>

# Traverse the tree: an element can be indexed, measured and iterated.
print(root[0].tag) # => child1
print(len(root)) # => 3
root.insert(0, etree.Element("child0")) # child0 is now root[0]
for child in root: ... # traverse the children
root[0].getparent() # => root
root[1].getnext() # => root[2]
for element in root.iter("child1", "child2"): ... # traverse specified tags only
for element in root.iter(tag=etree.Entity): ... # or filter by kind: etree.Element, ...

# Attributes: given as keyword arguments at creation, or read/set by name.
root = etree.Element("root", interesting="totally")
etree.tostring(root) # => '<root interesting="totally"/>'
root.get("interesting") # => 'totally'
root.set("hello", "Huhu") # add an attribute
root.keys(), root.items(), root.attrib # names, (name, value) pairs, mapping
# => (['interesting', 'hello'],
# =>  [('interesting', 'totally'), ('hello', 'Huhu')],
# =>  {'interesting': 'totally', 'hello': 'Huhu'})

# Text content, and the serialization options of tostring().
root = etree.Element("root")
root.text = "TEXT" # serializes as '<root>TEXT</root>'
root.tail = "TAIL" # serializes as '<root>TEXT</root>TAIL'
etree.tostring(root, method="text") # no tags; the other methods are 'html' and 'xml'
etree.tostring(root, xml_declaration=True) # with the XML declaration
etree.tostring(root, encoding="UTF-8") # choose the output encoding

# Event-driven parsing: iterparse() yields (event, payload) pairs.
for event, element in etree.iterparse('filepath',
                                events=['start','end','start-ns','end-ns']):
    if event in ('start-ns', 'end-ns'):
        # Namespace events carry no Element: 'start-ns' delivers a
        # (prefix, uri) tuple and 'end-ns' delivers None, so element.tag
        # must not be touched for them.
        ...
        continue
    if element.tag == 'xxx': ...
    if event == 'end': ...

# ElementPath: look up elements with path expressions.
# NOTE(review): the calls below omit the path expression argument that the
# comments refer to; they list the available methods only.
root.iterfind() # iterates over all Elements that match the path expression
root.find() # efficiently returns only the first match
root.findall() # returns a list of matching Elements
root.findtext() # returns the .text content of the first match

# An ugly way to remove the default namespace: rename the xmlns attribute
# in the raw text before it is parsed.
with open(some_file_like_obj) as f: # the file is closed once it has been read
    tree = etree.XML(f.read().replace(' xmlns=', ' xmlnamespace='))

# map an XML tree into a dict of dicts (of text)
def recursive_dict(element):
    """Convert *element* into a ``(tag, content)`` pair.

    ``content`` is a dict built from the ``(tag, content)`` pairs of the
    element's children.  When that dict is empty (the element has no
    children), ``content`` is the element's ``.text`` instead.  Children
    that share a tag collapse into one dict entry, the last one winning.

    Args:
        element: an object exposing ``.tag`` and ``.text`` that yields its
            child elements when iterated.

    Returns:
        tuple: ``(element.tag, children_dict or element.text)``.
    """
    children = dict(recursive_dict(child) for child in element)
    return element.tag, children or element.text