Last active
December 22, 2015 23:49
-
-
Save junjiah/6549375 to your computer and use it in GitHub Desktop.
A brief tutorial on the lxml module
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from lxml import etree

# parse
# Custom parser; XMLParser supports other keyword arguments besides ns_clean.
parser = etree.XMLParser(ns_clean=True)
# The source could be a file name/path, a file-like object, or an http/ftp
# URL, but a name/path or URL is faster than a file-like object.
# NOTE: this call uses the default parser; pass `parser` as the second
# argument to apply the custom one.
# NOTE: parse() returns an ElementTree, whereas fromstring()/XML() below
# return the root Element.
tree = etree.parse(some_file_like_obj)
# Or parse from a string, as follows.
tree = etree.fromstring(string, parser)
tree = etree.XML('<root><a><b/></a></root>')
# build an XML tree
root = etree.Element("root")
print(root.tag)  # => root
# Children can be appended explicitly or created in place with SubElement.
root.append(etree.Element("child1"))
child2 = etree.SubElement(root, "child2")
child3 = etree.SubElement(root, "child3")
# tostring() returns a byte string by default; decode it so the printed
# text matches the output shown below on Python 3 as well.
print(etree.tostring(root, pretty_print=True).decode())
# => <root>
# =>   <child1/>
# =>   <child2/>
# =>   <child3/>
# => </root>
# traverse the tree
# Elements behave like lists of their children.
print(root[0].tag)  # => child1
print(len(root))  # => 3
root.insert(0, etree.Element("child0"))
for child in root:  # traverse the direct children
    pass
root[0].getparent()  # => root
root[1].getnext()  # => root[2]
for element in root.iter("child1", "child2"):  # traverse specified tags only
    pass
for element in root.iter(tag=etree.Entity):  # and more: etree.Element..
    pass
# attributes
# Keyword arguments to Element() become XML attributes.
root = etree.Element("root", interesting="totally")
etree.tostring(root)  # => '<root interesting="totally"/>' (bytes on Python 3)
root.get("interesting")  # => 'totally'
root.set("hello", "Huhu")  # add an attribute
root.keys(), root.items(), root.attrib
# => (['interesting', 'hello'],
# =>  [('interesting', 'totally'), ('hello', 'Huhu')],
# =>  {'interesting': 'totally', 'hello': 'Huhu'})
# text, and tostring
root = etree.Element("root")
root.text = "TEXT"  # => '<root>TEXT</root>'
root.tail = "TAIL"  # => '<root>TEXT</root>TAIL'
# method="text" serializes without tags; the other methods are 'html', 'xml'.
etree.tostring(root, method="text")
etree.tostring(root, xml_declaration=True)  # with an XML declaration
etree.tostring(root, encoding="UTF-8")  # choose the output encoding
# event-driven
for event, element in etree.iterparse(
        'filepath', events=('start', 'end', 'start-ns', 'end-ns')):
    if event in ('start-ns', 'end-ns'):
        # Namespace events do not carry an Element: 'start-ns' yields a
        # (prefix, uri) tuple and 'end-ns' yields None, so reading `.tag`
        # on them would raise AttributeError.
        continue
    if element.tag == 'xxx':
        pass  # handle the element of interest
    if event == 'end':
        pass  # the element and its whole subtree have been parsed
# ElementPath
# Each method requires a path expression; "child1" is only a placeholder.
root.iterfind("child1")  # iterates over all Elements that match the path expression
root.find("child1")  # efficiently returns only the first match
root.findall("child1")  # returns a list of matching Elements
root.findtext("child1")  # returns the .text content of the first match
# an ugly way to remove namespace
# Rename the default-namespace declaration to an ordinary attribute so that
# tags are parsed without a namespace prefix.
# NOTE: despite its name, `some_file_like_obj` must be a path for open().
# Read as bytes: etree.XML() rejects unicode strings that carry an XML
# encoding declaration. The `with` block closes the file afterwards.
with open(some_file_like_obj, 'rb') as f:
    tree = etree.XML(f.read().replace(b' xmlns=', b' xmlnamespace='))
# map an XML tree into a dict of dicts (of text)
def recursive_dict(element):
    """Map an element subtree to a ``(tag, value)`` pair.

    ``value`` is a dict built from the element's children (each converted
    recursively and keyed by its tag) when the element has children;
    otherwise it is the element's ``.text``.

    Note: siblings that share a tag collapse into a single dict entry (the
    last one wins), and the text of an element that has children is dropped.

    :param element: an Element-like object exposing ``.tag`` and ``.text``
        and iterating over its child elements.
    :return: ``(tag, dict_of_children_or_text)``.
    """
    children = dict(map(recursive_dict, element))
    # An empty dict is falsy, so a leaf element falls back to its text.
    return element.tag, children or element.text
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment