Skip to content

Instantly share code, notes, and snippets.

@razamatan
Created March 10, 2011 01:26
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save razamatan/863397 to your computer and use it in GitHub Desktop.
Save razamatan/863397 to your computer and use it in GitHub Desktop.
simple conversion from etree parsed xml to objects
''' xml utilities '''
__author__ = 'jin@recessnetworks.net'
from itertools import groupby
from operator import attrgetter
def split_ns(txt):
    ''' splits an etree style qualified name into [ns, tag]

    the result is always a two item list. ns is '' when txt contains no '}'.

    >>> split_ns('{http://www.w3.org/2001/XMLSchema-instance}nil')
    ['http://www.w3.org/2001/XMLSchema-instance', 'nil']
    >>> split_ns('foobar')
    ['', 'foobar']
    '''
    # drop the opening brace (if any) so that the first '}' is what separates
    # the namespace from the tag
    unbraced = txt[1:] if txt.startswith('{') else txt
    pieces = unbraced.split('}', 1)
    if len(pieces) == 1:
        # no closing brace: everything is the tag, the namespace is empty
        pieces.insert(0, '')
    return pieces
class protected(dict):
    ''' dict that raises an exception if you try to update an existing key

    both item assignment and update() refuse to replace a key that is already
    present. the initial contents handed to the constructor are loaded by dict
    itself and are not checked.

    >>> x = protected(zip('abcd', '1234'))
    >>> x['a'] = 3
    Traceback (most recent call last):
    KeyError: ('already exists', 'a')
    >>> x.update(a=3)
    Traceback (most recent call last):
    KeyError: ('already exists', 'a')
    >>> x['e'] = 5
    '''

    def __setitem__(self, k, v):
        ''' stores v under k. raises KeyError if k is already present. '''
        if k in self:
            raise KeyError('already exists', k)
        dict.__setitem__(self, k, v)

    def update(self, *args, **kwargs):
        ''' like dict.update, but raises KeyError on the first existing key

        the builtin dict.update does not go through an overridden
        __setitem__, so it has to be overridden too for the protection to
        hold. pairs are inserted one at a time: when KeyError is raised, the
        pairs that came before the offending key have already been added.
        '''
        if len(args) > 1:
            raise TypeError(
                'update expected at most 1 arguments, got %d' % len(args))
        if args:
            other = args[0]
            if hasattr(other, 'keys'):
                # mapping style argument
                for k in other.keys():
                    self[k] = other[k]
            else:
                # iterable of (key, value) pairs
                for k, v in other:
                    self[k] = v
        for k, v in kwargs.items():
            self[k] = v
def objectify(elm, prefix='', base=None):
    ''' converts a parsed etree element into an object.

    if prefix is specified (str), it will prefix all the keys it inserts into
    the base (text, tail, attribute names and child tags). useful for
    preventing name collisions.

    when base is specified, it will create a hierarchy of bases to represent
    the parse. base is called with no arguments and needs to implement
    MutableMapping (e.g. dict, OrderedDict, etc.). if base is unspecified, it
    **WILL MODIFY THE ETREE ELEMENT OBJECTS** it traverses by adding
    attributes to them through elm.__dict__ (so the elements must expose a
    __dict__). WARNING: if it wasn't clear, this is potentially quite
    destructive on the etree elements!

    namespaces are dropped from attribute names and child tags.

    returns one of:
      - the stripped text (or None when there is no text) for an element
        that has no children, no attributes and no non-blank tail
      - None when the element carries a nil attribute equal to 'true'
      - a list of converted children when the element holds nothing but
        several children of one single tag (no attributes, no non-blank
        text)
      - otherwise elm itself (base unspecified) or the populated base
        instance. children sharing a tag are stored as a list under that
        tag, a lone child is stored directly.

    see dictify for an example of passing a base.
    '''
    rval = elm.__dict__ if base is None else base()

    # text
    text = elm.text.strip() if elm.text else elm.text
    tail = elm.tail.strip() if elm.tail else elm.tail
    if not (len(elm) or elm.attrib or tail):
        return text
    if text:
        rval[prefix + 'text'] = text
    if tail:
        rval[prefix + 'tail'] = tail

    # attributes
    rval.update((prefix + split_ns(k)[1], v) for k, v in elm.attrib.items())

    # test for nil
    if rval.get(prefix + 'nil') == 'true':
        return None

    # children, grouped by tag in order of first appearance. grouping is done
    # by hand because itertools.groupby only merges *adjacent* equal keys:
    # siblings of the same tag separated by another tag would land in
    # separate groups and the later group would clobber the earlier one.
    tag_order = []
    by_tag = {}
    for child in elm:
        if child.tag not in by_tag:
            by_tag[child.tag] = []
            tag_order.append(child.tag)
        by_tag[child.tag].append(objectify(child, prefix, base))

    # many children of one type and nothing else. the stripped text is tested
    # (not elm.text) so that indentation whitespace in pretty printed xml
    # does not change the shape of the result.
    # NOTE(review): a non-blank tail recorded above is not part of the
    # returned list.
    if len(tag_order) == 1 and not (elm.attrib or text):
        only = by_tag[tag_order[0]]
        if len(only) > 1:
            return only

    for tag in tag_order:
        kids = by_tag[tag]
        rval[prefix + split_ns(tag)[1]] = kids if len(kids) > 1 else kids[0]
    return elm if base is None else rval
def dictify(elm, prefix='', base=protected):
    ''' builds a dict style representation of the xml rooted at elm

    thin wrapper around objectify: elm, prefix and base are handed over
    unchanged and objectify's result is returned as is. base defaults to
    protected instead of None, so the etree elements are left unmodified.

    >>> from xml.etree import ElementTree
    >>> d = dictify(ElementTree.fromstring('<a x="1"><b>2</b></a>'))
    >>> sorted(d.items())
    [('b', '2'), ('x', '1')]
    '''
    return objectify(elm, prefix=prefix, base=base)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment