Skip to content

Instantly share code, notes, and snippets.

@paxan
Created October 24, 2012 19:25
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save paxan/3948251 to your computer and use it in GitHub Desktop.
Save paxan/3948251 to your computer and use it in GitHub Desktop.
Example: stream-oriented parsing of XML (pretty hacky!)
# Remixed from: http://www.dabeaz.com/coroutines/cosax.py
# Don't shy away from reading http://www.dabeaz.com/coroutines/ if the stuff below
# seems super weird.
from xml.sax import ContentHandler, parse
from collections import namedtuple
# Event describing an opening tag: the element name plus its attributes.
ElementStart = namedtuple('ElementStart', ['name', 'attrs'])
# Event describing a closing tag: only the element name is carried.
ElementEnd = namedtuple('ElementEnd', ['name'])
class EventHandler(ContentHandler):
    """SAX content handler that forwards element events to a coroutine.

    Each element start/end reported by the parser is wrapped in an
    ``ElementStart`` / ``ElementEnd`` tuple and pushed into ``target`` via
    ``target.send(...)``.
    """

    def __init__(self, target):
        """Store the receiver of the events.

        ``target`` must provide a ``send(event)`` method. When it is a
        generator-based coroutine, the caller has to advance it to its first
        ``yield`` before parsing starts, otherwise the first ``send`` fails.
        """
        # Explicit base call rather than super(): this also works when the
        # base class is an old-style class (Python 2).
        ContentHandler.__init__(self)
        self.target = target

    def startElement(self, name, attrs):
        """Send an ``ElementStart`` carrying a plain-dict copy of ``attrs``."""
        # Copy through the public Attributes interface instead of reaching
        # into the private ``_attrs`` field: the parser is allowed to reuse
        # the attrs object once this callback returns, and other Attributes
        # implementations need not have that field at all.
        self.target.send(ElementStart(name, dict(attrs.items())))

    def endElement(self, name):
        """Send an ``ElementEnd`` for the element that just closed."""
        self.target.send(ElementEnd(name))
def pull_tuples(tuples):
    """Coroutine collecting ``(vehicle id, route edges)`` pairs.

    After being primed once, it accepts ``ElementStart`` / ``ElementEnd``
    events through ``send()``. For every ``vehicle`` element it appends
    ``(id, edges)`` to ``tuples`` when the vehicle closes. ``edges`` is taken
    from the most recent ``route`` start seen while the vehicle was open, or
    is ``None`` if there was none. Events outside a vehicle are ignored.

    The ``id`` and ``edges`` attributes are read by subscript, so a missing
    attribute propagates the lookup error out of ``send()``.
    """
    def opens(event, tag):
        return isinstance(event, ElementStart) and event.name == tag

    def closes(event, tag):
        return isinstance(event, ElementEnd) and event.name == tag

    in_vehicle = False
    vehicle_id = route_edges = None
    while True:
        item = yield
        if not in_vehicle:
            # Waiting for the next vehicle; everything else is skipped.
            if opens(item, 'vehicle'):
                vehicle_id = item.attrs['id']
                route_edges = None
                in_vehicle = True
            continue
        # Inside a vehicle: remember route edges until the vehicle closes.
        if opens(item, 'route'):
            route_edges = item.attrs['edges']
        elif closes(item, 'vehicle'):
            tuples.append((vehicle_id, route_edges))
            in_vehicle = False
if __name__ == '__main__':
    # Demo: parse an in-memory XML document and print the (id, edges) pair
    # collected for each <vehicle> element.
    try:
        from StringIO import StringIO  # Python 2
    except ImportError:
        from io import StringIO  # Python 3

    results = []
    puller = pull_tuples(results)
    # Prime the coroutine so it is paused at its first ``yield`` and ready
    # to receive events; next() works on both Python 2 and 3.
    next(puller)
    x = """\
<root><foo />
<bar>
<vehicle id="1292442" depart="26060.00">
<route edges="24449167#2 27659684#1 24686876#1"/>
</vehicle>
</bar>
<vehicle id="1292443" depart="26060.00">
<route edges="24449167#2 27659684#1 24686876#1"/>
</vehicle>
<vehicle id="1292444" depart="26060.00">
<route edges="24449167#2 27659684#1 24686876#1"/>
</vehicle>
</root>
"""
    parse(StringIO(x), EventHandler(puller))
    # Parsing is finished; shut the coroutine down explicitly.
    puller.close()
    # A single parenthesised argument prints identically on Python 2 and 3.
    print(results)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment