Skip to content

Instantly share code, notes, and snippets.

@katyukha
Created May 17, 2016 13:17
Show Gist options
  • Save katyukha/3f5695c391f65216287ae67f567a57df to your computer and use it in GitHub Desktop.
Save katyukha/3f5695c391f65216287ae67f567a57df to your computer and use it in GitHub Desktop.
Monkeypatch xmlrpclib to use the lxml parser. It has better memory performance when parsing large XML entities on Python 2.7
import xmlrpclib
try:
    from lxml.etree import XMLPullParser
except ImportError:
    # lxml is optional: leave LXMLParser as None so callers can detect
    # its absence and fall back to the stock xmlrpclib parser.
    LXMLParser = None
else:
    class LXMLParser:
        """xmlrpclib-compatible parser backed by lxml's XMLPullParser.

        Exposes the feed()/close() interface xmlrpclib expects from a
        parser and forwards start/data/end events to the supplied
        unmarshaller target.  huge_tree=True lifts lxml's security
        limits so very large (100MB+) XML payloads can be parsed.
        """

        def __init__(self, target):
            # target is an xmlrpclib.Unmarshaller-like object providing
            # start(tag, attrs), data(text) and end(tag) methods.
            self._parser = XMLPullParser(events=('start', 'end'),
                                         huge_tree=True)
            self._target = target

        def handle_events(self):
            # Drain every event accumulated by prior feed()/close()
            # calls and replay it onto the unmarshaller target.
            for action, element in self._parser.read_events():
                if action == 'start':
                    self._target.start(element.tag, element.attrib)
                elif action == 'end':
                    # element.text is only guaranteed complete once the
                    # end tag has been parsed, so character data is
                    # forwarded here rather than on 'start'.
                    if element.text:
                        self._target.data(element.text)
                    self._target.end(element.tag)
                    # Drop the element's children right away to keep
                    # memory usage flat on very large documents.
                    element.clear()

        def feed(self, data):
            try:
                self._parser.feed(data)
            except Exception:
                # Narrowed from a bare "except:" so KeyboardInterrupt
                # and SystemExit propagate untouched; keep the
                # diagnostic and always re-raise.
                print("FEED Error: %r" % data)
                raise
            self.handle_events()

        def close(self):
            self._parser.close()
            # close() may flush the last pending events (e.g. the root
            # element's 'end'); process them so no data is lost.
            self.handle_events()
# Preserve the stock parser factory before monkeypatching so we can fall
# back to it when lxml is unavailable (LXMLParser is None in that case).
_original_getparser = xmlrpclib.getparser


def getparser(use_datetime=0):
    """getparser() -> parser, unmarshaller

    Create an instance of the fastest available parser, and attach it
    to an unmarshalling object.  Return both objects.

    Falls back to the stock xmlrpclib implementation when lxml is not
    installed, instead of crashing on LXMLParser being None.
    """
    if LXMLParser is None:
        # lxml could not be imported -- keep standard behaviour.
        return _original_getparser(use_datetime=use_datetime)
    target = xmlrpclib.Unmarshaller(use_datetime=use_datetime)
    parser = LXMLParser(target)
    return parser, target


xmlrpclib.getparser = getparser
@katyukha
Copy link
Author

  • Python 2.7 pyexpat parser eats all memory on big (100MB) xml content.
  • lxml seems to have better memory performance
  • Python 3.5 pyexpat seems to work well

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment