Created
May 17, 2016 13:17
-
-
Save katyukha/3f5695c391f65216287ae67f567a57df to your computer and use it in GitHub Desktop.
Monkeypatch xmlrpclib to use the lxml parser. It has better memory performance on large XML entities on Python 2.7.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import xmlrpclib | |
try:
    from lxml.etree import XMLPullParser
except ImportError:
    # lxml is optional: None marks "not available" for code that checks it.
    LXMLParser = None
else:
    class LXMLParser(object):
        """Feed-style XML parser that forwards lxml pull events to a target.

        Exposes the ``feed(data)`` / ``close()`` pair and drives a target
        object through ``start(tag, attrs)``, ``data(text)`` and
        ``end(tag)`` calls.

        Only ``element.text`` is forwarded, and it is delivered right before
        the matching ``end`` call; tail text is never forwarded.
        """

        def __init__(self, target):
            """Create the underlying pull parser and remember *target*.

            :param target: object providing ``start(tag, attrs)``,
                           ``data(text)`` and ``end(tag)`` methods.
            """
            self._parser = XMLPullParser(events=('start', 'end'),
                                         huge_tree=True)
            self._target = target

        def handle_events(self):
            """Forward every pending parser event to the target.

            Elements are released as soon as their ``end`` event has been
            forwarded, so the tree does not grow with the document size.
            """
            target = self._target
            for action, element in self._parser.read_events():
                if action == 'start':
                    target.start(element.tag, element.attrib)
                elif action == 'end':
                    if element.text:
                        target.data(element.text)
                    target.end(element.tag)

                    # Drop the element's content ...
                    element.clear()
                    # ... and detach the already-processed siblings before
                    # it; clear() alone leaves the emptied elements attached
                    # to the parent, which still accumulates on big inputs.
                    parent = element.getparent()
                    if parent is not None:
                        while element.getprevious() is not None:
                            del parent[0]

        def feed(self, data):
            """Push a chunk of raw XML into the parser and forward events.

            Parser errors propagate unchanged. The raw chunk is deliberately
            not echoed anywhere: it may be very large.
            """
            self._parser.feed(data)
            self.handle_events()

        def close(self):
            """Finish parsing and forward any events still pending.

            Raises whatever the underlying parser raises on close (for
            example when the document is incomplete).
            """
            self._parser.close()
            # Events can still be queued once the parser is terminated.
            self.handle_events()
# Keep a handle on the stock factory so it stays reachable as a fallback.
_stdlib_getparser = xmlrpclib.getparser


def getparser(use_datetime=0):
    """getparser() -> parser, unmarshaller

    Create an lxml-backed parser attached to an unmarshalling object and
    return both objects. When lxml could not be imported (``LXMLParser`` is
    ``None``), delegate to the stock ``xmlrpclib.getparser`` instead of
    failing with a ``TypeError``.
    """
    if LXMLParser is None:
        return _stdlib_getparser(use_datetime=use_datetime)
    target = xmlrpclib.Unmarshaller(use_datetime=use_datetime)
    parser = LXMLParser(target)
    return parser, target


# Install the replacement only when it can actually work; without lxml the
# library keeps its original parser factory untouched.
if LXMLParser is not None:
    xmlrpclib.getparser = getparser
Author
katyukha
commented
May 17, 2016
- Python 2.7 pyexpat parser eats all memory on big (100MB) xml content.
- lxml seems to have better memory performance
- Python 3.5 pyexpat seems to work well
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment