Skip to content

Instantly share code, notes, and snippets.

@BoboTiG
Created May 6, 2020 18:50
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save BoboTiG/af7780b10e458c5ca1e8ad5a2e7e52e1 to your computer and use it in GitHub Desktop.
Save BoboTiG/af7780b10e458c5ca1e8ad5a2e7e52e1 to your computer and use it in GitHub Desktop.
[Python] XML parser benchmark
"""
http://www.tiger-222.fr/index.php?d=2020/05/03/12/59/13
"""
import functools
import xml.etree.ElementTree as ET
import xml.sax
import xml.sax.handler
from datetime import timedelta
from time import monotonic
from typing import Any, Dict, Generator

import xmltodict
def timer(func):
    """Decorate a one-argument benchmark so every call reports its duration.

    The wrapper calls ``func(file)`` and then prints
    ``"<func name>(): <elapsed>"``, where *elapsed* is a ``timedelta``
    computed from two ``time.monotonic()`` readings. The report is printed
    from a ``finally`` clause, so it is emitted even when ``func`` raises;
    the exception then propagates to the caller.

    Args:
        func: Callable taking a single positional ``file`` argument.

    Returns:
        A wrapper that carries ``func``'s metadata (name, docstring, ...)
        and returns whatever ``func`` returns.
    """

    @functools.wraps(func)
    def inner(file):
        start = monotonic()
        try:
            # Hand the result back instead of silently dropping it.
            return func(file)
        finally:
            delta = timedelta(seconds=monotonic() - start)
            print(f"{func.__name__}(): {delta}", flush=True)

    return inner
#
# --- xmltodict
#
def handle_element(_, page: Dict[str, Any]) -> bool:
    """No-op item callback handed to ``xmltodict.parse``.

    Both arguments are ignored; no work is done per item.

    Returns:
        Always ``True``.
    """
    keep_going = True
    return keep_going
@timer
def f1(file: str) -> None:
    """Benchmark xmltodict: stream *file* through ``xmltodict.parse``.

    The file is opened in binary mode and parsed with ``item_depth=2``,
    with ``handle_element`` as the item callback.
    """
    parse_options = {"item_depth": 2, "item_callback": handle_element}
    with open(file, "rb") as stream:
        xmltodict.parse(stream, **parse_options)
#
# --- ElementTree
#
def xml_iter_parse(
    file: str,
    tag_page: str = "{http://www.mediawiki.org/xml/export-0.10/}page",
) -> Generator[ET.Element, None, None]:
    """Lazily yield every completed ``tag_page`` element found in *file*.

    The document is read incrementally with ``ET.iterparse``. After each
    yielded element the consumer has had its turn, and the root element is
    cleared so that already-processed pages are not kept in memory.

    Args:
        file: Source handed to ``ET.iterparse`` as-is.
        tag_page: Fully qualified (``{namespace}name``) tag to look for.
            Defaults to the MediaWiki export-0.10 ``page`` element, so
            existing callers are unaffected.

    Yields:
        Each matching element, once its "end" event has been seen.
    """
    doc = ET.iterparse(file, events=("start", "end"))
    try:
        # The first event is consumed only to get hold of the root element;
        # it is never considered as a page itself.
        _, root = next(doc)

        inside_page = False
        for event, element in doc:
            if not inside_page:
                if event == "start" and element.tag == tag_page:
                    inside_page = True
            elif event == "end" and element.tag == tag_page:
                yield element
                inside_page = False
                root.clear()  # Keep memory low
    finally:
        # iterparse objects expose close() on recent Python versions only;
        # call it when present so the source is released even if the
        # consumer stops iterating early.
        close = getattr(doc, "close", None)
        if close is not None:
            close()
def xml_parse_element(element: ET.Element) -> None:
    """Placeholder for per-page processing; *element* is left untouched."""
    return
@timer
def f2(file: str) -> None:
    """Benchmark ElementTree: feed every page of *file* to the no-op parser.

    Pages come from ``xml_iter_parse`` and are passed one by one to
    ``xml_parse_element``.
    """
    pages = xml_iter_parse(file)
    for page in pages:
        xml_parse_element(page)
#
# --- SAX2
#
class PageHandler(xml.sax.handler.ContentHandler):
    """SAX content handler collecting the title and text of each ``<page>``.

    While inside a ``<page>``, character data of ``<title>`` is gathered in
    ``word`` and character data of ``<text>`` in ``code``. When the ``<text>``
    element ends, ``process()`` is called if a title was collected, and the
    per-page state is cleared afterwards.

    SAX parsers may report one run of character data in several chunks, so
    both ``word`` and ``code`` are built by accumulation.
    """

    # NOTE: "words" is declared here but never assigned by this class.
    __slots__ = ("_current_tag", "_is_page", "code", "word", "words")

    def __init__(self) -> None:
        super().__init__()
        self.code = ""
        self.word = ""
        self._current_tag = ""
        self._is_page = False

    def reset(self) -> None:
        """Forget the current page's title/text and leave "page" mode."""
        self.code = ""
        self.word = ""
        self._is_page = False

    def startElement(self, tag: str, attributes: Dict[str, str]) -> None:
        """Track entry into ``<page>``, ``<title>`` and ``<text>`` elements."""
        if tag == "page":
            self._is_page = True
        elif tag == "title":
            self._current_tag = tag
            if self._is_page:
                # A new title replaces any previous one (e.g. from a page
                # that had no <text>); its chunks are appended from scratch.
                self.word = ""
        elif tag == "text":
            self._current_tag = tag

    def endElement(self, tag: str) -> None:
        """Finish the page when a ``<text>`` ends; always drop the current tag."""
        if self._current_tag == "text":
            if self.word:
                self.process()
            self.reset()
        self._current_tag = ""

    def process(self) -> None:
        """Hook called once per titled page; ``word`` and ``code`` are set."""
        pass

    def characters(self, content: str) -> None:
        """Append *content* to the title or text being read inside a page."""
        if not self._is_page:
            return
        if self._current_tag == "title":
            # Accumulate: assigning here would keep only the last chunk.
            self.word += content
        elif self._current_tag == "text":
            self.code += content
@timer
def f3(file: str) -> None:
    """Benchmark SAX: parse *file* with a ``PageHandler`` content handler.

    String interning is switched on for the parser before parsing starts.
    """
    handler = PageHandler()
    reader = xml.sax.make_parser()
    reader.setFeature(xml.sax.handler.feature_string_interning, True)
    reader.setContentHandler(handler)
    reader.parse(file)
# https://dumps.wikimedia.org/frwiktionary/20200401/frwiktionary-20200401-pages-meta-current.xml.bz2
# 4,107,154 <page> elements => 4,107,154 words to handle
file = "data/fr/pages-20200420.xml"

# Run the three benchmarks in order. Durations noted next to each call
# in the original script:
#   f1  0:07:27
#   f2  0:02:08
#   f3  0:05:46
for benchmark in (f1, f2, f3):
    benchmark(file)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment