Created
May 6, 2020 18:50
-
-
Save BoboTiG/af7780b10e458c5ca1e8ad5a2e7e52e1 to your computer and use it in GitHub Desktop.
[Python] XML parser benchmark
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""
Benchmark of three Python XML parsers: xmltodict, ElementTree and SAX2.

http://www.tiger-222.fr/index.php?d=2020/05/03/12/59/13
"""
import xml.etree.ElementTree as ET | |
import xml.sax | |
import xml.sax.handler | |
from datetime import timedelta | |
from time import monotonic | |
from typing import Any, Dict, Generator | |
import xmltodict | |
def timer(func):
    """Decorate *func* so that every call prints its wall-clock duration.

    The returned wrapper takes a single ``file`` argument, forwards it to
    *func* and returns whatever *func* returns. The elapsed time, measured
    with ``monotonic()``, is printed as ``<name>(): <timedelta>`` even when
    *func* raises; the exception then propagates to the caller.
    """
    # Imported locally so the decorator does not depend on a module-level
    # import that the file does not have.
    from functools import wraps

    @wraps(func)  # keep the benchmarked function's __name__ and __doc__
    def inner(file):
        start = monotonic()
        try:
            return func(file)
        finally:
            # Runs on success and on failure, so a crashing run is timed too.
            delta = timedelta(seconds=monotonic() - start)
            print(f"{func.__name__}(): {delta}", flush=True)

    return inner
# | |
# --- xmltodict | |
# | |
def handle_element(_, page: Dict[str, Any]) -> bool:
    """Item callback passed to ``xmltodict.parse()``.

    Both arguments are ignored because the benchmark only measures parsing.
    Always returns ``True``.
    """
    keep_going = True
    return keep_going
@timer
def f1(file: str) -> None:
    """Benchmark xmltodict: stream *file* and call back on depth-2 items."""
    with open(file, mode="rb") as stream:
        xmltodict.parse(stream, item_callback=handle_element, item_depth=2)
# | |
# --- ElementTree | |
# | |
def xml_iter_parse(
    file: str,
    tag: str = "{http://www.mediawiki.org/xml/export-0.10/}page",
) -> Generator[ET.Element, None, None]:
    """Incrementally parse *file* and yield each completed *tag* element.

    Args:
        file: Path of the XML document to parse.
        tag: Fully qualified (``{namespace}name``) tag of the elements to
            yield. Defaults to the MediaWiki export 0.10 ``page`` element.

    Yields:
        Each matching element once its "end" event has been seen. The
        document root is cleared when the generator resumes, so consume an
        element before asking for the next one.
    """
    doc = ET.iterparse(file, events=("start", "end"))

    # The first event is the "start" of the document root. Keep a handle on
    # the root so that already-consumed children can be dropped from it.
    _, root = next(doc)

    inside = False  # True between the "start" and "end" of a matching element
    for event, element in doc:
        if element.tag != tag:
            continue

        if not inside:
            if event == "start":
                inside = True
        elif event == "end":
            yield element
            inside = False
            root.clear()  # Keep memory low
def xml_parse_element(element: ET.Element) -> None:
    """Placeholder for per-page processing; ignores *element* and does nothing."""
    return
@timer
def f2(file: str) -> None:
    """Benchmark ElementTree: iterate over the pages of *file* one at a time."""
    pages = xml_iter_parse(file)
    for page in pages:
        xml_parse_element(page)
# | |
# --- SAX2 | |
# | |
class PageHandler(xml.sax.handler.ContentHandler):
    """SAX content handler that collects the title and text of each page.

    While inside a ``<page>`` element, the character data of ``<title>`` is
    accumulated in ``word`` and that of ``<text>`` in ``code``. When the
    ``<text>`` element ends, ``process()`` is called if a title was seen, and
    the per-page state is then reset.
    """

    __slots__ = ("_current_tag", "_is_page", "code", "word", "words")

    def __init__(self) -> None:
        """Initialise the base handler and start with empty per-page state."""
        super().__init__()
        self.code = ""
        self.word = ""
        self._current_tag = ""
        self._is_page = False

    def reset(self) -> None:
        """Forget the data collected for the current page."""
        self.code = ""
        self.word = ""
        self._is_page = False

    def startElement(self, tag: str, attributes: Dict[str, str]) -> None:
        """Track entry into ``page``, ``title`` and ``text`` elements."""
        if tag == "page":
            self._is_page = True
        elif tag == "title":
            self._current_tag = tag
            # Start from a clean title so that chunks appended in
            # characters() never pile up on a previous title.
            self.word = ""
        elif tag == "text":
            self._current_tag = tag

    def endElement(self, tag: str) -> None:
        """Hand the page over to ``process()`` once its text is complete."""
        if self._current_tag == "text":
            if self.word:
                self.process()
            self.reset()
        # Whatever element just closed, stop collecting character data so
        # that whitespace between elements is not recorded.
        self._current_tag = ""

    def process(self) -> None:
        """Hook called with ``word`` and ``code`` filled in; does nothing here."""
        pass

    def characters(self, content: str) -> None:
        """Accumulate character data for the element being read.

        A parser may deliver one text node in several chunks (for instance
        around entities such as ``&amp;``), so both the title and the text
        are appended to rather than overwritten.
        """
        if not self._is_page:
            return

        if self._current_tag == "title":
            self.word += content
        elif self._current_tag == "text":
            self.code += content
@timer
def f3(file: str) -> None:
    """Benchmark SAX2: parse *file* with a ``PageHandler`` content handler."""
    sax_parser = xml.sax.make_parser()
    sax_parser.setFeature(xml.sax.handler.feature_string_interning, 1)
    sax_parser.setContentHandler(PageHandler())
    sax_parser.parse(file)
# https://dumps.wikimedia.org/frwiktionary/20200401/frwiktionary-20200401-pages-meta-current.xml.bz2
# 4,107,154 <page> elements => 4,107,154 words to handle
#
# Durations noted next to each call in the original script:
#   f1 (xmltodict)   0:07:27
#   f2 (ElementTree) 0:02:08
#   f3 (SAX2)        0:05:46
file = "data/fr/pages-20200420.xml"
for _benchmark in (f1, f2, f3):
    _benchmark(file)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment