-
-
Save nasingfaund/77103d1b505ff729f09f028f3ca9af5a to your computer and use it in GitHub Desktop.
Reduce text content length in an XML file with a streaming reader.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from xml.sax.saxutils import XMLGenerator | |
from defusedxml.sax import make_parser | |
from io import BytesIO | |
class XmlTextShrinker(XMLGenerator):
    """
    An extended xml.sax.saxutils.XMLGenerator.

    Re-emits the SAX events it receives into an in-memory file
    (``output_file``), truncating the text content of the elements listed in
    ``shrink_tags`` to at most ``max_sample_size`` characters per element.
    """

    def __init__(self, shrink_tags: list, encoding: str = 'UTF-8', max_sample_size: int = 64):
        """
        Initialize the class.

        :param shrink_tags: Shrink the text contents of these elements.
        :param encoding: Target file encoding
        :param max_sample_size: Maximum number of characters of text kept per
            shrunk element (defaults to 64).
        """
        self.shrink_tags = shrink_tags
        # flag for 'characters' events: True while directly inside a shrink tag
        self.is_shrinking = False
        # how much data should be retained (counted in characters)
        self.max_sample_size = max_sample_size
        # counter for data written in 'characters' events of the current
        # element; despite the name it counts characters, not encoded bytes
        self.sample_bytes_writen = 0
        # in-memory target file
        self.output_file = BytesIO()
        super().__init__(out=self.output_file, encoding=encoding)

    def startElement(self, name, attrs):
        """
        Override super classes event handler to set the marker for affected elements.

        :param name: The name of the handled element
        :param attrs: Attributes of the handled element
        """
        # Set the shrinking marker. A nested child element clears it again, so
        # only simple elements containing text are shrunk.
        self.is_shrinking = name in self.shrink_tags
        # Every element starts with a fresh budget. Resetting only for
        # non-shrink elements would leave consecutive shrink elements sharing
        # one exhausted counter, so the second one would lose all its text.
        self.sample_bytes_writen = 0
        super().startElement(name, attrs)

    def endElement(self, name):
        """
        Override super classes event handler to clear the marker for affected elements.

        :param name: The name of the handled element
        """
        # do not remove text around the element, keep pretty printing etc.
        self.is_shrinking = False
        super().endElement(name)

    def characters(self, content):
        """
        Override the super classes event handler to shrink character content, if needed.

        The parser may deliver the text of one element in several chunks, so
        the amount already emitted is tracked across calls.

        :param content: The content, that was read in this event
        """
        if self.is_shrinking:
            # how much data may still be emitted for the current element
            remaining = self.max_sample_size - self.sample_bytes_writen
            # keep at most the remaining budget; nothing once it is used up
            content = content[:remaining] if remaining > 0 else ''
            # increment the counter by the amount of data written
            self.sample_bytes_writen += len(content)
        super().characters(content)

    @classmethod
    def shrink_file(cls, xml_file: object, shrink_tags: list, max_sample_size: int = 64) -> BytesIO:
        """
        Helper to shrink an XML file.

        :param xml_file: The file to shrink. Should be a file-like object (filename or stream)
        :param shrink_tags: A list of tags to look for. The text content of these tags will be shrunk.
        :param max_sample_size: Maximum number of characters of text kept per
            shrunk element (defaults to 64).
        :return: A io.BytesIO object (in-memory file).
        """
        # create a parser
        parser = make_parser()
        # get an instance of this class as the content handler
        content_handler = cls(shrink_tags, max_sample_size=max_sample_size)
        parser.setContentHandler(content_handler)
        # handle the file
        parser.parse(xml_file)
        return content_handler.output_file
# Demo run: shrink the 'Data' and 'CipherValue' elements of a sample file
# and print the resulting XML document.
test_file = r'..\test_files\A2019-02-11_16-57-05.xml'
result_file = XmlTextShrinker.shrink_file(
    shrink_tags=['Data', 'CipherValue'],
    xml_file=test_file,
)
# the in-memory result holds encoded bytes; decode them for display
print(result_file.getvalue().decode('UTF-8'))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment