Skip to content

Instantly share code, notes, and snippets.

@nasingfaund
Forked from p-jahn/shrink_xml_texts.py
Created April 5, 2023 19:37
Show Gist options
  • Save nasingfaund/77103d1b505ff729f09f028f3ca9af5a to your computer and use it in GitHub Desktop.
Save nasingfaund/77103d1b505ff729f09f028f3ca9af5a to your computer and use it in GitHub Desktop.
Reduce the length of text content in an XML file using a streaming reader.
from xml.sax.saxutils import XMLGenerator
from defusedxml.sax import make_parser
from io import BytesIO
class XmlTextShrinker(XMLGenerator):
    """
    An extended xml.sax.saxutils.XMLGenerator that truncates selected text content.

    All SAX events are written to an in-memory file (``output_file``). Character
    data that follows the start tag of an element listed in ``shrink_tags`` (and
    precedes the next start or end tag) is cut off after ``max_sample_size``
    characters; everything else is passed through unchanged.
    """

    def __init__(
        self,
        shrink_tags: list,
        encoding: str = 'UTF-8',
        max_sample_size: int = 64,
    ):
        """
        Initialize the class.

        :param shrink_tags: Shrink the text contents of these elements.
        :param encoding: Target file encoding
        :param max_sample_size: Maximum number of characters of text retained
            per shrunk element. Defaults to 64.
        :raises ValueError: If ``max_sample_size`` is negative.
        """
        if max_sample_size < 0:
            raise ValueError('max_sample_size must not be negative')
        self.shrink_tags = shrink_tags
        # flag for 'characters' events
        self.is_shrinking = False
        # how much data should be retained
        self.max_sample_size = max_sample_size
        # counter for data written in 'characters' events of the current element;
        # despite the (historic) name it counts characters of the str content
        self.sample_bytes_writen = 0
        # in-memory target file
        self.output_file = BytesIO()
        super().__init__(out=self.output_file, encoding=encoding)

    def startElement(self, name, attrs):
        """
        Override super classes event handler to set the marker for affected elements

        :param name: The name of the handled element
        :param attrs: Attributes of the handled element
        """
        # Set the shrinking marker. This way, only simple elements containing text are shrunk.
        self.is_shrinking = name in self.shrink_tags
        # Every element starts with a fresh budget. Resetting only for
        # non-shrunk elements would let consecutive shrunk siblings share one
        # budget, leaving the later ones empty.
        self.sample_bytes_writen = 0
        super().startElement(name, attrs)

    def endElement(self, name):
        """
        Override super classes event handler to clear the marker for affected elements

        :param name: The name of the handled element
        """
        # do not remove text around the element, keep pretty printing etc.
        self.is_shrinking = False
        super().endElement(name)

    def characters(self, content):
        """
        Override the super classes event handler to shrink character content, if needed.

        The text of one element may arrive in several 'characters' events, so
        the amount already written is tracked across calls.

        :param content: The content, that was read in this event
        """
        if self.is_shrinking:
            # how much data may still be emitted for the current element
            remaining = self.max_sample_size - self.sample_bytes_writen
            # a non-positive remainder means the budget is used up: write nothing
            content = content[:remaining] if remaining > 0 else ''
            # increment the counter by the amount of data written
            self.sample_bytes_writen += len(content)
        super().characters(content)

    @classmethod
    def shrink_file(
        cls,
        xml_file: object,
        shrink_tags: list,
        max_sample_size: int = 64,
    ) -> BytesIO:
        """
        Helper to shrink an XML file.

        :param xml_file: The file to shrink. Should be a file-like object (filename or stream)
        :param shrink_tags: A list of tags to look for. The text content of these tags
            will be shrunk.
        :param max_sample_size: Maximum number of characters of text retained
            per shrunk element. Defaults to 64.
        :return: A io.BytesIO object (in-memory file).
        """
        # create a parser
        parser = make_parser()
        # get an instance of this class as the content handler
        content_handler = cls(shrink_tags, max_sample_size=max_sample_size)
        parser.setContentHandler(content_handler)
        # handle the file
        parser.parse(xml_file)
        return content_handler.output_file
# Sample run: shrink the text of all 'Data' and 'CipherValue' elements of a
# test document and print the resulting XML.
test_file = r'..\test_files\A2019-02-11_16-57-05.xml'
result_file = XmlTextShrinker.shrink_file(test_file, ['Data', 'CipherValue'])
# the in-memory file holds encoded bytes; decode them for display
print(result_file.getvalue().decode('UTF-8'))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment