Skip to content

Instantly share code, notes, and snippets.

@zstarpak
Forked from nicwolff/XML_breaker.py
Created November 22, 2020 08:29
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save zstarpak/b9a25078a87ffee378462fb30ea370b4 to your computer and use it in GitHub Desktop.
Save zstarpak/b9a25078a87ffee378462fb30ea370b4 to your computer and use it in GitHub Desktop.
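Python script to break a large XML file into several smaller, well-formed XML files (a new file is started after every N occurrences of a chosen element)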
import os
import sys
from xml.sax import parse
from xml.sax.saxutils import XMLGenerator
class CycleFile(object):
    """Write-only sink that spreads its output over numbered files.

    For a *filename* such as ``data.xml`` the output files are named
    ``data1.xml``, ``data2.xml``, ... (the index is inserted between the base
    name and the extension).  The first file is opened on construction and
    ``cycle()`` switches to the next one.

    Files are opened in binary mode: on Python 3 ``XMLGenerator`` encodes the
    document itself and hands ``bytes`` to ``write()``, so a text-mode file
    would raise ``TypeError``.  Text passed to ``write()`` is encoded as UTF-8.
    """

    def __init__(self, filename):
        """Derive the output naming scheme from *filename* and open file 1."""
        self.basename, self.ext = os.path.splitext(filename)
        self.index = 0
        self.open_next_file()

    def open_next_file(self):
        """Advance the index and open the corresponding output file."""
        self.index += 1
        # Binary mode keeps the bytes on disk identical to what the XML
        # serializer produced (no locale encoding, no newline translation).
        self.file = open(self.name(), 'wb')

    def name(self):
        """Return the path of the output file for the current index."""
        return '%s%s%s' % (self.basename, self.index, self.ext)

    def cycle(self):
        """Close the current output file and start writing to the next one."""
        self.file.close()
        self.open_next_file()

    def write(self, str):
        """Write *str* (bytes, or text that is encoded as UTF-8) to the current file.

        The parameter name shadows the builtin; it is kept unchanged so that
        existing keyword callers keep working.
        """
        if not isinstance(str, (bytes, bytearray, memoryview)):
            str = str.encode('utf-8')
        return self.file.write(str)

    def close(self):
        """Close the current output file."""
        self.file.close()
class XMLBreaker(XMLGenerator):
    """SAX content handler that re-serializes a document into several files.

    Every event is forwarded to ``XMLGenerator`` so the input is written back
    out as UTF-8 XML.  After each ``break_after``-th closing tag of the element
    named ``break_into``, the elements that are still open are closed, the
    output sink is asked to ``cycle()`` to a new file, and a fresh XML
    declaration plus the same open elements (with their attributes) are
    written to it.

    Parameters:
        break_into: name of the element whose closing tags are counted.
        break_after: number of such elements per output file.
        out: output sink; it must provide ``write()`` and, for splitting to
            work, ``cycle()``.
        *args, **kwargs: passed through to ``XMLGenerator.__init__``.
    """

    def __init__(self, break_into=None, break_after=1000, out=None, *args, **kwargs):
        XMLGenerator.__init__(self, out, encoding='utf-8', *args, **kwargs)
        self.out_file = out
        self.break_into = break_into
        self.break_after = break_after
        # Stack of (name, attrs) for the elements that are currently open.
        self.context = []
        # Number of ``break_into`` elements closed since the last split.
        self.count = 0

    def startElement(self, name, attrs):
        """Write the start tag and remember the element as open."""
        XMLGenerator.startElement(self, name, attrs)
        # The SAX API allows the parser to reuse ``attrs`` after this call
        # returns, so keep a copy for re-opening the element later.
        self.context.append((name, attrs.copy()))

    def endElement(self, name):
        """Write the end tag and split the output when the quota is reached."""
        XMLGenerator.endElement(self, name)
        self.context.pop()
        if name != self.break_into:
            return
        self.count += 1
        if self.count != self.break_after:
            return
        self.count = 0
        # Close the enclosing elements, innermost first, so the file that is
        # being finished is well-formed.  The newline goes through the
        # generator (not straight to the sink) so it is encoded like the rest
        # of the document.
        for open_name, _ in reversed(self.context):
            XMLGenerator.ignorableWhitespace(self, "\n")
            XMLGenerator.endElement(self, open_name)
        self.out_file.cycle()
        # Start the next file with a declaration and the same open elements.
        XMLGenerator.startDocument(self)
        for open_name, open_attrs in self.context:
            XMLGenerator.startElement(self, open_name, open_attrs)
def main(argv=None):
    """Split the XML file named on the command line.

    Expects three arguments: the input file name, the name of the element to
    count, and how many of those elements go into each output file.  *argv*
    defaults to ``sys.argv[1:]``.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    if len(args) != 3:
        sys.exit("usage: %s FILENAME BREAK_INTO BREAK_AFTER" % sys.argv[0])
    filename, break_into, break_after = args
    # Convert before opening any output so a bad number creates no files.
    limit = int(break_after)
    out = CycleFile(filename)
    try:
        parse(filename, XMLBreaker(break_into, limit, out=out))
    finally:
        # Make sure the last output file is flushed and closed, even when
        # parsing fails part-way through.
        out.close()


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment