Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Python script to break large XML files
import os
import sys
from xml.sax import parse
from xml.sax.saxutils import XMLGenerator
class CycleFile(object):
def __init__(self, filename):
self.basename, self.ext = os.path.splitext(filename)
self.index = 0
self.open_next_file()
def open_next_file(self):
self.index += 1
self.file = open(self.name(), 'w')
def name(self):
return '%s%s%s' % (self.basename, self.index, self.ext)
def cycle(self):
self.file.close()
self.open_next_file()
def write(self, str):
self.file.write(str)
def close(self):
self.file.close()
class XMLBreaker(XMLGenerator):
def __init__(self, break_into=None, break_after=1000, out=None, *args, **kwargs):
XMLGenerator.__init__(self, out, encoding='utf-8', *args, **kwargs)
self.out_file = out
self.break_into = break_into
self.break_after = break_after
self.context = []
self.count = 0
def startElement(self, name, attrs):
XMLGenerator.startElement(self, name, attrs)
self.context.append((name, attrs))
def endElement(self, name):
XMLGenerator.endElement(self, name)
self.context.pop()
if name == self.break_into:
self.count += 1
if self.count == self.break_after:
self.count = 0
for element in reversed(self.context):
self.out_file.write("\n")
XMLGenerator.endElement(self, element[0])
self.out_file.cycle()
XMLGenerator.startDocument(self)
for element in self.context:
XMLGenerator.startElement(self, *element)
filename, break_into, break_after = sys.argv[1:]
parse(filename, XMLBreaker(break_into, int(break_after), out=CycleFile(filename)))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.