Skip to content

Instantly share code, notes, and snippets.

Forked from benallard/
Last active September 2, 2017 14:07
Show Gist options
  • Save borowis/ea00c68bfb2c73fd8e64e9378b18c277 to your computer and use it in GitHub Desktop.
Save borowis/ea00c68bfb2c73fd8e64e9378b18c277 to your computer and use it in GitHub Desktop.
``./ -M 8092 --split_on_tag "tu" big.xml`` --> splits a potentially huge big.xml into chunks of approximately 8 MB named big.0.xml, big.1.xml, etc. With --split_on_tag, a new file is started only after a closing </tu> tag (the option can be omitted if you do not care where the cut falls).
#!/usr/bin/env python
import os
import xml.parsers.expat
from xml.sax.saxutils import escape
from optparse import OptionParser
from math import log10
# How much data we process at a time
CHUNK_SIZE = 1024 * 1024
# The sequence of elements leading us to the current one
path = []
# How far we are in the current file
cur_size = 0
# Size above which we should start another file
MAX_SIZE = 1024 * 1024  # 1 MiB
# The current index
cur_idx = 0
# The current file handle we are writing to
cur_file = None
# The format string used to introduce the index in the file to be written
FMT = ".%d"
# The filename we are playing with
out_dir = None
root = None
ext = None
# The XML declaration of the file.
xml_declaration = None
# What was the signature of the last start element
start = None
# Tag on which the file should be split once it is larger than MAX_SIZE
split_tag = None
# Whether we are currently in the process of changing file
ending = False
# When True, next_file() does not close/re-open the enclosing elements or
# repeat the XML declaration in each part (set by --debug).  A default is
# required here: without it, next_file() raises NameError whenever --debug
# is not given.
DEBUG_MODE = False
def attrs_s(attrs):
    """Render a flat ``[name, value, name, value, ...]`` list as XML attributes.

    Each pair becomes `` name="value"`` (note the leading space), so the
    result can be appended directly after an element name.  An empty list
    yields an empty string.

    Values are escaped for ``&``, ``<`` and ``>`` and also for the double
    quote, because the value is always wrapped in double quotes; leaving a
    ``"`` unescaped would produce malformed XML.
    """
    # The list is expected to hold an even number of items (name/value pairs).
    pairs = zip(attrs[::2], attrs[1::2])
    return ''.join(
        ' %s="%s"' % (key, escape(value, {'"': '&quot;'}))
        for key, value in pairs
    )
def next_file(end_element_name=None):
    """Decide whether to cut the current file and, if so, start a new one.

    A split happens only when all of the following hold:

    * we are not already in the middle of a split (``ending`` is False),
    * more than ``MAX_SIZE`` has been accounted for in the current part,
    * no ``split_tag`` is configured, or ``end_element_name`` equals it.

    ``end_element_name`` is the name of the element that was just closed, or
    None when the call does not come from an end tag.

    NOTE(review): in the reviewed copy both ``for`` loops had no body and the
    statement under "Close the file" was missing.  They are restored here
    from the adjacent comments (close the open elements, close the file,
    re-open the elements) -- confirm against the upstream script.
    """
    global cur_size, ending, cur_file, cur_idx

    if ending or cur_size <= MAX_SIZE:
        return
    if split_tag is not None and end_element_name != split_tag:
        return

    # DEBUG_MODE is only assigned by the command-line code when --debug is
    # given, so fall back to False instead of raising NameError.
    debug = globals().get('DEBUG_MODE', False)

    # Size above threshold, and not already ending
    print("part %d Done" % cur_idx)
    ending = True

    # Close the current elements, innermost first.  While ``ending`` is set
    # the handlers only write and do not touch ``path`` or ``cur_size``.
    if not debug:
        for elem in reversed(path):
            end_element(elem[0])

    # Close the file
    cur_file.close()
    # Reset the size
    cur_size = 0
    # Open another file
    cur_idx += 1
    cur_file = open(os.path.join(out_dir, root + FMT % cur_idx + ext), 'wt')

    if not debug:
        if xml_declaration is not None:
            cur_file.write('<?xml%s?>\n' % attrs_s(xml_declaration))
        # Start again where we stopped, outermost element first
        for elem in path:
            start_element(*elem)

    # We are done 'ending'
    ending = False
def xml_decl(version, encoding, standalone):
    """Record the XML declaration and copy it to the current output file.

    Installed as the parser's ``XmlDeclHandler``.  The declaration is kept in
    the global ``xml_declaration`` as a flat ``[name, value, ...]`` list so
    that it can be written again at the top of every later part.

    ``standalone`` is -1 when the clause was omitted; otherwise its truth
    value selects "yes" or "no".  ``version`` and ``encoding`` are skipped
    when they are None (a declaration such as ``<?xml version="1.0"?>`` has
    no encoding), since passing None on to ``attrs_s`` would fail.
    """
    global xml_declaration
    decl = []
    if version is not None:
        decl.extend(['version', version])
    if encoding is not None:
        decl.extend(['encoding', encoding])
    if standalone != -1:
        decl.extend(['standalone', 'yes' if standalone else 'no'])
    xml_declaration = decl
    cur_file.write('<?xml%s?>\n' % attrs_s(xml_declaration))
def start_element(name, attrs):
    """Called by the parser when it meets a start element.

    The start tag is not written immediately: it is remembered in ``start``
    so that an element closed right away can be emitted as ``<name/>``.  A
    previously remembered start tag is flushed first.

    ``attrs`` is the flat ``[name, value, ...]`` list produced by the parser
    with ``ordered_attributes`` enabled.
    """
    global cur_size, start
    if start is not None:
        # Chaining starts after each other: the previous element has
        # children, so its start tag can be written now.
        cur_file.write('<%s%s>' % (start[0], attrs_s(start[1])))
    start = (name, attrs)
    if ending:
        # We are being replayed while switching files: the element is
        # already on ``path`` and already counted.  Returning here matters,
        # otherwise replaying ``path`` would append to the very list being
        # iterated.
        return
    cur_size += len(name) + sum(len(k) for k in attrs)
    path.append((name, attrs))
def end_element(name):
    """Called by the parser when it meets an end element.

    Writes either the self-closing form of a still-pending start tag or a
    regular ``</name>`` end tag, then updates ``path`` and ``cur_size`` and
    gives ``next_file`` the opportunity to split after this element.

    NOTE(review): the ``else`` branch, the early ``return`` while ``ending``
    and the final ``next_file(name)`` call were absent from the reviewed
    copy.  They are restored from the comments and from ``next_file``'s
    ``end_element_name`` parameter -- confirm against the upstream script.
    """
    global cur_size
    global start
    if start is not None:
        # Empty element: the start part was not written yet, so emit the
        # short form.
        cur_file.write('<%s%s/>' % (start[0], attrs_s(start[1])))
    else:
        # There was some content, close it normally
        cur_file.write('</%s>' % name)
    start = None
    if ending:
        # Closing elements on behalf of next_file(): keep ``path`` intact so
        # the same elements can be re-opened in the next part.
        return
    elem = path.pop()
    assert elem[0] == name
    cur_size += len(name)
    next_file(name)
def char_data(data):
    """Called by the parser when it meets character data.

    Flushes a pending start tag, re-escapes the text, writes it to the
    current file and accounts for its length in ``cur_size``.

    NOTE(review): the statement writing ``data`` and the ``next_file()`` call
    under the final ``if`` were absent from the reviewed copy; they are
    restored from the surrounding comments -- confirm against the upstream
    script (in particular whether the data was encoded before writing).
    """
    global cur_size, start
    wrote_start = False
    if start is not None:
        # The data belongs to an element, we should write the start part
        # first
        cur_file.write('<%s%s>' % (start[0], attrs_s(start[1])))
        start = None
        wrote_start = True
    # ``escape`` is too much for us, only & and < need to be escaped there ...
    data = data.replace('&', '&amp;')
    data = data.replace('<', '&lt;')
    # ... plus a chunk that consists of exactly '>'.
    # NOTE(review): presumably this targets a '>' that came from an entity
    # reference and is delivered on its own -- confirm.
    if data == '>':
        data = '&gt;'
    cur_file.write(data)
    cur_size += len(data)
    if not wrote_start:
        # The data did not directly follow a start tag, it could be the
        # right moment to make the split
        next_file()
def main(filename, output_dir):
    """Split the XML file ``filename`` into numbered parts.

    Parts are named ``<root><index><ext>`` and written to ``output_dir``, or
    next to the input file when ``output_dir`` is None.  The index is
    zero-padded to a width derived from the input size and ``MAX_SIZE``.

    Fixes relative to the reviewed copy:

    * the padding width no longer uses ``log10`` of the size ratio, which
      raised ``ValueError`` for inputs smaller than ``MAX_SIZE``;
    * the input is opened through its full path, not the bare file name left
      over after ``os.path.split``;
    * the output handle is closed even if parsing fails.

    NOTE(review): the read call, the ``break``, the per-chunk ``Parse`` and
    the final close were absent from the reviewed copy and are restored from
    the comments -- confirm against the upstream script.
    """
    global cur_file, cur_idx
    global out_dir, root, ext
    global FMT

    # Create a parser
    p = xml.parsers.expat.ParserCreate()
    # We want to reproduce the input, so we are interested in the order of
    # the attributes
    p.ordered_attributes = 1
    # Set our callbacks (we are stripping comments out by not defining
    # callbacks for them)
    p.XmlDeclHandler = xml_decl
    p.StartElementHandler = start_element
    p.EndElementHandler = end_element
    p.CharacterDataHandler = char_data

    # Keep the full path for reading; ``filename`` becomes the base name.
    in_path = filename

    # Same width as the old int(log10(n)) + 1 for n >= 1, but well defined
    # when the file is smaller than MAX_SIZE (n == 0).
    size_ratio = os.path.getsize(in_path) // max(1, MAX_SIZE)
    FMT = ".%%0%dd" % len(str(max(1, size_ratio)))

    out_dir, filename = os.path.split(filename)
    if output_dir is not None:
        out_dir = output_dir
    root, ext = os.path.splitext(filename)

    cur_file = open(os.path.join(out_dir, root + FMT % cur_idx + ext), 'wt')
    try:
        # Binary mode hands the raw bytes to expat, which then honours the
        # encoding named in the XML declaration.
        with open(in_path, 'rb') as xml_file:
            while True:
                # Read a chunk
                chunk = xml_file.read(CHUNK_SIZE)
                if len(chunk) < CHUNK_SIZE:
                    # End of file: tell the parser we're done
                    p.Parse(chunk, 1)
                    # exit the loop
                    break
                # process the chunk
                p.Parse(chunk)
    finally:
        # Don't forget to close our handle (next_file() may have replaced it
        # since it was opened above)
        cur_file.close()
    print("part %d Done" % cur_idx)
if __name__ == "__main__":
    # Command-line entry point: parse the options, override the module-level
    # settings they control, then split the single XML file given.
    parser = OptionParser(usage="usage: %prog [options] XML_FILE")
    parser.add_option("-o", "--output-dir",
                      help="Specify the directory where the xml files will be written "
                           "(default to the same directory where the original file is)")
    parser.add_option("-M", "--max_size", type="int",
                      help="Specify the size at which the files should be split (in Kb)")
    parser.add_option("--debug", action="store_true",
                      help="Do not copy headers/footers. Useful for verifying split was correct")
    parser.add_option("--split_on_tag", help="Split only on some specific tags")
    (options, args) = parser.parse_args()
    if len(args) != 1:
        parser.error("incorrect number of arguments")
    if options.max_size is not None:
        if options.max_size <= 0:
            # main() divides by MAX_SIZE, so zero cannot be accepted.
            parser.error("--max_size must be a positive number of Kb")
        MAX_SIZE = options.max_size * 1024
        print("Requested size = %s (bytes)" % MAX_SIZE)
    # Always define DEBUG_MODE: next_file() reads it on every split, and the
    # option is None (not False) when --debug is not given.
    DEBUG_MODE = bool(options.debug)
    if options.split_on_tag is not None:
        split_tag = options.split_on_tag
        print("Split on %s" % split_tag)
    main(args[0], options.output_dir)
Copy link

borowis commented Apr 22, 2016

Extended original solution:

  1. --debug -- do not insert headers / footers, just do the split. This allowed me to verify that the line counts and character counts match.
  2. --split_on_tag -- only split after the given tag has been closed. I was working on translation memory files (TMX), so it was important for me not to break an element in the middle.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment