Skip to content

Instantly share code, notes, and snippets.

@benallard
Last active August 28, 2024 23:00
Show Gist options
  • Save benallard/8042835 to your computer and use it in GitHub Desktop.
Save benallard/8042835 to your computer and use it in GitHub Desktop.
Small python script to split huge XML files into parts. It takes one or two parameters. The first is always the huge XML file, and the second the size of the wished chunks in Kb (default to 1Mb) (0 spilt wherever possible) The generated files are called like the original one with an index between the filename and the extension like that: bigxml.…
#!/usr/bin/env python
import os
import xml.parsers.expat
from xml.sax.saxutils import escape
from optparse import OptionParser
from math import log10
# How much data we process at a time
CHUNK_SIZE = 1024 * 1024
# The sequence of element leading us to the current one
path = []
# How far we are in the current file
cur_size = 0
# From how much should we start another file
MAX_SIZE = 1024*1024 # 1Mb
# The current index
cur_idx = 0
# The current file handle we are writing to
cur_file = None
# The format string used to introduce the index in the file to be written
FMT = ".%d"
# The filename we are playing with
out_dir = None
root = None
ext = None
# The xml declaration of the file.
xml_declaration = None
# What was the signature of the last start element
start = None
# if we are currently in the process of changing file
ending = False
def attrs_s(attrs):
""" This generate the XML attributes from an element attribute list """
l = ['']
for i in range(0,len(attrs), 2):
l.append('%s="%s"' % (attrs[i], escape(attrs[i+1])))
return ' '.join(l)
def next_file():
""" This makes the decision to cut the current file and starta new one """
global cur_size, ending
if (not ending) and (cur_size > MAX_SIZE):
# size above threshold, and not already ending
global cur_file, cur_idx
print "part %d Done" % cur_idx
ending = True
# Close the current elements
for elem in reversed(path):
end_element(elem[0])
# Close the file
cur_file.close()
# reset the size
cur_size = 0
# Open another file
cur_idx += 1
cur_file = open(os.path.join(out_dir, root + FMT % cur_idx + ext),
'wt')
if xml_declaration is not None:
cur_file.write('<?xml%s?>\n' % attrs_s(xml_declaration))
# Start again where we stopped
for elem in path:
start_element(*elem)
# We are done 'ending'
ending = False
def xml_decl(version, encoding, standalone):
global xml_declaration
l = ['version', version, 'encoding', encoding]
if standalone != -1:
l.extend(['standalone', 'yes' if standalone else 'no'])
xml_declaration = l
cur_file.write('<?xml%s?>\n' % attrs_s(xml_declaration))
def start_element(name, attrs):
""" Called by the parser when he meet a start element """
global cur_size, start
if start is not None:
# Chaining starts after each others
cur_file.write('<%s%s>' % (start[0], attrs_s(start[1])))
start = (name, attrs)
if ending:
return
cur_size += len(name) + sum(len(k) for k in attrs)
path.append((name, attrs))
def end_element(name):
""" Caled by the parser when he meet an end element """
global cur_size
global start
if start is not None:
# Empty element, good, we did not wrote the start part
cur_file.write('<%s%s/>' % (start[0],attrs_s(start[1])))
else:
# There was some data, close it normaly
cur_file.write('</%s>' % name)
start = None
if ending:
return
elem = path.pop()
assert elem[0] == name
cur_size += len(name)
next_file()
def char_data(data):
""" Called by the parser when he meet data """
global cur_size, start
wroteStart = False
if start is not None:
# The data belong to an element, we should write the start part first
cur_file.write('<%s%s>' % (start[0], attrs_s(start[1])))
start = None
wroteStart = True
# ``escape`` is too much for us, only & and < ned to be escaped there ...
data = data.replace('&', '&amp;')
data = data.replace('<', '&lt;')
if data == '>':
data = '&gt;'
cur_file.write(data.encode('utf-8'))
cur_size += len(data)
if not wroteStart:
# The data was outside of an element, it could be the right moment to
# make the split
next_file()
def main(filename, output_dir):
# Create a parser
p = xml.parsers.expat.ParserCreate()
# We want to reproduce the input, so we are interested in the order of the
# attributess
p.ordered_attributes = 1
# Set our callbacks (we are stripping comments out by not defining
# callbacks for them)
p.XmlDeclHandler = xml_decl
p.StartElementHandler = start_element
p.EndElementHandler = end_element
p.CharacterDataHandler = char_data
global cur_file, cur_idx
global out_dir, root, ext
global FMT
FMT = ".%%0%dd" % (int(log10(os.path.getsize(filename) / MAX_SIZE)) + 1)
out_dir, filename = os.path.split(filename)
if output_dir is not None:
out_dir = output_dir
root, ext = os.path.splitext(filename)
cur_file = open(os.path.join(out_dir, root + FMT % cur_idx + ext), 'wt')
with open(filename, 'rt') as xml_file:
while True:
# Read a chunk
chunk = xml_file.read(CHUNK_SIZE)
if len(chunk) < CHUNK_SIZE:
# End of file
# tell the parser we're done
p.Parse(chunk, 1)
# exit the loop
break
# process the chunk
p.Parse(chunk)
# Don't forget to close our handle
cur_file.close()
print "part %d Done" % cur_idx
if __name__ == "__main__":
parser = OptionParser(usage="usage: %prog [options] XML_FILE")
parser.add_option("-o", "--output-dir",
help="Specify the directory where the xml files will be written" \
"(default to the same directory where the original file is)")
parser.add_option("-M", "--max_size", type="int",
help="Specify the size at which the files should be split (in Kb)")
(options, args) = parser.parse_args()
if len(args) != 1:
parser.error("incorrect number of arguments")
if options.max_size is not None:
MAX_SIZE = options.max_size * 1024
main(args[0], options.output_dir)
@scnctech
Copy link

Thanks! this worked great and saved me quite a bit of time!
Just had to make some changes for my particular encoding needs and file sizes. I'm linking to the one with my changes in case anybody finds it useful.
https://gist.github.com/scnctech/19cd6801f933e85c3c2c58048852b0de

@sobaee
Copy link

sobaee commented Aug 3, 2020

Great job, but It gives me this:

File "xml_split.py", line 56
print "part %d Done" % cur_idx ^
SyntaxError: Missing parentheses in call to 'print'.
Did you mean print("part %d Done" % cur_idx)?

Help please?!

@scnctech
Copy link

scnctech commented Aug 3, 2020

try my version and let me know how it goes: here

@sobaee
Copy link

sobaee commented Aug 3, 2020

@scnctech
This man (scnctech) has resolved all these errors with his copy of this script.
It has worked perfectly in my case.
He saved my day.
Thanks alot

This is the command I used after I get to the directory (cd command) which contain both xml_split.py and my xml file (arwiktionary.xml) :
python xml_split.py arwiktionary.xml -M 40000

The output files are:
arwiktionary.0.xml
arwiktionary.1.xml

Thanks again

@sv-hmelevsky
Copy link

me know how it goes: here

This solution is work perfectly! No problem with big file chunks and encoding!

@sureshpnuri
Copy link

sureshpnuri commented Dec 16, 2023

Didn't work for me. For sample xml file below, my expectation is that it should create 2 xml files and each file should have product node and one review comment node of the product. But the second file it created has product tag, but not the in it. So the second file created is not a valid xml file in my case due to missing Product\Name tag. However, Product opening and closing tags are created fine.

<?xml version='1.0' encoding='utf-8'?>
<root>
	<product>
		<name>Product1</name>
		<review>
			<id>1</id>
			<comment>Magnam ipsum dolore dolorem dolorem sed adipisci sed. Dolore adipisci neque quisquam. Dolor labore dolore porro magnam. Ipsum ipsum ut neque aliquam eius labore porro. Dolor sit etincidunt sit ipsum tempora porro. Quaerat consectetur ut magnam adipisci velit.</comment>
		</review>
		<review>
			<id>2</id>
			<comment>Dolor velit etincidunt eius labore consectetur. Quaerat consectetur dolor quisquam modi adipisci numquam tempora. Est sit neque quisquam labore modi quisquam magnam. Dolorem tempora dolore neque dolorem. Est magnam voluptatem etincidunt dolor eius quaerat velit.</comment>
		</review>
	</product>
</root>

Output I got.

file1

<?xml version='1.0' encoding='utf-8'?>
<root>
	<product>
		<name>Product1</name>
		<review>
			<id>1</id>
			<comment>Magnam ipsum dolore dolorem dolorem sed adipisci sed. Dolore adipisci neque quisquam. Dolor labore dolore porro magnam. Ipsum ipsum ut neque aliquam eius labore porro. Dolor sit etincidunt sit ipsum tempora porro. Quaerat consectetur ut magnam adipisci velit.</comment>
		</review>
	</product>
</root>

file2 (missing Product\Name

<?xml version='1.0' encoding='utf-8'?>
<root>
	<product>
		<review>
			<id>2</id>
			<comment>Dolor velit etincidunt eius labore consectetur. Quaerat consectetur dolor quisquam modi adipisci numquam tempora. Est sit neque quisquam labore modi quisquam magnam. Dolorem tempora dolore neque dolorem. Est magnam voluptatem etincidunt dolor eius quaerat velit.</comment>
		</review>
	</product>
</root>

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment