-
-
Save benallard/8042835 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python | |
import os | |
import xml.parsers.expat | |
from xml.sax.saxutils import escape | |
from optparse import OptionParser | |
from math import log10 | |
# How much data we read from the input and hand to the parser at a time
CHUNK_SIZE = 1024 * 1024

# The sequence of (name, attrs) elements leading us to the current one
path = []

# Running size counter for the current output file (names + character data)
cur_size = 0

# Threshold above which we start another output file
MAX_SIZE = 1024 * 1024  # 1 MiB

# The index of the output file currently being written
cur_idx = 0

# The current file handle we are writing to
cur_file = None

# The format string used to insert the index in the output file name
# (``main`` replaces it with a zero-padded variant)
FMT = ".%d"

# The pieces of the output file name: directory, base name and extension
out_dir = None
root = None
ext = None

# The XML declaration of the input, as a flat [name, value, ...] list,
# remembered so that it can be repeated at the top of every output file
xml_declaration = None

# The (name, attrs) signature of the last start element that has not been
# written out yet, or None when there is no pending start tag
start = None

# True while we are in the process of switching to another output file
ending = False
def attrs_s(attrs):
    """Render a flat attribute list as the attribute part of an XML tag.

    ``attrs`` is an ordered, flat sequence ``[name1, value1, name2, ...]``
    (the shape produced by expat when ``ordered_attributes`` is set).

    Returns a string in which every attribute is preceded by one space, e.g.
    ``' a="1" b="2"'``, or the empty string when there are no attributes.

    Values are escaped for use inside double quotes: ``&``, ``<``, ``>`` and
    ``"`` are replaced by their entities.  Pairs whose value is ``None`` are
    skipped instead of crashing in ``escape``.
    """
    # The leading empty string makes ``join`` emit the separating space
    # in front of the first attribute.
    parts = ['']
    for name, value in zip(attrs[::2], attrs[1::2]):
        if value is None:
            continue
        # ``escape`` alone leaves '"' untouched, which would terminate the
        # attribute value early in the generated markup.
        parts.append('%s="%s"' % (name, escape(value, {'"': '&quot;'})))
    return ' '.join(parts)
def next_file():
    """Cut the current output file and start a new one when it is due.

    Does nothing unless ``cur_size`` exceeds ``MAX_SIZE`` and no switch is
    already in progress.  Otherwise it closes every element still open in
    ``path``, closes the current file, opens the next numbered file, repeats
    the XML declaration (if one was seen) and re-opens the elements of
    ``path`` so that parsing can continue where it stopped.
    """
    global cur_size, ending, cur_file, cur_idx
    if ending or cur_size <= MAX_SIZE:
        return
    if not path:
        # No element is open any more (the root was just closed): switching
        # now would only create a file holding nothing but the declaration.
        return

    print("part %d Done" % cur_idx)
    # While ``ending`` is set, the element handlers only write markup; they
    # neither touch ``path`` nor the size counter, nor recurse into here.
    ending = True

    # Close the current elements, innermost first
    for elem in reversed(path):
        end_element(elem[0])
    cur_file.close()

    # Reset the size and open another file
    cur_size = 0
    cur_idx += 1
    cur_file = open(os.path.join(out_dir, root + FMT % cur_idx + ext),
                    'wt')
    if xml_declaration is not None:
        cur_file.write('<?xml%s?>\n' % attrs_s(xml_declaration))

    # Start again where we stopped, outermost first.  The innermost start
    # tag stays pending in ``start`` until the next parser event.
    for elem in path:
        start_element(*elem)

    # We are done 'ending'
    ending = False
def xml_decl(version, encoding, standalone):
    """Called by the parser when it meets the XML declaration.

    Stores the declaration in ``xml_declaration`` (so that it can be
    repeated in every output file) and writes it to the current file.

    ``encoding`` is what the source document declared; it may be ``None``
    when the declaration carries no encoding.  It is deliberately not
    echoed: the character data of the output is always written as UTF-8,
    so that is the encoding the output declares.
    ``standalone`` is ``-1`` when the declaration does not mention it.
    """
    global xml_declaration
    decl = ['version', version, 'encoding', 'UTF-8']
    if standalone != -1:
        decl.extend(['standalone', 'yes' if standalone else 'no'])
    xml_declaration = decl
    cur_file.write('<?xml%s?>\n' % attrs_s(xml_declaration))
def start_element(name, attrs):
    """Called by the parser when it meets a start element.

    The start tag is not written right away: it is kept in ``start`` until
    the next event tells whether the element is empty (``<a/>``) or not.
    A start tag still pending from the previous call is written out first.

    ``attrs`` is the flat ``[name, value, ...]`` attribute list.
    """
    global cur_size, start
    if start is not None:
        # Chaining starts after each other: flush the previous one.
        pending_name, pending_attrs = start
        tag = '<%s%s>' % (pending_name, attrs_s(pending_attrs))
        # Encode explicitly: writing non-ASCII unicode to the Python 2 file
        # handle would otherwise raise UnicodeEncodeError.
        cur_file.write(tag.encode('utf-8'))
    start = (name, attrs)
    if ending:
        # Replayed by next_file(): the element is already in ``path``.
        return
    cur_size += len(name) + sum(len(k) for k in attrs)
    path.append((name, attrs))
def end_element(name):
    """Called by the parser when it meets an end element.

    Writes either an empty-element tag (when the matching start tag is
    still pending in ``start``) or a regular closing tag, then gives
    ``next_file`` the opportunity to switch to another output file.
    """
    global cur_size
    global start
    if start is not None:
        # Empty element: the start part was never written.
        markup = '<%s%s/>' % (start[0], attrs_s(start[1]))
    else:
        # There was some content, close the element normally.
        markup = '</%s>' % name
    # Encode explicitly: writing non-ASCII unicode to the Python 2 file
    # handle would otherwise raise UnicodeEncodeError.
    cur_file.write(markup.encode('utf-8'))
    start = None
    if ending:
        # Called from next_file(): ``path`` must stay untouched so that the
        # same elements can be re-opened in the next file.
        return
    elem = path.pop()
    assert elem[0] == name
    cur_size += len(name)
    next_file()
def char_data(data):
    """Called by the parser when it meets character data.

    Flushes a pending start tag (the data belongs to that element), writes
    the data with the markup-significant characters escaped, and, when the
    data did not directly follow a start tag, gives ``next_file`` the
    opportunity to switch to another output file.
    """
    global cur_size, start
    wrote_start = False
    if start is not None:
        # The data belongs to an element: write the start part first.
        tag = '<%s%s>' % (start[0], attrs_s(start[1]))
        # Encode explicitly: writing non-ASCII unicode to the Python 2 file
        # handle would otherwise raise UnicodeEncodeError.
        cur_file.write(tag.encode('utf-8'))
        start = None
        wrote_start = True
    # ``escape`` is too much for us, only & and < need to be escaped here.
    # '&' must be handled first so the entities we add are not re-escaped.
    data = data.replace('&', '&amp;')
    data = data.replace('<', '&lt;')
    if data == '>':
        # A lone '>' is escaped as well; other '>' are left as they are.
        data = '&gt;'
    cur_file.write(data.encode('utf-8'))
    cur_size += len(data)
    if not wrote_start:
        # The data did not directly follow a start tag, it could be the
        # right moment to make the split.
        next_file()
def main(filename, output_dir):
    """Split the XML file ``filename`` into several smaller XML files.

    ``filename`` is the path of the file to split.  ``output_dir`` is the
    directory receiving the parts; when ``None`` they are written next to
    the input file.  The parts are named ``<root>.<index><ext>``, the index
    being zero-padded according to the expected number of parts.
    """
    global cur_file, cur_idx
    global out_dir, root, ext
    global FMT

    # Create a parser
    p = xml.parsers.expat.ParserCreate()
    # We want to reproduce the input, so we are interested in the order of
    # the attributes
    p.ordered_attributes = 1
    # Set our callbacks (we are stripping comments out by not defining
    # callbacks for them)
    p.XmlDeclHandler = xml_decl
    p.StartElementHandler = start_element
    p.EndElementHandler = end_element
    p.CharacterDataHandler = char_data

    # Width of the index: number of digits of the expected part count.
    # Counting digits (rather than log10) also works when the input is
    # smaller than MAX_SIZE, where log10(0) would raise ValueError.
    expected_parts = os.path.getsize(filename) // max(MAX_SIZE, 1)
    FMT = ".%%0%dd" % len(str(expected_parts))

    # Keep ``filename`` intact: it is still needed to open the input, which
    # may live outside the current directory.
    out_dir, base_name = os.path.split(filename)
    if output_dir is not None:
        out_dir = output_dir
    root, ext = os.path.splitext(base_name)

    cur_file = open(os.path.join(out_dir, root + FMT % cur_idx + ext), 'wt')
    try:
        # Feed the parser raw bytes; it takes care of the decoding itself.
        with open(filename, 'rb') as xml_file:
            while True:
                # Read a chunk
                chunk = xml_file.read(CHUNK_SIZE)
                if len(chunk) < CHUNK_SIZE:
                    # End of file: tell the parser we're done and leave
                    p.Parse(chunk, 1)
                    break
                # process the chunk
                p.Parse(chunk)
    finally:
        # Don't forget to close our handle, even when parsing failed.
        # (next_file() may have replaced it in the meantime.)
        cur_file.close()
    print("part %d Done" % cur_idx)
if __name__ == "__main__":
    # Command line entry point: parse the options, then split the file.
    parser = OptionParser(usage="usage: %prog [options] XML_FILE")
    parser.add_option("-o", "--output-dir",
            help="Specify the directory where the xml files will be written "
                 "(default to the same directory where the original file is)")
    parser.add_option("-M", "--max_size", type="int",
            help="Specify the size at which the files should be split (in Kb)")
    (options, args) = parser.parse_args()
    if len(args) != 1:
        parser.error("incorrect number of arguments")
    if options.max_size is not None:
        if options.max_size <= 0:
            # A zero or negative threshold cannot be used to size the parts.
            parser.error("max_size must be a positive number of Kb")
        # We are at module level here, so this rebinds the global MAX_SIZE
        # read by main() and next_file().
        MAX_SIZE = options.max_size * 1024
    main(args[0], options.output_dir)
Thank you so much!!! Works perfectly!!
I wasn't able to get it to work for a custom size, but the default size works just fine 😃
This saved me the trouble of writing another Java program.
On line 160 the filename variable gets overwritten, this causes the open(filename) to fail on line 168 when you point to a source file that is not in the current directory. Change the name for filename on line 140, 160 (2nd one) and 168 to fix this.
mhasan2k, hope you found a solution, if not try this https://gist.github.com/nicwolff/b4da6ec84ba9c23c8e59 which accepts the name of the element to split on, and the number of them to split at, and puts the declaration at the top of each file. And is a lot less code, since it just subclasses xml.sax.saxutils.XMLGenerator.
I'm sorry, I'm totally new to Python and would like to know how to run the script.
I have 3000 sitemaps in one folder and want to split them into smaller ones. Which command lines should I use?
I installed python on c drive then use this method
http://stackoverflow.com/questions/19779986/how-to-run-a-py-file-in-windows-command-line
but it give me error in line 56
In line 47, if attribute value is None
the xml saxutils will generate an AttributeError like AttributeError: 'NoneType' object has no attribute 'replace'
. I've fixed it by adding a check to the attribute value: https://gist.github.com/vittoriozamboni/936d37b761180107127a/revisions
Hi everyone.... I'm completely beginner to the python... so pls help me
When i run the above code in python 3.5.1 on windows, it is giving the following error... The code file is XMLSPLIT.py on the desktop and the XML file to be splitted in also on the desktop as dblp... i understood the error that i havent given any arguments ... so i am not getting how to give the dblp XML file as input to be splitted.... So kindly pls assist me... Thank u all
Usage: XMLSPLIT.py [options] XML_FILE
XMLSPLIT.py: error: incorrect number of arguments
I also got "incorrect number of arguments error".Please help.
Doesn't work with some UTF-8 characters in the XML ('UnicodeEncodeError: 'ascii' codec can't encode characters in position...')
Works great! Thanks a lot.
Doesn't work with some UTF-8 characters in the XML ('UnicodeEncodeError: 'ascii' codec can't encode characters in position...')
Same thing happened to me, I created a new encode function and made sure everything i passed to cur_file.write(
used it. Function below:
def encode(str): return str.encode('utf-8').strip()
I also got "incorrect number of arguments error".Please help.
python xml_split.py <file name to be split> -M <split file size kb> -o <output path>
This would have been great if it worked. I gave up trying to pass it a second option and just gave it the name of the large XML file, and still got an error:
Traceback (most recent call last):
File "xmlsplit.py", line 198, in <module>
main(args[0], options.output_dir)
File "xmlsplit.py", line 179, in main
p.Parse(chunk)
File "xmlsplit.py", line 106, in end_element
cur_file.write('<%s%s/>' % (start[0],attrs_s(start[1])))
UnicodeEncodeError: 'ascii' codec can't encode character u'\xb2' in position 573: ordinal not in range(128)
Since nobody is answering posts here, I'm going to delete it and move onto finding another way to deal with it. I don't have time to learn Python to fix it.
Works perfectly, thanks! To fix the ascii encoding errors I added:
cur_file.write(u'<{0}{1}/>'.format(
start[0].encode('ascii', 'ignore').decode('ascii'),
attrs_s(start[1]).encode('ascii', 'ignore').decode('ascii')
))
@spedy Could you please provide your code.
@spedy where did you add that code? can you provide your full source code? That would be very helpful.
Works perfectly, thanks! To fix the ascii encoding errors I added:
cur_file.write(u'<{0}{1}/>'.format(
start[0].encode('ascii', 'ignore').decode('ascii'),
attrs_s(start[1]).encode('ascii', 'ignore').decode('ascii')
))
works perfectly with this on line 106 instead of whats there now.
This works for me and save my time - thank you!
python split.py -o ./output/ -M 500000 output.xml
Thank you, this is great! Same comment as above about the encoding, but also changed in lines 92 and 106
Thanks! this worked great and saved me quite a bit of time!
Just had to make some changes for my particular encoding needs and file sizes. I'm linking to the one with my changes in case anybody finds it useful.
https://gist.github.com/scnctech/19cd6801f933e85c3c2c58048852b0de
Great job, but It gives me this:
File "xml_split.py", line 56
print "part %d Done" % cur_idx ^
SyntaxError: Missing parentheses in call to 'print'.
Did you mean print("part %d Done" % cur_idx)?
Help please?!
try my version and let me know how it goes: here
@scnctech
This man (scnctech) has resolved all these errors with his copy of this script.
It has worked perfectly in my case.
He saved my day.
Thanks alot
This is the command I used after I get to the directory (cd command) which contain both xml_split.py and my xml file (arwiktionary.xml) :
python xml_split.py arwiktionary.xml -M 40000
The output files are:
arwiktionary.0.xml
arwiktionary.1.xml
Thanks again
me know how it goes: here
This solution works perfectly! No problem with big file chunks and encoding!
Didn't work for me. For the sample XML file below, my expectation is that it should create 2 XML files, and each file should have the product node (including its name) and one review comment node of the product. But the second file it created has the <product> tag without the <name> element in it. So the second file is not a valid XML file in my case, due to the missing Product\Name tag. However, the Product opening and closing tags are created fine.
<?xml version='1.0' encoding='utf-8'?>
<root>
<product>
<name>Product1</name>
<review>
<id>1</id>
<comment>Magnam ipsum dolore dolorem dolorem sed adipisci sed. Dolore adipisci neque quisquam. Dolor labore dolore porro magnam. Ipsum ipsum ut neque aliquam eius labore porro. Dolor sit etincidunt sit ipsum tempora porro. Quaerat consectetur ut magnam adipisci velit.</comment>
</review>
<review>
<id>2</id>
<comment>Dolor velit etincidunt eius labore consectetur. Quaerat consectetur dolor quisquam modi adipisci numquam tempora. Est sit neque quisquam labore modi quisquam magnam. Dolorem tempora dolore neque dolorem. Est magnam voluptatem etincidunt dolor eius quaerat velit.</comment>
</review>
</product>
</root>
Output I got.
file1
<?xml version='1.0' encoding='utf-8'?>
<root>
<product>
<name>Product1</name>
<review>
<id>1</id>
<comment>Magnam ipsum dolore dolorem dolorem sed adipisci sed. Dolore adipisci neque quisquam. Dolor labore dolore porro magnam. Ipsum ipsum ut neque aliquam eius labore porro. Dolor sit etincidunt sit ipsum tempora porro. Quaerat consectetur ut magnam adipisci velit.</comment>
</review>
</product>
</root>
file2 (missing Product\Name)
<?xml version='1.0' encoding='utf-8'?>
<root>
<product>
<review>
<id>2</id>
<comment>Dolor velit etincidunt eius labore consectetur. Quaerat consectetur dolor quisquam modi adipisci numquam tempora. Est sit neque quisquam labore modi quisquam magnam. Dolorem tempora dolore neque dolorem. Est magnam voluptatem etincidunt dolor eius quaerat velit.</comment>
</review>
</product>
</root>
Awesome! This script worked fine without changing a single bit of code. But my requirement is a little bit different: I need to split a large XML file based on element count, for example 5000 <Document> elements per split file. I don't want to split the file somewhere in the middle of a <Document> element. I am very new to Python; would you please help me with this? I would really appreciate your help. Sample XML file:
Also, I would like to add the XML declaration at the top of every split file and close each one with the root <Documents> tags. Regards.