public
Last active

Writing a large XML file using SAX

  • Download Gist
large_xml.py
Python
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54
import argparse
import tempfile
import shutil
import subprocess
import lxml.etree
from lxml.builder import ElementMaker
from xml.sax import parse
from xml.sax.saxutils import XMLGenerator
from xml.sax.handler import ContentHandler
 
 
def generate_large_xml(f, size_mb):
    """Write a synthetic XML document of roughly ``size_mb`` MiB to ``f``.

    The document is a pretty-printed ``One/Two/Three`` wrapper whose
    innermost element contains ``1024 * size_mb`` identical ``Payload``
    elements. Each serialized payload plus its trailing newline is 1024
    bytes, so the body grows by one MiB per unit of ``size_mb``.

    Args:
        f: Binary file-like object; only ``f.write(bytes)`` is called.
        size_mb: Multiplier for the number of payload elements written.
            Zero or a negative value produces just the empty wrapper.
    """
    E = ElementMaker()

    # Serialize the wrapper with a <Four/> placeholder, then split around it
    # so the payloads can be streamed between the opening and closing halves
    # without ever building the full tree in memory.
    doc = E.One(E.Two(E.Three(E.Four())))
    doc_bytes = lxml.etree.tostring(doc, pretty_print=True)
    start, end = doc_bytes.split(b'<Four/>')

    # tostring() already yields ASCII bytes; keep them as bytes and build the
    # payload once instead of decoding and re-encoding it on every iteration.
    payload = E.Payload('big data here!!\n' * 62 + 'done ... ..\n')
    payload_bytes = lxml.etree.tostring(payload) + b'\n'

    f.write(start)
    for _ in range(1024 * size_mb):
        f.write(payload_bytes)
    f.write(end)
 
 
def main():
    """Generate a large XML file, stream it through a SAX handler, list results.

    Command-line arguments:
        size_mb: Integer size multiplier passed to ``generate_large_xml``.
        --no-write: Parse with a no-op ``ContentHandler`` instead of
            re-serializing through ``XMLGenerator``; ``out.xml`` is still
            created but stays empty.

    All files live in a fresh temporary directory that is removed on exit,
    whether or not the run succeeds.
    """
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('size_mb', type=int)
    arg_parser.add_argument('--no-write', action='store_true')
    options = arg_parser.parse_args()

    workdir = tempfile.mkdtemp()
    source_path = workdir + '/large.xml'
    result_path = workdir + '/out.xml'

    try:
        # Step 1: produce the input document.
        with open(source_path, 'wb') as generated:
            generate_large_xml(generated, options.size_mb)

        # Step 2: SAX-parse it, either discarding events or echoing them
        # back out as XML.
        with open(source_path, 'rb') as in_file, \
                open(result_path, 'wb') as out_file:
            handler = (
                ContentHandler()
                if options.no_write
                else XMLGenerator(out_file, encoding='utf-8')
            )
            parse(in_file, handler)

        # Step 3: show the resulting file sizes once both files are closed.
        subprocess.check_call(['ls', '-l', workdir])
    finally:
        shutil.rmtree(workdir)
 
 
# Run the benchmark only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
shell.txt
1 2 3 4 5 6 7 8 9 10 11
$ time python large_xml.py 10
total 20488
-rw-r--r-- 1 alexm staff 10485822 Oct 21 21:41 large.xml
-rw-r--r-- 1 alexm staff 10485860 Oct 21 21:42 out.xml
python large_xml.py 10 7.19s user 0.07s system 99% cpu 7.277 total
 
$ time python large_xml.py 10 --no-write
total 10244
-rw-r--r-- 1 alexm staff 10485822 Oct 21 21:42 large.xml
-rw-r--r-- 1 alexm staff 0 Oct 21 21:42 out.xml
python large_xml.py 10 --no-write 0.61s user 0.04s system 98% cpu 0.660 total

Please sign in to comment on this gist.

Something went wrong with that request. Please try again.