# uogbuji/marc-split-memprof.py

## marc-split-memprof.py
#!/usr/bin/env python
'''
Split MARC/XML file

time python -m memory_profiler scratch/marc-split-memprof.py -c 1000 -o /tmp/msplit/split largefile.mrx
'''

import sys
import argparse

import xml.parsers.expat
from xml.sax.saxutils import escape #also quoteattr?
from amara3.uxml import tree, writer, xml as amxml
from amara3.util import coroutine

from memory_profiler import profile

@profile
def run(infile=None, outfstem=None, count=1000):
    '''
    Split a MARC/XML file into a numbered series of smaller files

    infile -- open file object with the MARC/XML source; handed to the tree
        sequencer's parse_file
    outfstem -- filename stem for output; files are written as
        outfstem + '1.mrx', outfstem + '2.mrx', etc.
    count -- maximum number of elements written to each output file

    Every element the tree sequencer delivers for the ('collection', 'record')
    pattern is written inside a collection wrapper element. Output files are
    opened lazily, when there is an element to put in them, so an input without
    matching elements produces no output and no empty trailing file is left
    when the total is an exact multiple of count.

    As a memory diagnostic, the reference count & child count of the
    previously handled element are printed to stderr before the next element
    is awaited.
    '''
    a_writer = None
    sequencer = None
    fp = None

    def open_next(filecount):
        #Open output file number filecount + 1, start its collection wrapper
        #& return the updated file count
        #fp & a_writer must be rebound in run's scope, otherwise the file
        #object is lost & can never be closed explicitly
        nonlocal a_writer, fp
        fp = open(outfstem + str(filecount + 1) + '.mrx', 'w')
        a_writer = writer.namespacer(fp, prefixes=sequencer.handler.prefixes, mapping=sequencer.handler.ns_portfolio)
        a_writer.start_element('collection')
        return filecount + 1

    def finish_file():
        #End the collection wrapper & close the current output file, if any
        nonlocal a_writer, fp
        try:
            if a_writer is not None:
                a_writer.end_element('collection')
        finally:
            a_writer = None
            if fp is not None:
                fp.close()
                fp = None

    @coroutine
    def sink():
        filecount = 0
        ix = 0
        old_e = None
        while True:
            if old_e is not None:
                print('{} ({})'.format(sys.getrefcount(old_e), len(old_e.xml_children)), end=' / ', file=sys.stderr, flush=True)
            e = yield
            if a_writer is None:
                filecount = open_next(filecount)
                ix = 0
            writer.write(e, a_writer)
            ix += 1
            if ix == count:
                #File is full; the next element, if any, opens a new one
                finish_file()
            #Deliberately keep the element alive for the diagnostic above
            old_e = e

    ts = amxml.treesequence(('collection', 'record'), sink(), callbacks=amxml.ns_expat_callbacks)
    #sink only looks at sequencer once the first element arrives
    sequencer = ts
    try:
        ts.parse_file(infile)
    finally:
        #Complete & close a partially filled file, even after a parse error
        finish_file()
    return


if __name__ == '__main__':
    #Command line entry point: parse the arguments & split the given file
    #python -m nlbld.rdfsplit -o /tmp/split/ /tmp/spam.mrx
    parser = argparse.ArgumentParser(prog="marc-splitter")
    parser.add_argument('input', type=argparse.FileType('rb'), metavar='FILE', nargs=1,
                        help='MARC/XML file to be split')
    #Required because run() concatenates the stem with each file number;
    #leaving it out used to fail later with a TypeError on None
    parser.add_argument('-o', '--out', required=True,
        help='filename stem for files written out. e.g. if "dir/out" files will be written as "dir/out1.mrx", "dir/out2.mrx", etc.')
    parser.add_argument('-c', '--count', metavar="NUMBER", type=int, default=1000,
        help='maximum number of top-level resource elements to include per output file')
    args = parser.parse_args()

    #The with block closes the input file even if splitting fails part way
    with args.input[0] as infile:
        run(infile=infile, outfstem=args.out, count=args.count)

## marc-split-objgraph.py
#!/usr/bin/env python
'''
Split MARC/XML file

python scratch/marc-split-objgraph.py -c 1 -o /tmp/msplit/z-split test/resource/gmu-10.marc.xml
'''

import sys
import argparse

import xml.parsers.expat
from xml.sax.saxutils import escape #also quoteattr?
from amara3.uxml import tree, writer, xml as amxml
from amara3.util import coroutine

import objgraph

def run(infile=None, outfstem=None, count=1000):
    '''
    Split a MARC/XML file into a numbered series of smaller files

    infile -- open file object with the MARC/XML source; handed to the tree
        sequencer's parse_file
    outfstem -- filename stem for output; files are written as
        outfstem + '1.mrx', outfstem + '2.mrx', etc.
    count -- maximum number of elements written to each output file

    Every element the tree sequencer delivers for the ('collection', 'record')
    pattern is written inside a collection wrapper element. Output files are
    opened lazily, when there is an element to put in them, so an input without
    matching elements produces no output and no empty trailing file is left
    when the total is an exact multiple of count.

    As a memory diagnostic, an objgraph back-reference graph of the first
    handled element is written once to /tmp/z-split.png, just before the
    second element is awaited.
    '''
    a_writer = None
    sequencer = None
    fp = None

    def open_next(filecount):
        #Open output file number filecount + 1, start its collection wrapper
        #& return the updated file count
        #fp & a_writer must be rebound in run's scope, otherwise the file
        #object is lost & can never be closed explicitly
        nonlocal a_writer, fp
        fp = open(outfstem + str(filecount + 1) + '.mrx', 'w')
        a_writer = writer.namespacer(fp, prefixes=sequencer.handler.prefixes, mapping=sequencer.handler.ns_portfolio)
        a_writer.start_element('collection')
        return filecount + 1

    def finish_file():
        #End the collection wrapper & close the current output file, if any
        nonlocal a_writer, fp
        try:
            if a_writer is not None:
                a_writer.end_element('collection')
        finally:
            a_writer = None
            if fp is not None:
                fp.close()
                fp = None

    @coroutine
    def sink():
        filecount = 0
        ix = 0
        old_e = None
        graph_printed = False
        while True:
            if old_e is not None and not graph_printed:
                #One graph is enough; drawing it is slow
                objgraph.show_backrefs([old_e], filename='/tmp/z-split.png')
                graph_printed = True
            e = yield
            if a_writer is None:
                filecount = open_next(filecount)
                ix = 0
            writer.write(e, a_writer)
            ix += 1
            if ix == count:
                #File is full; the next element, if any, opens a new one
                finish_file()
            #Deliberately keep the element alive for the diagnostic above
            old_e = e

    ts = amxml.treesequence(('collection', 'record'), sink(), callbacks=amxml.ns_expat_callbacks)
    #sink only looks at sequencer once the first element arrives
    sequencer = ts
    try:
        ts.parse_file(infile)
    finally:
        #Complete & close a partially filled file, even after a parse error
        finish_file()
    return


if __name__ == '__main__':
    #Command line entry point: parse the arguments & split the given file
    #python -m nlbld.rdfsplit -o /tmp/split/ /tmp/spam.mrx
    parser = argparse.ArgumentParser(prog="marc-splitter")
    parser.add_argument('input', type=argparse.FileType('rb'), metavar='FILE', nargs=1,
                        help='MARC/XML file to be split')
    #Required because run() concatenates the stem with each file number;
    #leaving it out used to fail later with a TypeError on None
    parser.add_argument('-o', '--out', required=True,
        help='filename stem for files written out. e.g. if "dir/out" files will be written as "dir/out1.mrx", "dir/out2.mrx", etc.')
    parser.add_argument('-c', '--count', metavar="NUMBER", type=int, default=1000,
        help='maximum number of top-level resource elements to include per output file')
    args = parser.parse_args()

    #The with block closes the input file even if splitting fails part way
    with args.input[0] as infile:
        run(infile=infile, outfstem=args.out, count=args.count)
	#!/usr/bin/env python
	'''
	Split MARC/XML file

	time python -m memory_profiler scratch/marc-split-memprof.py -c 1000 -o /tmp/msplit/split largefile.mrx
	'''

	import sys
	import argparse

	import xml.parsers.expat
	from xml.sax.saxutils import escape #also quoteattr?
	from amara3.uxml import tree, writer, xml as amxml
	from amara3.util import coroutine

	from memory_profiler import profile

	@profile
	def run(infile=None, outfstem=None, count=1000):
	    '''
	    Split a MARC/XML file into a numbered series of smaller files

	    infile -- open file object with the MARC/XML source; handed to the tree
	        sequencer's parse_file
	    outfstem -- filename stem for output; files are written as
	        outfstem + '1.mrx', outfstem + '2.mrx', etc.
	    count -- maximum number of elements written to each output file

	    Every element the tree sequencer delivers for the ('collection', 'record')
	    pattern is written inside a collection wrapper element. Output files are
	    opened lazily, when there is an element to put in them, so an input without
	    matching elements produces no output and no empty trailing file is left
	    when the total is an exact multiple of count.

	    As a memory diagnostic, the reference count & child count of the
	    previously handled element are printed to stderr before the next element
	    is awaited.
	    '''
	    a_writer = None
	    sequencer = None
	    fp = None

	    def open_next(filecount):
	        #Open output file number filecount + 1, start its collection wrapper
	        #& return the updated file count
	        #fp & a_writer must be rebound in run's scope, otherwise the file
	        #object is lost & can never be closed explicitly
	        nonlocal a_writer, fp
	        fp = open(outfstem + str(filecount + 1) + '.mrx', 'w')
	        a_writer = writer.namespacer(fp, prefixes=sequencer.handler.prefixes, mapping=sequencer.handler.ns_portfolio)
	        a_writer.start_element('collection')
	        return filecount + 1

	    def finish_file():
	        #End the collection wrapper & close the current output file, if any
	        nonlocal a_writer, fp
	        try:
	            if a_writer is not None:
	                a_writer.end_element('collection')
	        finally:
	            a_writer = None
	            if fp is not None:
	                fp.close()
	                fp = None

	    @coroutine
	    def sink():
	        filecount = 0
	        ix = 0
	        old_e = None
	        while True:
	            if old_e is not None:
	                print('{} ({})'.format(sys.getrefcount(old_e), len(old_e.xml_children)), end=' / ', file=sys.stderr, flush=True)
	            e = yield
	            if a_writer is None:
	                filecount = open_next(filecount)
	                ix = 0
	            writer.write(e, a_writer)
	            ix += 1
	            if ix == count:
	                #File is full; the next element, if any, opens a new one
	                finish_file()
	            #Deliberately keep the element alive for the diagnostic above
	            old_e = e

	    ts = amxml.treesequence(('collection', 'record'), sink(), callbacks=amxml.ns_expat_callbacks)
	    #sink only looks at sequencer once the first element arrives
	    sequencer = ts
	    try:
	        ts.parse_file(infile)
	    finally:
	        #Complete & close a partially filled file, even after a parse error
	        finish_file()
	    return


	if __name__ == '__main__':
	    #Command line entry point: parse the arguments & split the given file
	    #python -m nlbld.rdfsplit -o /tmp/split/ /tmp/spam.mrx
	    parser = argparse.ArgumentParser(prog="marc-splitter")
	    parser.add_argument('input', type=argparse.FileType('rb'), metavar='FILE', nargs=1,
	                        help='MARC/XML file to be split')
	    #Required because run() concatenates the stem with each file number;
	    #leaving it out used to fail later with a TypeError on None
	    parser.add_argument('-o', '--out', required=True,
	        help='filename stem for files written out. e.g. if "dir/out" files will be written as "dir/out1.mrx", "dir/out2.mrx", etc.')
	    parser.add_argument('-c', '--count', metavar="NUMBER", type=int, default=1000,
	        help='maximum number of top-level resource elements to include per output file')
	    args = parser.parse_args()

	    #The with block closes the input file even if splitting fails part way
	    with args.input[0] as infile:
	        run(infile=infile, outfstem=args.out, count=args.count)
	#!/usr/bin/env python
	'''
	Split MARC/XML file

	python scratch/marc-split-objgraph.py -c 1 -o /tmp/msplit/z-split test/resource/gmu-10.marc.xml
	'''

	import sys
	import argparse

	import xml.parsers.expat
	from xml.sax.saxutils import escape #also quoteattr?
	from amara3.uxml import tree, writer, xml as amxml
	from amara3.util import coroutine

	import objgraph

	def run(infile=None, outfstem=None, count=1000):
	    '''
	    Split a MARC/XML file into a numbered series of smaller files

	    infile -- open file object with the MARC/XML source; handed to the tree
	        sequencer's parse_file
	    outfstem -- filename stem for output; files are written as
	        outfstem + '1.mrx', outfstem + '2.mrx', etc.
	    count -- maximum number of elements written to each output file

	    Every element the tree sequencer delivers for the ('collection', 'record')
	    pattern is written inside a collection wrapper element. Output files are
	    opened lazily, when there is an element to put in them, so an input without
	    matching elements produces no output and no empty trailing file is left
	    when the total is an exact multiple of count.

	    As a memory diagnostic, an objgraph back-reference graph of the first
	    handled element is written once to /tmp/z-split.png, just before the
	    second element is awaited.
	    '''
	    a_writer = None
	    sequencer = None
	    fp = None

	    def open_next(filecount):
	        #Open output file number filecount + 1, start its collection wrapper
	        #& return the updated file count
	        #fp & a_writer must be rebound in run's scope, otherwise the file
	        #object is lost & can never be closed explicitly
	        nonlocal a_writer, fp
	        fp = open(outfstem + str(filecount + 1) + '.mrx', 'w')
	        a_writer = writer.namespacer(fp, prefixes=sequencer.handler.prefixes, mapping=sequencer.handler.ns_portfolio)
	        a_writer.start_element('collection')
	        return filecount + 1

	    def finish_file():
	        #End the collection wrapper & close the current output file, if any
	        nonlocal a_writer, fp
	        try:
	            if a_writer is not None:
	                a_writer.end_element('collection')
	        finally:
	            a_writer = None
	            if fp is not None:
	                fp.close()
	                fp = None

	    @coroutine
	    def sink():
	        filecount = 0
	        ix = 0
	        old_e = None
	        graph_printed = False
	        while True:
	            if old_e is not None and not graph_printed:
	                #One graph is enough; drawing it is slow
	                objgraph.show_backrefs([old_e], filename='/tmp/z-split.png')
	                graph_printed = True
	            e = yield
	            if a_writer is None:
	                filecount = open_next(filecount)
	                ix = 0
	            writer.write(e, a_writer)
	            ix += 1
	            if ix == count:
	                #File is full; the next element, if any, opens a new one
	                finish_file()
	            #Deliberately keep the element alive for the diagnostic above
	            old_e = e

	    ts = amxml.treesequence(('collection', 'record'), sink(), callbacks=amxml.ns_expat_callbacks)
	    #sink only looks at sequencer once the first element arrives
	    sequencer = ts
	    try:
	        ts.parse_file(infile)
	    finally:
	        #Complete & close a partially filled file, even after a parse error
	        finish_file()
	    return


	if __name__ == '__main__':
	    #Command line entry point: parse the arguments & split the given file
	    #python -m nlbld.rdfsplit -o /tmp/split/ /tmp/spam.mrx
	    parser = argparse.ArgumentParser(prog="marc-splitter")
	    parser.add_argument('input', type=argparse.FileType('rb'), metavar='FILE', nargs=1,
	                        help='MARC/XML file to be split')
	    #Required because run() concatenates the stem with each file number;
	    #leaving it out used to fail later with a TypeError on None
	    parser.add_argument('-o', '--out', required=True,
	        help='filename stem for files written out. e.g. if "dir/out" files will be written as "dir/out1.mrx", "dir/out2.mrx", etc.')
	    parser.add_argument('-c', '--count', metavar="NUMBER", type=int, default=1000,
	        help='maximum number of top-level resource elements to include per output file')
	    args = parser.parse_args()

	    #The with block closes the input file even if splitting fails part way
	    with args.input[0] as infile:
	        run(infile=infile, outfstem=args.out, count=args.count)