Skip to content

Instantly share code, notes, and snippets.

@uogbuji
Last active August 29, 2015 14:20
Show Gist options
  • Save uogbuji/bccdb1e2fdbb7bb88459 to your computer and use it in GitHub Desktop.
Save uogbuji/bccdb1e2fdbb7bb88459 to your computer and use it in GitHub Desktop.
Split MARC/XML, used as memory leak test for amara.uxml.tree
#!/usr/bin/env python
'''
Split a MARC/XML file into numbered chunks, reporting element reference counts as a memory leak check.

Usage:
    time python -m memory_profiler scratch/marc-split-memprof.py -c 1000 -o /tmp/msplit/split largefile.mrx
'''
import sys
import argparse
import xml.parsers.expat
from xml.sax.saxutils import escape #also quoteattr?
from amara3.uxml import tree, writer, xml as amxml
from amara3.util import coroutine
from memory_profiler import profile
@profile
def run(infile=None, outfstem=None, count=1000):
    '''
    Split the elements of a MARC/XML file across numbered output files

    infile - open file object with the MARC/XML source; passed to the
        tree sequencer's parse_file
    outfstem - filename stem for the output; file N is opened as
        outfstem + str(N) + '.mrx'
    count - number of elements written to an output file before the next
        one is started

    Each element the sequencer sends for the ('collection', 'record') pattern
    is written inside a 'collection' wrapper element. After each one, the
    reference count & child count of the previously handled element are
    printed to stderr as a memory leak diagnostic.
    '''
    a_writer = None
    sequencer = None
    fp = None

    @coroutine
    def sink():
        nonlocal a_writer, sequencer, fp
        filecount = 0
        ix = 0

        def new_file(filecount):
            # Rebind run()'s fp rather than a throwaway local, so the file
            # currently being written can actually be closed later
            nonlocal fp
            fp = open(outfstem + str(filecount + 1) + '.mrx', 'w')
            new_writer = writer.namespacer(fp, prefixes=sequencer.handler.prefixes, mapping=sequencer.handler.ns_portfolio)
            new_writer.start_element('collection')
            return new_writer, filecount + 1

        old_e = None
        while True:
            if old_e is not None:
                # Leak probe: refcount & child count of the element handled on the previous pass
                print('{} ({})'.format(sys.getrefcount(old_e), len(old_e.xml_children)), end=' / ', file=sys.stderr, flush=True)
            e = yield
            if a_writer is None:
                # First element: sequencer has been assigned by now, so its
                # namespace info is available for the writer
                a_writer, filecount = new_file(filecount)
                ix = 0
            writer.write(e, a_writer)
            ix += 1
            if ix == count:
                # Current file is full; finish it and roll over to the next
                a_writer.end_element('collection')
                fp.close()
                a_writer, filecount = new_file(filecount)
                ix = 0
            old_e = e

    ts = amxml.treesequence(('collection', 'record'), sink(), callbacks=amxml.ns_expat_callbacks)
    sequencer = ts
    try:
        ts.parse_file(infile)
        # a_writer is still None if the sink never received an element
        if a_writer is not None:
            a_writer.end_element('collection')
    finally:
        # Close the last output file, even if parsing failed part way
        if fp is not None:
            fp.close()
if __name__ == '__main__':
    # Command line entry point: parse the options, then split the input file
    parser = argparse.ArgumentParser(prog="marc-splitter")
    parser.add_argument('input', type=argparse.FileType('rb'), metavar='FILE', nargs=1,
        help='MARC/XML file to be split')
    # Required because run() builds every output filename from this stem;
    # leaving it out used to surface as a TypeError inside run()
    parser.add_argument('-o', '--out', required=True,
        help='filename stem for files written out. e.g. if "dir/out" files will be written as "dir/out1.mrx", "dir/out2.mrx", etc.')
    parser.add_argument('-c', '--count', metavar="NUMBER", type=int, default=1000,
        help='maximum number of top-level resource elements to include per output file')
    args = parser.parse_args()
    # nargs=1 makes args.input a one-element list
    infile = args.input[0]
    try:
        run(infile=infile, outfstem=args.out, count=args.count)
    finally:
        # Release the input file even when the split fails
        infile.close()
#!/usr/bin/env python
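'''
Split a MARC/XML file into numbered chunks, drawing an objgraph back-reference graph of a processed element as a memory leak check.

Usage:
    python scratch/marc-split-memprof.py -c 1 -o /tmp/msplit/z-split test/resource/gmu-10.marc.xml
'''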
import sys
import argparse
import xml.parsers.expat
from xml.sax.saxutils import escape #also quoteattr?
from amara3.uxml import tree, writer, xml as amxml
from amara3.util import coroutine
import objgraph
def run(infile=None, outfstem=None, count=1000):
    '''
    Split the elements of a MARC/XML file across numbered output files

    infile - open file object with the MARC/XML source; passed to the
        tree sequencer's parse_file
    outfstem - filename stem for the output; file N is opened as
        outfstem + str(N) + '.mrx'
    count - number of elements written to an output file before the next
        one is started

    Each element the sequencer sends for the ('collection', 'record') pattern
    is written inside a 'collection' wrapper element. Once, after the first
    element has been handled, objgraph is asked to render that element's
    back-references to /tmp/z-split.png as a memory leak diagnostic.
    '''
    a_writer = None
    sequencer = None
    fp = None

    @coroutine
    def sink():
        nonlocal a_writer, sequencer, fp
        filecount = 0
        ix = 0

        def new_file(filecount):
            # Rebind run()'s fp rather than a throwaway local, so the file
            # currently being written can actually be closed later
            nonlocal fp
            fp = open(outfstem + str(filecount + 1) + '.mrx', 'w')
            new_writer = writer.namespacer(fp, prefixes=sequencer.handler.prefixes, mapping=sequencer.handler.ns_portfolio)
            new_writer.start_element('collection')
            return new_writer, filecount + 1

        old_e = None
        graph_printed = False
        while True:
            if old_e is not None and not graph_printed:
                # Leak probe: graph what still refers to the element handled on the previous pass
                objgraph.show_backrefs([old_e], filename='/tmp/z-split.png')
                graph_printed = True
            e = yield
            if a_writer is None:
                # First element: sequencer has been assigned by now, so its
                # namespace info is available for the writer
                a_writer, filecount = new_file(filecount)
                ix = 0
            writer.write(e, a_writer)
            ix += 1
            if ix == count:
                # Current file is full; finish it and roll over to the next
                a_writer.end_element('collection')
                fp.close()
                a_writer, filecount = new_file(filecount)
                ix = 0
            old_e = e

    ts = amxml.treesequence(('collection', 'record'), sink(), callbacks=amxml.ns_expat_callbacks)
    sequencer = ts
    try:
        ts.parse_file(infile)
        # a_writer is still None if the sink never received an element
        if a_writer is not None:
            a_writer.end_element('collection')
    finally:
        # Close the last output file, even if parsing failed part way
        if fp is not None:
            fp.close()
if __name__ == '__main__':
    # Command line entry point: parse the options, then split the input file
    parser = argparse.ArgumentParser(prog="marc-splitter")
    parser.add_argument('input', type=argparse.FileType('rb'), metavar='FILE', nargs=1,
        help='MARC/XML file to be split')
    # Required because run() builds every output filename from this stem;
    # leaving it out used to surface as a TypeError inside run()
    parser.add_argument('-o', '--out', required=True,
        help='filename stem for files written out. e.g. if "dir/out" files will be written as "dir/out1.mrx", "dir/out2.mrx", etc.')
    parser.add_argument('-c', '--count', metavar="NUMBER", type=int, default=1000,
        help='maximum number of top-level resource elements to include per output file')
    args = parser.parse_args()
    # nargs=1 makes args.input a one-element list
    infile = args.input[0]
    try:
        run(infile=infile, outfstem=args.out, count=args.count)
    finally:
        # Release the input file even when the split fails
        infile.close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment