ionelmc/.gitignore

## .gitignore
*.cbor
*.log
*.xml
*.yaml
*.json
*.msgpack
*.html
*.marshal
*.pickle
.tox
.idea
.cache
*.pyc


## README.rst

      
    Raw
  

              README.rst
            
          
    A JSON parsing benchmark

Test code for http://blog.ionelmc.ro/2015/11/22/memory-use-and-speed-of-json-parsers/
Usage

To run:
tox
To run with a different dataset (example: canada.json):
tox -- canada.json
For a VERY big data set download (you need tons of RAM for that): https://github.com/zemirco/sf-city-lots-json/raw/master/citylots.json
Other datasets you can try: https://github.com/miloyip/nativejson-benchmark/tree/master/data
If you only want to compare a few implementations, run:
BM_IMPL='json simplejson msgpack' tox
On Windows (note that memory measurements are disabled there):
set BM_IMPL=json simplejson msgpack
tox
To bind to a different CPU (default is 0):
BM_CPU='1' tox
CPU binding helps a bit during speed tests, as code is single-threaded.
Measure memory differently: you can use ru_maxrss instead of valgrind (similar to what GNU /bin/time does with %M):
BM_MEM=maxrss tox
Implementations

Implementations that are tested by default:

cbor
cjson
cPickle
ijson
json
jsonlib
jsonlib2
jsonstreamer
msgpack
pickle
rapidjson
simplejson
ujson
yajl

Extra implementations not included in the default BM_IMPL:

marshal
pyjsmn
ruamel.yaml
yaml
lxml

To test everything plus lxml:
BM_IMPL_ADD='lxml' tox
To test just cbor and msgpack:
BM_IMPL='cbor msgpack' tox
Dependencies

pip install tox
Ubuntu

You might need to run:
apt-get install build-essential git-core python-dev python3.5-dev valgrind libyajl2 libyajl-dev
Python 3.5 is available from the Deadsnakes PPA:
apt-add-repository ppa:fkrull/deadsnakes

  
## conftest.py
def pytest_benchmark_generate_json(config, benchmarks, include_data):
    """Generate the benchmark JSON via pytest-benchmark with include_data off.

    Forwards ``config`` and ``benchmarks`` unchanged to the function of the
    same name in ``pytest_benchmark.plugin`` and returns its result.  The
    ``include_data`` argument received here is ignored: ``False`` is always
    passed on instead.
    """
    # Imported at call time, under an alias so it stays distinct from this
    # function's own (identical) name.
    from pytest_benchmark.plugin import pytest_benchmark_generate_json as stock_hook

    return stock_hook(benchmarks=benchmarks, config=config, include_data=False)

## data.py
import json
import sys

import dicttoxml
import marshal
import msgpack
import yaml
import cbor

from util import is_format_needed

try:
    import cPickle as pickle
except ImportError:
    import pickle

if __name__ == '__main__':
    # Pick the document to serialize: the JSON file named as the first
    # command-line argument, or (with no argument) a built-in payload whose
    # bulk is a single very long string.
    if len(sys.argv) <= 1:
        data = {
            "foo": [{
                "bar": [
                    'A"\\ :,;\n1' * 20000000,
                ],
                "b": [
                    1, 0.333, True,
                ],
                "c": None,
            }]
        }
    else:
        with open(sys.argv[1]) as fh:
            data = json.load(fh)

    # One row per output: (format name, output file, open() mode, serializer).
    # Each serializer takes the open file object and writes ``data`` to it.
    # Row order is the order in which the files are produced.
    outputs = [
        ('json', 'data.json', 'w',
         lambda out: json.dump(data, out)),
        ('yaml', 'data.yaml', 'w',
         lambda out: yaml.dump(data, out)),
        ('msgpack', 'data.msgpack', 'wb',
         lambda out: msgpack.dump(data, out, use_bin_type=True)),
        ('pickle', 'data.pickle', 'wb',
         lambda out: pickle.dump(data, out, protocol=pickle.HIGHEST_PROTOCOL)),
        ('marshal', 'data.marshal', 'wb',
         lambda out: marshal.dump(data, out)),
        ('xml', 'data.xml', 'wb',
         lambda out: out.write(dicttoxml.dicttoxml(data))),
        ('cbor', 'data.cbor', 'wb',
         lambda out: cbor.dump(data, out)),
    ]

    for fmt, path, mode, write in outputs:
        # is_format_needed() decides whether this format is generated at all.
        if not is_format_needed(fmt):
            continue
        with open(path, mode) as fh:
            print('Creating %s ...' % path)
            write(fh)

## memory.py
#!/usr/bin/env python
from __future__ import division

import argparse
import os
import sys

from util import IMPLEMENTATIONS
from util import get_loader
from util import open_data

# Memory-measurement backend, read from the BM_MEM environment variable and
# lower-cased; "valgrind" is used when the variable is not set.
backend = os.environ.get("BM_MEM", "valgrind").lower()


def run(impl):
    """Load the data for *impl* once, discarding the result.

    Reads everything from the object yielded by ``open_data(impl)``, closes
    it, then passes what was read to the callable returned by
    ``get_loader(impl)``.
    """
    with open_data(impl) as stream:
        payload = stream.read()

    # The loader is looked up only after the input has been read and closed.
    get_loader(impl)(payload)

# Command line: an optional positional implementation name (worker mode: load
# the data once with that implementation and exit) and an optional --save path
# that receives the collected results as JSON (driver mode).
parser = argparse.ArgumentParser()
parser.add_argument('--save')
parser.add_argument('impl', nargs="?")


def _measure_valgrind(impl):
    """Return the largest ``mem_heap_B`` value massif records for *impl*.

    Re-runs this script under ``valgrind --tool=massif`` with *impl* as its
    positional argument, then scans the massif output file for the highest
    ``mem_heap_B=`` snapshot value (0 if no such line is present).

    Raises ``subprocess.CalledProcessError`` if valgrind exits non-zero.
    """
    import subprocess

    massif_log = './massif-%s.log' % impl
    subprocess.check_call([
        # 'strace',
        # '-o',
        # './strace-%s.log' % impl,
        'valgrind',
        '--tool=massif',
        '--massif-out-file=%s' % massif_log,
        '--pages-as-heap=yes',
        '--heap=yes',
        '--threshold=0',
        '--max-snapshots=1000',
        '--peak-inaccuracy=0',
        sys.executable,
        __file__,
        impl
    ])
    peak = 0
    with open(massif_log) as fh:
        for line in fh:
            if line.startswith('mem_heap_B='):
                peak = max(peak, int(line.split('=')[-1]))
    return peak


def _measure_maxrss(impl):
    """Return the peak resident set size, in bytes, of ``run(impl)`` in a fork.

    The child process calls ``run(impl)`` and exits with status 0 on success
    or 5 on any failure; the parent reads the child's ``ru_maxrss`` from
    ``os.wait4``.

    Raises ``RuntimeError`` if the child did not exit cleanly.
    """
    pid = os.fork()
    if not pid:
        # Child process.  It must never fall back into the caller's loop, so
        # every path ends in os._exit().  Start from the failure code so that
        # anything other than a clean return (including KeyboardInterrupt and
        # other non-Exception errors) is reported as a failure.
        exit_code = 5
        try:
            run(impl)
            exit_code = 0
        except Exception:
            import traceback

            traceback.print_exc()
            # os._exit() skips stdio flushing; push the traceback out first.
            sys.stderr.flush()
        finally:
            os._exit(exit_code)

    _, status, usage = os.wait4(pid, 0)

    # getrusage reports ru_maxrss in kilobytes on Linux but in bytes on macOS.
    if sys.platform == 'darwin':
        peak_bytes = usage.ru_maxrss
    else:
        peak_bytes = usage.ru_maxrss * 1024

    if status:
        # `status` is the encoded wait status, not the exit code: decode it
        # (negative value = killed by that signal, as subprocess does).
        if os.WIFSIGNALED(status):
            exit_code = -os.WTERMSIG(status)
        else:
            exit_code = os.WEXITSTATUS(status)
        raise RuntimeError("Failed to run loader. Exit code: %s. Used: %s Mb" % (
            exit_code, peak_bytes / 1024 / 1024
        ))
    return peak_bytes


def _ensure_parent_dir(path):
    """Create the directory that will contain *path* if it does not exist.

    A *path* without a directory component (e.g. ``results.json``) refers to
    the current directory, so there is nothing to create.
    """
    dirname = os.path.dirname(path)
    if dirname and not os.path.exists(dirname):
        os.makedirs(dirname)


if __name__ == '__main__':
    args = parser.parse_args()
    if args.impl:
        # Worker mode: this is what the valgrind backend launches.
        run(args.impl)
    else:
        import operator

        results = {}
        for impl in IMPLEMENTATIONS:
            # Skip implementations whose module cannot be imported.
            try:
                __import__(impl)
            except ImportError:
                continue

            print('Testing memory use for %r ...' % impl)
            if backend == 'valgrind':
                results[impl] = _measure_valgrind(impl)
            elif backend == 'maxrss':
                results[impl] = _measure_maxrss(impl)
            else:
                raise RuntimeError("Unknown BM_MEM backend %r" % backend)

        # Report in ascending order of memory use.
        print('MEMORY USAGE:')
        for impl, memory in sorted(results.items(), key=operator.itemgetter(1)):
            print('{:>20}: {:>7,.1f} Mb'.format(impl, memory / 1024 / 1024))

        if args.save:
            import json

            _ensure_parent_dir(args.save)
            with open(args.save, 'w') as fh:
                json.dump(results, fh)

## plot.ipynb

      
Display the source blob

    
Display the rendered blob

    
    Raw
  

              plot.ipynb
            
          
      Sorry, something went wrong. Reload?
      Sorry, we cannot display this file.
      Sorry, this file is invalid so it cannot be displayed.
      
          Viewer requires iframe.
      
    
## plot.py

      
    Raw
  

              plot.py
            
          
            View raw
              (Sorry about that, but we can’t show files that are this big right now.)
        
    
## speed.py

      
    Raw
  

              speed.py
            
          
            View raw
              (Sorry about that, but we can’t show files that are this big right now.)
        
    
## tox.ini

      
    Raw
  

              tox.ini
            
          
            View raw
              (Sorry about that, but we can’t show files that are this big right now.)
        
    
## util.py

      
    Raw
  

              util.py
            
          
            View raw
              (Sorry about that, but we can’t show files that are this big right now.)
	*.cbor
	*.log
	*.xml
	*.yaml
	*.json
	*.msgpack
	*.html
	*.marshal
	*.pickle
	.tox
	.idea
	.cache
	*.pyc
	def pytest_benchmark_generate_json(config, benchmarks, include_data):
	from pytest_benchmark.plugin import pytest_benchmark_generate_json

	return pytest_benchmark_generate_json(config=config, benchmarks=benchmarks, include_data=False)
	import json
	import sys

	import dicttoxml
	import marshal
	import msgpack
	import yaml
	import cbor

	from util import is_format_needed

	try:
	import cPickle as pickle
	except ImportError:
	import pickle

	if __name__ == '__main__':
	if len(sys.argv) > 1:
	with open(sys.argv[1]) as fh:
	data = json.load(fh)
	else:
	data = {
	"foo": [{
	"bar": [
	'A"\\ :,;\n1' * 20000000,
	],
	"b": [
	1, 0.333, True,
	],
	"c": None,
	}]
	}

	if is_format_needed('json'):
	with open('data.json', 'w') as fh:
	print('Creating data.json ...')
	json.dump(data, fh)

	if is_format_needed('yaml'):
	with open('data.yaml', 'w') as fh:
	print('Creating data.yaml ...')
	yaml.dump(data, fh)

	if is_format_needed('msgpack'):
	with open('data.msgpack', 'wb') as fh:
	print('Creating data.msgpack ...')
	msgpack.dump(data, fh, use_bin_type=True)

	if is_format_needed('pickle'):
	with open('data.pickle', 'wb') as fh:
	print('Creating data.pickle ...')
	pickle.dump(data, fh, protocol=pickle.HIGHEST_PROTOCOL)

	if is_format_needed('marshal'):
	with open('data.marshal', 'wb') as fh:
	print('Creating data.marshal ...')
	marshal.dump(data, fh)

	if is_format_needed('xml'):
	with open('data.xml', 'wb') as fh:
	print('Creating data.xml ...')
	fh.write(dicttoxml.dicttoxml(data))

	if is_format_needed('cbor'):
	with open('data.cbor', 'wb') as fh:
	print('Creating data.cbor ...')
	cbor.dump(data, fh)
	#!/usr/bin/env python
	from __future__ import division

	import argparse
	import os
	import sys

	from util import IMPLEMENTATIONS
	from util import get_loader
	from util import open_data

	backend = os.getenv("BM_MEM", "valgrind").lower()


	def run(impl):
	with open_data(impl) as fh:
	data = fh.read()

	loader = get_loader(impl)
	loader(data)

	parser = argparse.ArgumentParser()
	parser.add_argument('--save')
	parser.add_argument('impl', nargs="?")

	if __name__ == '__main__':
	args = parser.parse_args()
	if args.impl:
	run(args.impl)
	else:
	import operator
	import subprocess

	results = {}
	for impl in IMPLEMENTATIONS:
	try:
	__import__(impl)
	except ImportError:
	continue

	print('Testing memory use for %r ...' % impl)
	if backend == 'valgrind':
	massif_log = './massif-%s.log' % impl
	subprocess.check_call([
	# 'strace',
	# '-o',
	# './strace-%s.log' % impl,
	'valgrind',
	'--tool=massif',
	'--massif-out-file=%s' % massif_log,
	'--pages-as-heap=yes',
	'--heap=yes',
	'--threshold=0',
	'--max-snapshots=1000',
	'--peak-inaccuracy=0',
	sys.executable,
	__file__,
	impl
	])
	memory = 0
	with open(massif_log) as fh:
	for line in fh:
	if line.startswith('mem_heap_B='):
	memory = max(memory, int(line.split('=')[-1]))
	results[impl] = memory
	elif backend == 'maxrss':
	pid = os.fork()
	if pid:
	_, exit_code, usage = os.wait4(pid, 0)
	if exit_code:
	raise RuntimeError("Failed to run loader. Exit code: %s. Used: %s Mb" % (
	exit_code, usage.ru_maxrss / 1024
	))
	results[impl] = usage.ru_maxrss * 1024
	else:
	try:
	run(impl)
	except Exception:
	import traceback

	traceback.print_exc()
	os._exit(5)
	finally:
	os._exit(0)
	else:
	raise RuntimeError("Unknown BM_MEM backend %r" % backend)

	print('MEMORY USAGE:')
	for impl, memory in sorted(results.items(), key=operator.itemgetter(1)):
	print('{:>20}: {:>7,.1f} Mb'.format(impl, memory / 1024 / 1024))

	if args.save:
	import json

	dirname = os.path.dirname(args.save)
	if not os.path.exists(dirname):
	os.makedirs(dirname)

	with open(args.save, 'w') as fh:
	json.dump(results, fh)