Instantly share code, notes, and snippets.

@ionelmc /.gitignore
Last active Jun 28, 2018

Embed
What would you like to do?
A JSON parser benchmark. See README.rst for instructions.
*.cbor
*.log
*.xml
*.yaml
*.json
*.msgpack
*.html
*.marshal
*.pickle
.tox
.idea
.cache
*.pyc

A JSON parsing benchmark

Test code for http://blog.ionelmc.ro/2015/11/22/memory-use-and-speed-of-json-parsers/

Usage

To run:

tox

To run with a different dataset (example: canada.json):

tox -- canada.json

For a VERY big data set download (you need tons of RAM for that): https://github.com/zemirco/sf-city-lots-json/raw/master/citylots.json

Other datasets you can try: https://github.com/miloyip/nativejson-benchmark/tree/master/data

If you only want to compare a few implementations then run:

BM_IMPL='json simplejson msgpack' tox

On Windows (note that memory measurements are disabled there):

set BM_IMPL=json simplejson msgpack
tox

To bind to a different CPU (default is 0):

BM_CPU='1' tox

CPU binding helps a bit during speed tests, as code is single-threaded.

Measure memory differently: you can use ru_maxrss instead of valgrind (similar to what GNU /bin/time does with %M):

BM_MEM=maxrss tox

Implementations

Implementations that are tested by default:

  • cbor
  • cjson
  • cPickle
  • ijson
  • json
  • jsonlib
  • jsonlib2
  • jsonstreamer
  • msgpack
  • pickle
  • rapidjson
  • simplejson
  • ujson
  • yajl

Extra implementations not included in the default BM_IMPL:

  • marshal
  • pyjsmn
  • ruamel.yaml
  • yaml
  • lxml

To test everything plus lxml:

BM_IMPL_ADD='lxml' tox

To test just cbor and msgpack:

BM_IMPL='cbor msgpack' tox

Dependencies

pip install tox

Ubuntu

You might need to run:

apt-get install build-essential git-core python-dev python3.5-dev valgrind libyajl2 libyajl-dev

Python 3.5 is available from the Deadsnakes PPA:

apt-add-repository ppa:fkrull/deadsnakes
def pytest_benchmark_generate_json(config, benchmarks, include_data):
    """Build the benchmark JSON report via pytest-benchmark's own generator.

    The call is forwarded to ``pytest_benchmark.plugin`` unchanged except
    that ``include_data`` is always passed as ``False``; the value given by
    the caller is ignored.
    (Presumably this keeps the saved report small - confirm against the
    pytest-benchmark documentation.)

    :param config: forwarded as-is to the default generator.
    :param benchmarks: forwarded as-is to the default generator.
    :param include_data: accepted for signature compatibility, not used.
    :return: whatever the default generator returns.
    """
    # Imported lazily and through the module so the default implementation
    # does not clash with this function's own name.
    from pytest_benchmark import plugin

    return plugin.pytest_benchmark_generate_json(
        config=config,
        benchmarks=benchmarks,
        include_data=False,
    )
import json
import sys
import dicttoxml
import marshal
import msgpack
import yaml
import cbor
from util import is_format_needed
try:
import cPickle as pickle
except ImportError:
import pickle
if __name__ == '__main__':
    # Generate the per-format data files (data.json, data.yaml, ...) that the
    # benchmarks read. The source dataset is either the JSON file named as the
    # first command-line argument or a built-in synthetic document.

    # Local import: io.open accepts ``encoding`` on both Python 2 and 3.
    import io

    if len(sys.argv) > 1:
        # JSON interchange text is UTF-8. Decode it explicitly rather than
        # with the locale's default encoding, which breaks non-ASCII datasets
        # on non-UTF-8 locales (e.g. Windows code pages).
        with io.open(sys.argv[1], encoding='utf-8') as fh:
            data = json.load(fh)
    else:
        # Synthetic fallback: one very large string full of characters that
        # need escaping, plus a few scalars of different types.
        data = {
            "foo": [{
                "bar": [
                    'A"\\ :,;\n1' * 20000000,
                ],
                "b": [
                    1, 0.333, True,
                ],
                "c": None,
            }]
        }

    # (format name, file mode, writer). The output file is data.<format>.
    # Order matches the original script so console output is unchanged.
    writers = [
        ('json', 'w', lambda out: json.dump(data, out)),
        ('yaml', 'w', lambda out: yaml.dump(data, out)),
        ('msgpack', 'wb',
         lambda out: msgpack.dump(data, out, use_bin_type=True)),
        ('pickle', 'wb',
         lambda out: pickle.dump(data, out, protocol=pickle.HIGHEST_PROTOCOL)),
        ('marshal', 'wb', lambda out: marshal.dump(data, out)),
        ('xml', 'wb', lambda out: out.write(dicttoxml.dicttoxml(data))),
        ('cbor', 'wb', lambda out: cbor.dump(data, out)),
    ]

    for fmt, mode, write in writers:
        # Only produce the files that is_format_needed() asks for.
        if not is_format_needed(fmt):
            continue
        path = 'data.%s' % fmt
        with open(path, mode) as fh:
            print('Creating %s ...' % path)
            write(fh)
#!/usr/bin/env python
from __future__ import division
import argparse
import os
import sys
from util import IMPLEMENTATIONS
from util import get_loader
from util import open_data
# Memory measurement backend, read from the BM_MEM environment variable and
# lower-cased; falls back to "valgrind" when the variable is not set.
backend = os.getenv("BM_MEM", "valgrind").lower()
def run(impl):
    """Load the data file for ``impl`` and parse it once.

    The file returned by ``open_data(impl)`` is read completely, then the
    callable returned by ``get_loader(impl)`` is applied to its contents.
    The parsed result is discarded; nothing is returned.

    :param impl: implementation name, passed to ``open_data`` and
        ``get_loader``.
    """
    with open_data(impl) as source:
        payload = source.read()
    parse = get_loader(impl)
    parse(payload)
# Command-line interface of this script.
parser = argparse.ArgumentParser()
# --save PATH: after a full run, the collected results are also dumped as JSON to PATH.
parser.add_argument('--save')
# impl (optional): when given, only run() for that implementation is executed
# in this process; when omitted, every importable implementation is measured.
parser.add_argument('impl', nargs="?")
def save_results(path, results):
    """Write ``results`` to ``path`` as JSON, creating missing directories.

    :param path: destination file; may be a bare file name or include
        directories.
    :param results: JSON-serialisable mapping (implementation -> bytes used).
    """
    import json

    dirname = os.path.dirname(path)
    # A bare file name has an empty dirname and os.makedirs('') raises, so
    # only create the directory when the path actually contains one.
    if dirname and not os.path.exists(dirname):
        os.makedirs(dirname)
    with open(path, 'w') as fh:
        json.dump(results, fh)


if __name__ == '__main__':
    args = parser.parse_args()
    if args.impl:
        # Worker mode: just load the data once with the given implementation.
        # This is what the valgrind backend below re-invokes the script for.
        run(args.impl)
    else:
        # Driver mode: measure every importable implementation and report.
        import operator
        import subprocess

        results = {}  # implementation name -> peak memory in bytes
        for impl in IMPLEMENTATIONS:
            # Skip implementations that are not installed.
            try:
                __import__(impl)
            except ImportError:
                continue

            print('Testing memory use for %r ...' % impl)
            if backend == 'valgrind':
                # Re-run this script in worker mode under valgrind's massif
                # tool and take the peak from its log.
                massif_log = './massif-%s.log' % impl
                subprocess.check_call([
                    # 'strace',
                    # '-o',
                    # './strace-%s.log' % impl,
                    'valgrind',
                    '--tool=massif',
                    '--massif-out-file=%s' % massif_log,
                    '--pages-as-heap=yes',
                    '--heap=yes',
                    '--threshold=0',
                    '--max-snapshots=1000',
                    '--peak-inaccuracy=0',
                    sys.executable,
                    __file__,
                    impl
                ])
                # Peak = largest mem_heap_B value over all logged snapshots.
                memory = 0
                with open(massif_log) as fh:
                    for line in fh:
                        if line.startswith('mem_heap_B='):
                            memory = max(memory, int(line.split('=')[-1]))
                results[impl] = memory
            elif backend == 'maxrss':
                # Fork a child that does the load; the parent reads the
                # child's resource usage from wait4().
                pid = os.fork()
                if pid:
                    # The second item is the raw wait status, so any
                    # non-zero value is treated as a failed run.
                    _, status, usage = os.wait4(pid, 0)
                    if status:
                        raise RuntimeError("Failed to run loader. Exit code: %s. Used: %s Mb" % (
                            status, usage.ru_maxrss / 1024
                        ))
                    # assumes ru_maxrss is in kilobytes (Linux) - TODO confirm
                    # on other platforms
                    results[impl] = usage.ru_maxrss * 1024
                else:
                    # Child: never return into the driver loop; always leave
                    # through os._exit so the parent is the only reporter.
                    try:
                        run(impl)
                    except Exception:
                        import traceback
                        traceback.print_exc()
                        os._exit(5)
                    finally:
                        os._exit(0)
            else:
                raise RuntimeError("Unknown BM_MEM backend %r" % backend)

        print('MEMORY USAGE:')
        for impl, memory in sorted(results.items(), key=operator.itemgetter(1)):
            print('{:>20}: {:>7,.1f} Mb'.format(impl, memory / 1024 / 1024))

        if args.save:
            save_results(args.save, results)
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
View Raw

(Sorry about that, but we can’t show files that are this big right now.)

View Raw

(Sorry about that, but we can’t show files that are this big right now.)

View Raw

(Sorry about that, but we can’t show files that are this big right now.)

View Raw

(Sorry about that, but we can’t show files that are this big right now.)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment