
@axiaoxin
Forked from ionelmc/.gitignore
Created December 11, 2015 05:54
A JSON parser benchmark. See README.rst for instructions.
*.log
*.xml
*.yaml
*.json
*.msgpack
*.marshal
*.pickle
.tox
.idea
.cache
*.pyc

A JSON parsing benchmark

Test code for http://blog.ionelmc.ro/2015/11/22/memory-use-and-speed-of-json-parsers/

Usage

To run:

tox

To run with a different dataset:

tox -- canada.json

For a VERY big dataset, download this one (you need lots of RAM for it): https://github.com/zemirco/sf-city-lots-json/raw/master/citylots.json

Other datasets you can try: https://github.com/miloyip/nativejson-benchmark/tree/master/data

If you only want to compare a few implementations, run:

BM_IMPL='json simplejson msgpack' tox

On Windows (note that memory measurements are disabled there):

set BM_IMPL=json simplejson msgpack
tox

To bind to a different CPU (default is 0):

BM_CPU='1' tox

CPU binding helps a bit during speed tests, as the code is single-threaded.
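
For reference, this is roughly what util.py does at import time to pin the process (a simplified sketch; the real code also installs a manhole when BM_DEBUG is set):

    import os
    import psutil

    # Pin the benchmark process to the CPUs listed in BM_CPU (default: CPU 0).
    psutil.Process().cpu_affinity(
        [int(i) for i in os.environ.get('BM_CPU', '0').split(',')]
    )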

To measure memory differently, you can use ru_maxrss instead of valgrind (similar to what GNU /bin/time reports with %M):

BM_MEM=maxrss tox
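
With BM_MEM=maxrss, memory.py forks a child process that loads the data file and then reads the child's peak resident set size via os.wait4. A simplified sketch of that path (load_data here is a stand-in for the actual loader call):

    import os

    pid = os.fork()
    if pid:
        # Parent: wait for the child and read its peak RSS.
        _, exit_code, usage = os.wait4(pid, 0)
        peak_bytes = usage.ru_maxrss * 1024  # ru_maxrss is reported in KiB on Linux
    else:
        load_data()
        os._exit(0)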

Implementations

Implementations that are tested by default:

  • cjson
  • cPickle
  • json
  • jsonlib
  • jsonlib2
  • msgpack
  • pickle
  • rapidjson
  • simplejson
  • ujson
  • yajl

Extra implementations you can try (via BM_IMPL; see the example after the list):

  • ijson
  • marshal
  • ruamel.yaml
  • pyyaml
  • lxml
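
For example, to benchmark the stdlib json module alongside some of the extras, you could run something like:

BM_IMPL='json ijson marshal' tox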

Dependencies

pip install tox

Ubuntu

You might need to run:

apt-get install build-essential git-core python-dev python3.5-dev valgrind libyajl2

Python 3.5 is available from the Deadsnakes PPA (run apt-get update after adding it):

apt-add-repository ppa:fkrull/deadsnakes

# data.py: generate the benchmark input files in every format the selected
# implementations need (JSON, YAML, msgpack, pickle, marshal, XML).
import json
import sys

import dicttoxml
import marshal
import msgpack
import yaml

from util import is_format_needed

try:
    import cPickle as pickle
except ImportError:
    import pickle

if __name__ == '__main__':
    if len(sys.argv) > 1:
        # Use an existing JSON dataset as the source ...
        with open(sys.argv[1]) as fh:
            data = json.load(fh)
    else:
        # ... or build a large synthetic document (one huge string plus a few scalars).
        data = {
            "foo": [{
                "bar": [
                    'A"\\ :,;\n1' * 20000000,
                ],
                "b": [
                    1, 0.333, True,
                ],
                "c": None,
            }]
        }

    if is_format_needed('json'):
        with open('data.json', 'w') as fh:
            print('Creating data.json ...')
            json.dump(data, fh)
    if is_format_needed('yaml'):
        with open('data.yaml', 'w') as fh:
            print('Creating data.yaml ...')
            yaml.dump(data, fh)
    if is_format_needed('msgpack'):
        with open('data.msgpack', 'wb') as fh:
            print('Creating data.msgpack ...')
            msgpack.dump(data, fh, use_bin_type=True)
    if is_format_needed('pickle'):
        with open('data.pickle', 'wb') as fh:
            print('Creating data.pickle ...')
            pickle.dump(data, fh, protocol=pickle.HIGHEST_PROTOCOL)
    if is_format_needed('marshal'):
        with open('data.marshal', 'wb') as fh:
            print('Creating data.marshal ...')
            marshal.dump(data, fh)
    if is_format_needed('xml'):
        with open('data.xml', 'wb') as fh:
            print('Creating data.xml ...')
            fh.write(dicttoxml.dicttoxml(data))

#!/usr/bin/env python
# memory.py: measure peak memory use of each loader, either with valgrind's
# massif tool (the default) or with the child's ru_maxrss (BM_MEM=maxrss).
from __future__ import division

import os
import sys

from util import IMPLEMENTATIONS
from util import get_loader
from util import open_data

backend = os.getenv("BM_MEM", "valgrind").lower()


def run(impl):
    # Load the data file for a single implementation; invoked in a subprocess.
    with open_data(impl) as fh:
        data = fh.read()
    loader = get_loader(impl)
    loader(data)


if __name__ == '__main__':
    if len(sys.argv) > 1:
        run(sys.argv[1])
    else:
        import operator
        import subprocess

        results = {}
        for impl in IMPLEMENTATIONS:
            try:
                __import__(impl)
            except ImportError:
                continue
            print('Testing memory use for %r ...' % impl)
            if backend == 'valgrind':
                # Re-run this script under massif and take the peak heap size
                # from the massif log.
                massif_log = './massif-%s.log' % impl
                subprocess.check_call([
                    # 'strace',
                    # '-o',
                    # './strace-%s.log' % impl,
                    'valgrind',
                    '--tool=massif',
                    '--massif-out-file=%s' % massif_log,
                    '--pages-as-heap=yes',
                    '--heap=yes',
                    '--threshold=0',
                    '--max-snapshots=1000',
                    '--peak-inaccuracy=0',
                    sys.executable,
                    __file__,
                    impl,
                ])
                memory = 0
                with open(massif_log) as fh:
                    for line in fh:
                        if line.startswith('mem_heap_B='):
                            memory = max(memory, int(line.split('=')[-1]))
                results[impl] = memory
            elif backend == 'maxrss':
                # Fork a child, run the loader there and read its peak RSS.
                pid = os.fork()
                if pid:
                    _, exit_code, usage = os.wait4(pid, 0)
                    if exit_code:
                        raise RuntimeError("Failed to run loader. Exit code: %s. Used: %s Mb" % (
                            exit_code, usage.ru_maxrss / 1024
                        ))
                    results[impl] = usage.ru_maxrss * 1024
                else:
                    try:
                        run(impl)
                    except Exception:
                        import traceback
                        traceback.print_exc()
                        os._exit(5)
                    finally:
                        os._exit(0)
            else:
                raise RuntimeError("Unknown BM_MEM backend %r" % backend)

        print('MEMORY USAGE:')
        for impl, memory in sorted(results.items(), key=operator.itemgetter(1)):
            print('{:>20}: {:>7,.1f} Mb'.format(impl, memory / 1024 / 1024))

# speed.py: pytest-benchmark suite; each selected implementation parses its
# own data file and the result is checked against the reference JSON data.
import json

import pytest

from util import IMPLEMENTATIONS
from util import get_loader
from util import open_data


@pytest.fixture(params=IMPLEMENTATIONS)
def name(request):
    return request.param


@pytest.fixture
def data(name):
    with open_data(name) as fh:
        return fh.read()


@pytest.fixture
def impl(name):
    try:
        return get_loader(name)
    except ImportError as exc:
        pytest.skip("Implementation %r not available: %s" % (name, exc))


@pytest.fixture
def expected():
    with open('data.json') as fh:
        return json.load(fh)


def test_speed(benchmark, name, impl, data, expected):
    if name == 'jsonlib':
        result = benchmark(impl, data, use_float=True)
    elif name == 'msgpack':
        result = benchmark(impl, data, encoding='utf8')
    else:
        result = benchmark(impl, data)
    if name != 'lxml':
        # lxml produces an Element tree rather than plain Python objects, so skip the check.
        assert result == expected

[tox]
envlist = {py27,py35}-{win,nix}
skipsdist = true

[testenv]
passenv = *
platform =
    win: win
    nix: (linux|darwin)
deps =
    pytest-benchmark
    nix: manhole
    psutil
    ujson==1.33
    py27: python-cjson==1.1.0
    simplejson==3.8.1
    py27: jsonlib==1.6.1
    py27-nix: jsonlib2==1.5.2
    py27: pyrapidjson
    {py34,py35}: python-rapidjson
    msgpack-python==0.4.6
    nix: yajl==0.3.5
    pyyaml==3.11
    ruamel.yaml==0.10.12
    ;ijson==2.2
    cffi==1.3.1
    https://github.com/isagalaev/ijson/archive/d405adc7e737c1f1c7c41d762bdae1f1ae6f4906.zip
    dicttoxml==1.6.6
    lxml==3.5.0
commands =
    python data.py []
    nix: python memory.py
    py.test -vv speed.py

[testenv:data]
deps =
    msgpack-python==0.4.6

[pytest]
norecursedirs =
    .git
    .tox
    dist
    build
addopts =
    -rxEfs
    --strict
    --tb=short
    --assert=plain
    # uncomment this if you want more benchmark rounds; I've used 100 for the
    # published results, but that takes an inordinate amount of time
    ; --benchmark-min-rounds=100

# util.py: shared helpers (CPU pinning, implementation selection, data file
# handling and loader lookup).
import os
from functools import partial
from itertools import chain

import psutil

# Pin the benchmark process to the CPUs listed in BM_CPU (default: CPU 0).
psutil.Process().cpu_affinity(
    [int(i) for i in os.environ.get('BM_CPU', '0').split(',')]
)

# With BM_DEBUG set, install a manhole (if available) for inspecting the process.
if os.environ.get('BM_DEBUG'):
    try:
        import manhole
    except ImportError:
        pass
    else:
        manhole.install(oneshot_on='USR2')

IMPLEMENTATIONS = os.environ.get(
    'BM_IMPL',
    '''
    cjson
    cPickle
    json
    jsonlib
    jsonlib2
    msgpack
    pickle
    rapidjson
    simplejson
    ujson
    yajl
    '''
).split()

# Serialization format used by each implementation (default: json).
FORMATS = {
    'cPickle': 'pickle',
    'lxml': 'xml',
    'marshal': 'marshal',
    'msgpack': 'msgpack',
    'pickle': 'pickle',
    'ruamel.yaml': 'yaml',
    'yaml': 'yaml',
}
# File mode used to open each implementation's data file (default: 'r').
MODES = {
    'cPickle': 'rb',
    'marshal': 'rb',
    'msgpack': 'rb',
    'pickle': 'rb',
    'lxml': 'rb',
}


def is_format_needed(ext):
    if ext == 'json':
        return True
    for impl, impl_ext in chain(FORMATS.items()):
        if ext == impl_ext and impl in IMPLEMENTATIONS:
            return True


def open_data(kind):
    return open('data.%s' % FORMATS.get(kind, 'json'), MODES.get(kind, 'r'))


def get_loader(kind):
    if kind == 'lxml':
        from lxml import etree
        return partial(etree.fromstring, parser=etree.XMLParser(huge_tree=True))
    elif kind == 'ijson':
        try:
            from ijson.backends.yajl2_cffi import parse
        except ImportError:
            from ijson.backends.yajl2 import parse
        try:
            from cStringIO import StringIO
        except ImportError:
            from io import StringIO

        def load(payload):
            # ijson is a streaming parser; just drain the event stream.
            for _ in parse(StringIO(payload)):
                pass
        return load

    # Everything else: import the module and use the first available
    # loads/decode/read/load callable.
    mod = __import__(kind)
    for part in kind.split('.')[1:]:
        mod = getattr(mod, part)
    for attr in ['loads', 'decode', 'read', 'load']:
        try:
            loader = getattr(mod, attr)
            break
        except AttributeError:
            continue
    else:
        raise NotImplementedError(kind)
    if kind == 'jsonlib':
        return partial(loader, use_float=True)
    elif kind == 'msgpack':
        return partial(loader, encoding='utf8')
    else:
        return loader