
@axiaoxin
Forked from ionelmc/.gitignore
Created December 11, 2015 05:54
A JSON parser benchmark. See README.rst for instructions.
*.log
*.xml
*.yaml
*.json
*.msgpack
*.marshal
*.pickle
.tox
.idea
.cache
*.pyc

A JSON parsing benchmark

Test code for http://blog.ionelmc.ro/2015/11/22/memory-use-and-speed-of-json-parsers/

Usage

To run:

tox

To run with a different dataset:

tox -- canada.json

For a VERY big dataset, download this one (you need lots of RAM for it): https://github.com/zemirco/sf-city-lots-json/raw/master/citylots.json

Other datasets you can try: https://github.com/miloyip/nativejson-benchmark/tree/master/data

If you only want to compare a few implementations, run:

BM_IMPL='json simplejson msgpack' tox

On Windows (note that memory measurements are disabled there):

set BM_IMPL=json simplejson msgpack
tox

To bind to a different CPU (default is 0):

BM_CPU='1' tox

CPU binding helps a bit during speed tests, as the code is single-threaded.
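
For reference, this is roughly what util.py does at import time to pin the process (a simplified sketch; the real code also installs a manhole when BM_DEBUG is set):

    import os
    import psutil

    # Pin the benchmark process to the CPUs listed in BM_CPU (default: CPU 0).
    psutil.Process().cpu_affinity(
        [int(i) for i in os.environ.get('BM_CPU', '0').split(',')]
    )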

To measure memory differently, you can use ru_maxrss instead of valgrind (similar to what GNU /bin/time reports with %M):

BM_MEM=maxrss tox
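
With BM_MEM=maxrss, memory.py forks a child process that loads the data file and then reads the child's peak resident set size via os.wait4. A simplified sketch of that path (load_data here is a stand-in for the actual loader call):

    import os

    pid = os.fork()
    if pid:
        # Parent: wait for the child and read its peak RSS.
        _, exit_code, usage = os.wait4(pid, 0)
        peak_bytes = usage.ru_maxrss * 1024  # ru_maxrss is reported in KiB on Linux
    else:
        load_data()
        os._exit(0)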

Implementations

Implementations that are tested by default:

  • cjson
  • cPickle
  • json
  • jsonlib
  • jsonlib2
  • msgpack
  • pickle
  • rapidjson
  • simplejson
  • ujson
  • yajl

Extra implementations you can try (via BM_IMPL; see the example after the list):

  • ijson
  • marshal
  • ruamel.yaml
  • pyyaml
  • lxml
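
For example, to benchmark the stdlib json module alongside some of the extras, you could run something like:

BM_IMPL='json ijson marshal' tox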

Dependencies

pip install tox

Ubuntu

You might need to run:

apt-get install build-essential git-core python-dev python3.5-dev valgrind libyajl2

Python 3.5 is available from the Deadsnakes PPA (run apt-get update after adding it):

apt-add-repository ppa:fkrull/deadsnakes

# data.py: generate the benchmark input files in every format the selected
# implementations need (JSON, YAML, msgpack, pickle, marshal, XML).
import json
import sys

import dicttoxml
import marshal
import msgpack
import yaml

from util import is_format_needed

try:
    import cPickle as pickle
except ImportError:
    import pickle

if __name__ == '__main__':
    if len(sys.argv) > 1:
        # Use an existing JSON dataset as the source ...
        with open(sys.argv[1]) as fh:
            data = json.load(fh)
    else:
        # ... or build a large synthetic document (one huge string plus a few scalars).
        data = {
            "foo": [{
                "bar": [
                    'A"\\ :,;\n1' * 20000000,
                ],
                "b": [
                    1, 0.333, True,
                ],
                "c": None,
            }]
        }

    if is_format_needed('json'):
        with open('data.json', 'w') as fh:
            print('Creating data.json ...')
            json.dump(data, fh)
    if is_format_needed('yaml'):
        with open('data.yaml', 'w') as fh:
            print('Creating data.yaml ...')
            yaml.dump(data, fh)
    if is_format_needed('msgpack'):
        with open('data.msgpack', 'wb') as fh:
            print('Creating data.msgpack ...')
            msgpack.dump(data, fh, use_bin_type=True)
    if is_format_needed('pickle'):
        with open('data.pickle', 'wb') as fh:
            print('Creating data.pickle ...')
            pickle.dump(data, fh, protocol=pickle.HIGHEST_PROTOCOL)
    if is_format_needed('marshal'):
        with open('data.marshal', 'wb') as fh:
            print('Creating data.marshal ...')
            marshal.dump(data, fh)
    if is_format_needed('xml'):
        with open('data.xml', 'wb') as fh:
            print('Creating data.xml ...')
            fh.write(dicttoxml.dicttoxml(data))

#!/usr/bin/env python
# memory.py: measure peak memory use of each loader, either with valgrind's
# massif tool (the default) or with the child's ru_maxrss (BM_MEM=maxrss).
from __future__ import division

import os
import sys

from util import IMPLEMENTATIONS
from util import get_loader
from util import open_data

backend = os.getenv("BM_MEM", "valgrind").lower()


def run(impl):
    # Load the data file for a single implementation; invoked in a subprocess.
    with open_data(impl) as fh:
        data = fh.read()
    loader = get_loader(impl)
    loader(data)


if __name__ == '__main__':
    if len(sys.argv) > 1:
        run(sys.argv[1])
    else:
        import operator
        import subprocess

        results = {}
        for impl in IMPLEMENTATIONS:
            try:
                __import__(impl)
            except ImportError:
                continue
            print('Testing memory use for %r ...' % impl)
            if backend == 'valgrind':
                # Re-run this script under massif and take the peak heap size
                # from the massif log.
                massif_log = './massif-%s.log' % impl
                subprocess.check_call([
                    # 'strace',
                    # '-o',
                    # './strace-%s.log' % impl,
                    'valgrind',
                    '--tool=massif',
                    '--massif-out-file=%s' % massif_log,
                    '--pages-as-heap=yes',
                    '--heap=yes',
                    '--threshold=0',
                    '--max-snapshots=1000',
                    '--peak-inaccuracy=0',
                    sys.executable,
                    __file__,
                    impl,
                ])
                memory = 0
                with open(massif_log) as fh:
                    for line in fh:
                        if line.startswith('mem_heap_B='):
                            memory = max(memory, int(line.split('=')[-1]))
                results[impl] = memory
            elif backend == 'maxrss':
                # Fork a child, run the loader there and read its peak RSS.
                pid = os.fork()
                if pid:
                    _, exit_code, usage = os.wait4(pid, 0)
                    if exit_code:
                        raise RuntimeError("Failed to run loader. Exit code: %s. Used: %s Mb" % (
                            exit_code, usage.ru_maxrss / 1024
                        ))
                    results[impl] = usage.ru_maxrss * 1024
                else:
                    try:
                        run(impl)
                    except Exception:
                        import traceback
                        traceback.print_exc()
                        os._exit(5)
                    finally:
                        os._exit(0)
            else:
                raise RuntimeError("Unknown BM_MEM backend %r" % backend)

        print('MEMORY USAGE:')
        for impl, memory in sorted(results.items(), key=operator.itemgetter(1)):
            print('{:>20}: {:>7,.1f} Mb'.format(impl, memory / 1024 / 1024))

# speed.py: pytest-benchmark suite; each selected implementation parses its
# own data file and the result is checked against the reference JSON data.
import json

import pytest

from util import IMPLEMENTATIONS
from util import get_loader
from util import open_data


@pytest.fixture(params=IMPLEMENTATIONS)
def name(request):
    return request.param


@pytest.fixture
def data(name):
    with open_data(name) as fh:
        return fh.read()


@pytest.fixture
def impl(name):
    try:
        return get_loader(name)
    except ImportError as exc:
        pytest.skip("Implementation %r not available: %s" % (name, exc))


@pytest.fixture
def expected():
    with open('data.json') as fh:
        return json.load(fh)


def test_speed(benchmark, name, impl, data, expected):
    if name == 'jsonlib':
        result = benchmark(impl, data, use_float=True)
    elif name == 'msgpack':
        result = benchmark(impl, data, encoding='utf8')
    else:
        result = benchmark(impl, data)
    if name != 'lxml':
        # lxml produces an Element tree rather than plain Python objects, so skip the check.
        assert result == expected

[tox]
envlist = {py27,py35}-{win,nix}
skipsdist = true

[testenv]
passenv = *
platform =
    win: win
    nix: (linux|darwin)
deps =
    pytest-benchmark
    nix: manhole
    psutil
    ujson==1.33
    py27: python-cjson==1.1.0
    simplejson==3.8.1
    py27: jsonlib==1.6.1
    py27-nix: jsonlib2==1.5.2
    py27: pyrapidjson
    {py34,py35}: python-rapidjson
    msgpack-python==0.4.6
    nix: yajl==0.3.5
    pyyaml==3.11
    ruamel.yaml==0.10.12
    ;ijson==2.2
    cffi==1.3.1
    https://github.com/isagalaev/ijson/archive/d405adc7e737c1f1c7c41d762bdae1f1ae6f4906.zip
    dicttoxml==1.6.6
    lxml==3.5.0
commands =
    python data.py []
    nix: python memory.py
    py.test -vv speed.py

[testenv:data]
deps =
    msgpack-python==0.4.6

[pytest]
norecursedirs =
    .git
    .tox
    dist
    build
addopts =
    -rxEfs
    --strict
    --tb=short
    --assert=plain
    # uncomment this if you want more benchmark rounds; I've used 100 for the
    # published results, but that takes an inordinate amount of time
    ; --benchmark-min-rounds=100

# util.py: shared helpers (CPU pinning, implementation selection, data file
# handling and loader lookup).
import os
from functools import partial
from itertools import chain

import psutil

# Pin the benchmark process to the CPUs listed in BM_CPU (default: CPU 0).
psutil.Process().cpu_affinity(
    [int(i) for i in os.environ.get('BM_CPU', '0').split(',')]
)

# With BM_DEBUG set, install a manhole (if available) for inspecting the process.
if os.environ.get('BM_DEBUG'):
    try:
        import manhole
    except ImportError:
        pass
    else:
        manhole.install(oneshot_on='USR2')

IMPLEMENTATIONS = os.environ.get(
    'BM_IMPL',
    '''
    cjson
    cPickle
    json
    jsonlib
    jsonlib2
    msgpack
    pickle
    rapidjson
    simplejson
    ujson
    yajl
    '''
).split()

# Serialization format used by each implementation (default: json).
FORMATS = {
    'cPickle': 'pickle',
    'lxml': 'xml',
    'marshal': 'marshal',
    'msgpack': 'msgpack',
    'pickle': 'pickle',
    'ruamel.yaml': 'yaml',
    'yaml': 'yaml',
}
# File mode used to open each implementation's data file (default: 'r').
MODES = {
    'cPickle': 'rb',
    'marshal': 'rb',
    'msgpack': 'rb',
    'pickle': 'rb',
    'lxml': 'rb',
}


def is_format_needed(ext):
    if ext == 'json':
        return True
    for impl, impl_ext in chain(FORMATS.items()):
        if ext == impl_ext and impl in IMPLEMENTATIONS:
            return True


def open_data(kind):
    return open('data.%s' % FORMATS.get(kind, 'json'), MODES.get(kind, 'r'))


def get_loader(kind):
    if kind == 'lxml':
        from lxml import etree
        return partial(etree.fromstring, parser=etree.XMLParser(huge_tree=True))
    elif kind == 'ijson':
        try:
            from ijson.backends.yajl2_cffi import parse
        except ImportError:
            from ijson.backends.yajl2 import parse
        try:
            from cStringIO import StringIO
        except ImportError:
            from io import StringIO

        def load(payload):
            # ijson is a streaming parser; just drain the event stream.
            for _ in parse(StringIO(payload)):
                pass
        return load

    # Everything else: import the module and use the first available
    # loads/decode/read/load callable.
    mod = __import__(kind)
    for part in kind.split('.')[1:]:
        mod = getattr(mod, part)
    for attr in ['loads', 'decode', 'read', 'load']:
        try:
            loader = getattr(mod, attr)
            break
        except AttributeError:
            continue
    else:
        raise NotImplementedError(kind)
    if kind == 'jsonlib':
        return partial(loader, use_float=True)
    elif kind == 'msgpack':
        return partial(loader, encoding='utf8')
    else:
        return loader