A quick benchmark on querying a large JSON file in Python
# This uses the noarch `current_repodata.json` from conda-forge, which can be found
# at https://conda.anaconda.org/conda-forge/noarch/current_repodata.json
# This file is medium in size (13 MiB), and contains a nested structure of metadata
# about packages on conda-forge.
#
# Here we benchmark querying the top 10 packages by size from repodata, using a number
# of different Python JSON libraries.


def bench_msgspec(data: bytes) -> list[tuple[str, int]]:
    from operator import attrgetter

    import msgspec

    # Only declare the fields we care about; msgspec skips everything else
    # while decoding.
    class Package(msgspec.Struct, nogc=True):
        name: str
        size: int

    class RepoData(msgspec.Struct, nogc=True):
        packages: dict[str, Package]

    repo_data = msgspec.json.decode(data, type=RepoData)
    packages = list(repo_data.packages.values())
    packages.sort(key=attrgetter("size"), reverse=True)
    return [(p.name, p.size) for p in packages[:10]]


def _bench_other(data, decode):
    # Shared query path for libraries that decode to plain dicts/lists.
    repo_data = decode(data)
    packages = [(p["size"], p["name"]) for p in repo_data["packages"].values()]
    packages.sort(reverse=True)
    return [(name, size) for size, name in packages[:10]]


def bench_orjson(data: bytes) -> list[tuple[str, int]]:
    import orjson

    return _bench_other(data, orjson.loads)


def bench_json(data: bytes) -> list[tuple[str, int]]:
    import json

    return _bench_other(data, json.loads)


def bench_ujson(data: bytes) -> list[tuple[str, int]]:
    import ujson

    return _bench_other(data, ujson.loads)


def bench_simdjson(data: bytes) -> list[tuple[str, int]]:
    import simdjson

    return _bench_other(data, simdjson.Parser().parse)


benchmarks = [
    ("msgspec", bench_msgspec),
    ("simdjson", bench_simdjson),
    ("orjson", bench_orjson),
    ("ujson", bench_ujson),
    ("json", bench_json),
]


if __name__ == "__main__":
    import time

    with open("current_repodata.json", "rb") as f:
        data = f.read()

    results = []
    for lib, bench in benchmarks:
        start = time.perf_counter()
        results.append(bench(data))
        stop = time.perf_counter()
        print(f"{lib}: {(stop - start) * 1000} ms")

    # Ensure all results are the same
    first = results[0]
    for r in results:
        assert r == first
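To reproduce this, the repodata file needs to be downloaded first. A minimal sketch using only the standard library (the URL and filename are the ones referenced in the comments at the top of the script; the file served today may differ slightly from the 13 MiB snapshot used here):

import urllib.request

URL = "https://conda.anaconda.org/conda-forge/noarch/current_repodata.json"

# Fetch the (~13 MiB) JSON file and save it next to the benchmark script.
urllib.request.urlretrieve(URL, "current_repodata.json")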
Running this prints a timing for each library.
A few comments:
simdjson parses a JSON blob into a proxy object (this is fast). It then lazily creates Python objects as needed as different fields are accessed. This means you only pay the cost of creating Python objects for the fields you use: a query (like the one here) that only touches a few fields runs much faster, since far fewer Python objects are created. The downside is that every attribute access results in some indirection as new objects are created.
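As a rough sketch of what that laziness looks like with the same repodata file (the exact proxy methods used below are an assumption about the pysimdjson API; the point is that only the fields actually touched become Python objects):

import simdjson

with open("current_repodata.json", "rb") as f:
    data = f.read()

parser = simdjson.Parser()
doc = parser.parse(data)  # fast: no Python dicts/lists are built yet

packages = doc["packages"]  # still a lazy proxy over the parsed document
key = next(iter(packages.keys()))
size = packages[key]["size"]  # only this value is materialized as a Python int
print(key, size)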