-
-
Save jcrist/d62f450594164d284fbea957fd48b743 to your computer and use it in GitHub Desktop.
"""A quick benchmark comparing the performance of:

- msgspec: https://github.com/jcrist/msgspec
- pydantic V1: https://docs.pydantic.dev/1.10/
- pydantic V2: https://docs.pydantic.dev/dev-v2/

The benchmark is modified from the one in the msgspec repo here:
https://github.com/jcrist/msgspec/blob/main/benchmarks/bench_validation.py

I make no claims that it's illustrative of all use cases. I wrote this up
mostly to get an understanding of how msgspec's performance compares with that
of pydantic V2.
"""
from __future__ import annotations | |
import datetime | |
import random | |
import string | |
import timeit | |
import uuid | |
from typing import List, Literal, Union, Annotated | |
import msgspec | |
import pydantic | |
import pydantic.v1 | |
def make_filesystem_data(capacity):
    """Generate a tree structure representing a fake filesystem.

    The tree is made of plain dicts. Every node has ``type``, ``name``,
    ``created_by``, ``created_at`` and ``updated_at`` (ISO-8601 strings in
    UTC); directories additionally have ``contents`` (a list of child nodes)
    and files have ``nbytes``.

    The shape of the tree, the names, the timestamps and the sizes come from
    a ``random.Random(42)`` stream. The pool of ``created_by`` UUIDs is drawn
    from ``uuid.uuid4()`` and therefore differs between runs.

    Args:
        capacity: Number of nodes to generate, root directory included
            (for ``capacity >= 1``).

    Returns:
        The root directory node.
    """
    UTC = datetime.timezone.utc
    DATE_2018 = datetime.datetime(2018, 1, 1, tzinfo=UTC)
    DATE_2023 = datetime.datetime(2023, 1, 1, tzinfo=UTC)

    UUIDS = [str(uuid.uuid4()) for _ in range(30)]

    rand = random.Random(42)

    def randdt(lo, hi):
        # ``timestamp()`` returns a float; ``randint`` rejects non-integer
        # arguments on Python 3.12+, so truncate to whole seconds first.
        ts = rand.randint(int(lo.timestamp()), int(hi.timestamp()))
        # Convert straight to UTC. Going through naive local time and then
        # stamping it as UTC would shift the value by the local offset.
        return datetime.datetime.fromtimestamp(ts, UTC)

    def randstr(lo, hi=None):
        # With only ``lo`` the length is fixed; with both it is drawn from
        # the inclusive range [lo, hi].
        length = lo if hi is None else rand.randint(lo, hi)
        return "".join(rand.choices(string.ascii_letters, k=length))

    def make_node(is_dir):
        nonlocal capacity
        name = randstr(4, 30)
        created_by = rand.choice(UUIDS)
        created_at = randdt(DATE_2018, DATE_2023)
        updated_at = randdt(created_at, DATE_2023)
        data = {
            "type": "directory" if is_dir else "file",
            "name": name,
            "created_by": created_by,
            "created_at": created_at.isoformat(),
            "updated_at": updated_at.isoformat(),
        }
        if is_dir:
            # Never create more children than the remaining node budget.
            n = min(rand.randint(0, 30), capacity)
            capacity -= n
            data["contents"] = [make_node(rand.random() > 0.9) for _ in range(n)]
        else:
            data["nbytes"] = rand.randint(0, 1000000)
        return data

    # The root directory itself consumes one unit of the budget.
    capacity -= 1
    out = make_node(True)
    # Spend whatever budget is left on extra top-level entries.
    while capacity:
        capacity -= 1
        out["contents"].append(make_node(rand.random() > 0.9))
    return out
def bench(raw_data, dumps, loads, convert):
    """Time serialization and deserialization for one library.

    Args:
        raw_data: Input passed to ``convert`` to build the typed message.
        dumps: Callable taking the typed message and returning its encoding.
        loads: Callable taking that encoding and returning a typed message.
        convert: Callable turning ``raw_data`` into the typed message.

    Returns:
        A ``(dumps_time, loads_time)`` tuple of average seconds per call, as
        measured by ``timeit.Timer.autorange``.

    Raises:
        AssertionError: If decoding the encoded message does not compare
            equal to the original message.
    """
    msg = convert(raw_data)
    json_data = dumps(msg)

    # Sanity-check the round trip before timing anything. This is an explicit
    # raise rather than an ``assert`` statement so the check is not stripped
    # when Python runs with -O. The decoded copy is a temporary, so it is not
    # kept alive during the timed runs.
    if not (msg == loads(json_data)):
        raise AssertionError("decoded message does not equal the original")

    def seconds_per_call(func, data):
        # autorange() picks the loop count itself and returns (loops, total).
        timer = timeit.Timer(
            "func(data)", setup="", globals={"func": func, "data": data}
        )
        loops, elapsed = timer.autorange()
        return elapsed / loops

    dumps_time = seconds_per_call(dumps, msg)
    loads_time = seconds_per_call(loads, json_data)
    return dumps_time, loads_time
############################################################################# | |
# msgspec # | |
############################################################################# | |
class File(msgspec.Struct, tag="file"):
    """msgspec struct for a file node of the fake filesystem (tag ``"file"``)."""

    # Must contain at least one character (min_length=1).
    name: Annotated[str, msgspec.Meta(min_length=1)]
    created_by: uuid.UUID
    created_at: datetime.datetime
    updated_at: datetime.datetime
    # Must be greater than or equal to zero (ge=0).
    nbytes: Annotated[int, msgspec.Meta(ge=0)]
class Directory(msgspec.Struct, tag="directory"):
    """msgspec struct for a directory node (tag ``"directory"``).

    A directory holds any mix of ``File`` and nested ``Directory`` entries.
    """

    # Must contain at least one character (min_length=1).
    name: Annotated[str, msgspec.Meta(min_length=1)]
    created_by: uuid.UUID
    created_at: datetime.datetime
    updated_at: datetime.datetime
    # Child nodes; the annotation refers back to this class.
    contents: list[Union[File, Directory]]
def bench_msgspec(data):
    """Run ``bench`` with msgspec's JSON encoder/decoder for ``Directory``.

    Args:
        data: Raw filesystem tree, converted to a ``Directory`` with
            ``msgspec.convert`` before timing.

    Returns:
        The result of ``bench`` for the msgspec callables.
    """
    encoder = msgspec.json.Encoder()
    decoder = msgspec.json.Decoder(Directory)
    return bench(
        data,
        encoder.encode,
        decoder.decode,
        lambda raw: msgspec.convert(raw, Directory),
    )
############################################################################# | |
# pydantic V2 # | |
############################################################################# | |
class FileModel(pydantic.BaseModel):
    """Pydantic model for a file node; counterpart of the msgspec ``File``."""

    # Only the literal "file" is accepted; it is also the default.
    type: Literal["file"] = "file"
    # Must contain at least one character (min_length=1).
    name: str = pydantic.Field(min_length=1)
    created_by: uuid.UUID
    created_at: datetime.datetime
    updated_at: datetime.datetime
    nbytes: pydantic.NonNegativeInt
class DirectoryModel(pydantic.BaseModel):
    """Pydantic model for a directory node; counterpart of ``Directory``.

    A directory holds any mix of nested ``DirectoryModel`` and ``FileModel``
    entries.
    """

    # Only the literal "directory" is accepted; it is also the default.
    type: Literal["directory"] = "directory"
    # Must contain at least one character (min_length=1).
    name: str = pydantic.Field(min_length=1)
    created_by: uuid.UUID
    created_at: datetime.datetime
    updated_at: datetime.datetime
    # Child nodes; the annotation refers back to this class.
    contents: list[Union[DirectoryModel, FileModel]]
def bench_pydantic_v2(data):
    """Run ``bench`` with the pydantic V2 JSON API on ``DirectoryModel``.

    Args:
        data: Raw filesystem tree; its keys are passed to ``DirectoryModel``
            as keyword arguments to build the message.

    Returns:
        The result of ``bench`` for the pydantic V2 callables.
    """

    def to_json(model):
        return model.model_dump_json()

    def build(raw):
        return DirectoryModel(**raw)

    return bench(
        raw_data=data,
        dumps=to_json,
        loads=DirectoryModel.model_validate_json,
        convert=build,
    )
############################################################################# | |
# pydantic V1 # | |
############################################################################# | |
class FileModelV1(pydantic.v1.BaseModel):
    """File node model built on ``pydantic.v1``; counterpart of ``FileModel``."""

    # Only the literal "file" is accepted; it is also the default.
    type: Literal["file"] = "file"
    # Must contain at least one character (min_length=1).
    name: str = pydantic.v1.Field(min_length=1)
    created_by: uuid.UUID
    created_at: datetime.datetime
    updated_at: datetime.datetime
    nbytes: pydantic.v1.NonNegativeInt
class DirectoryModelV1(pydantic.v1.BaseModel):
    """Directory node model built on ``pydantic.v1``.

    A directory holds any mix of nested ``DirectoryModelV1`` and
    ``FileModelV1`` entries.
    """

    # Only the literal "directory" is accepted; it is also the default.
    type: Literal["directory"] = "directory"
    # Must contain at least one character (min_length=1).
    name: str = pydantic.v1.Field(min_length=1)
    created_by: uuid.UUID
    created_at: datetime.datetime
    updated_at: datetime.datetime
    # Child nodes; the annotation refers back to this class.
    contents: list[Union[DirectoryModelV1, FileModelV1]]
def bench_pydantic_v1(data):
    """Run ``bench`` with the pydantic V1 JSON API on ``DirectoryModelV1``.

    Args:
        data: Raw filesystem tree; its keys are passed to
            ``DirectoryModelV1`` as keyword arguments to build the message.

    Returns:
        The result of ``bench`` for the pydantic V1 callables.
    """

    def to_json(model):
        return model.json()

    def build(raw):
        return DirectoryModelV1(**raw)

    return bench(
        raw_data=data,
        dumps=to_json,
        loads=DirectoryModelV1.parse_raw,
        convert=build,
    )
if __name__ == "__main__":
    N = 1000
    data = make_filesystem_data(N)

    # msgspec runs first: its timings are the baseline for the ratios below.
    ms_dumps, ms_loads = bench_msgspec(data)
    ms_total = ms_dumps + ms_loads
    baseline = {"dumps": ms_dumps, "loads": ms_loads, "total": ms_total}

    title = f"msgspec {msgspec.__version__}"
    print(title, "-" * len(title), sep="\n")
    for label, seconds in baseline.items():
        print(f"{label}: {seconds * 1e6:.1f} us")

    contenders = [
        (f"pydantic {pydantic.__version__}", bench_pydantic_v2),
        (f"pydantic {pydantic.v1.__version__}", bench_pydantic_v1),
    ]
    for title, func in contenders:
        # Print the heading before the (slow) measurement starts.
        print("", title, "-" * len(title), sep="\n")
        dumps, loads = func(data)
        measured = {"dumps": dumps, "loads": loads, "total": dumps + loads}
        for label, seconds in measured.items():
            ratio = seconds / baseline[label]
            print(f"{label}: {seconds * 1e6:.1f} us ({ratio:.1f}x slower)")
Since a few people have asked about how msgspec's performance compares to pydantic v2, I've updated the gist above with a benchmark that works with the current pydantic V2 betas. Since pydantic v1 is available as pydantic.v1, this benchmark also compares pydantic v1.
Results:
$ python bench.py
msgspec 0.16.0
--------------
dumps: 176.2 us
loads: 487.6 us
total: 663.9 us
pydantic 2.0b3
--------------
dumps: 3667.3 us (20.8x slower)
loads: 5763.2 us (11.8x slower)
total: 9430.5 us (14.2x slower)
pydantic 1.10.9
---------------
dumps: 16389.6 us (93.0x slower)
loads: 65533.8 us (134.4x slower)
total: 81923.4 us (123.4x slower)
This benchmark is a modified version of the one in the msgspec repo. In general my benchmarks show pydantic v2 is ~15-30x slower than msgspec at JSON encoding, and ~6-15x slower at JSON decoding. Whether that matters for your specific application is workload dependent. Also note that I'm not a pydantic expert, this was mainly for my own understanding of how these libraries compare. As always, I recommend doing your own benchmarks when making technical decisions.
Definitely worth either removing pydantic v1, or using the actual package - the pydantic.v1 code is not compiled with cython, unlike installing pydantic==1.10.9.
Thanks for the feedback Samuel! That's fair, although using the compiled version for V1 seems to have a minimal improvement on this benchmark:
msgspec vs pydantic V1 benchmark (using cython compiled pydantic V1 package)
from __future__ import annotations
import datetime
import random
import string
import timeit
import uuid
from typing import List, Literal, Union, Annotated
import msgspec
import pydantic
def make_filesystem_data(capacity):
    """Generate a tree structure representing a fake filesystem.

    The tree is made of plain dicts. Every node has ``type``, ``name``,
    ``created_by``, ``created_at`` and ``updated_at`` (ISO-8601 strings in
    UTC); directories additionally have ``contents`` (a list of child nodes)
    and files have ``nbytes``.

    The shape of the tree, the names, the timestamps and the sizes come from
    a ``random.Random(42)`` stream. The pool of ``created_by`` UUIDs is drawn
    from ``uuid.uuid4()`` and therefore differs between runs.

    Args:
        capacity: Number of nodes to generate, root directory included
            (for ``capacity >= 1``).

    Returns:
        The root directory node.
    """
    UTC = datetime.timezone.utc
    DATE_2018 = datetime.datetime(2018, 1, 1, tzinfo=UTC)
    DATE_2023 = datetime.datetime(2023, 1, 1, tzinfo=UTC)

    UUIDS = [str(uuid.uuid4()) for _ in range(30)]

    rand = random.Random(42)

    def randdt(lo, hi):
        # ``timestamp()`` returns a float; ``randint`` rejects non-integer
        # arguments on Python 3.12+, so truncate to whole seconds first.
        ts = rand.randint(int(lo.timestamp()), int(hi.timestamp()))
        # Convert straight to UTC. Going through naive local time and then
        # stamping it as UTC would shift the value by the local offset.
        return datetime.datetime.fromtimestamp(ts, UTC)

    def randstr(lo, hi=None):
        # With only ``lo`` the length is fixed; with both it is drawn from
        # the inclusive range [lo, hi].
        length = lo if hi is None else rand.randint(lo, hi)
        return "".join(rand.choices(string.ascii_letters, k=length))

    def make_node(is_dir):
        nonlocal capacity
        name = randstr(4, 30)
        created_by = rand.choice(UUIDS)
        created_at = randdt(DATE_2018, DATE_2023)
        updated_at = randdt(created_at, DATE_2023)
        data = {
            "type": "directory" if is_dir else "file",
            "name": name,
            "created_by": created_by,
            "created_at": created_at.isoformat(),
            "updated_at": updated_at.isoformat(),
        }
        if is_dir:
            # Never create more children than the remaining node budget.
            n = min(rand.randint(0, 30), capacity)
            capacity -= n
            data["contents"] = [make_node(rand.random() > 0.9) for _ in range(n)]
        else:
            data["nbytes"] = rand.randint(0, 1000000)
        return data

    # The root directory itself consumes one unit of the budget.
    capacity -= 1
    out = make_node(True)
    # Spend whatever budget is left on extra top-level entries.
    while capacity:
        capacity -= 1
        out["contents"].append(make_node(rand.random() > 0.9))
    return out
def bench(raw_data, dumps, loads, convert):
    """Time serialization and deserialization for one library.

    Args:
        raw_data: Input passed to ``convert`` to build the typed message.
        dumps: Callable taking the typed message and returning its encoding.
        loads: Callable taking that encoding and returning a typed message.
        convert: Callable turning ``raw_data`` into the typed message.

    Returns:
        A ``(dumps_time, loads_time)`` tuple of average seconds per call, as
        measured by ``timeit.Timer.autorange``.

    Raises:
        AssertionError: If decoding the encoded message does not compare
            equal to the original message.
    """
    msg = convert(raw_data)
    json_data = dumps(msg)

    # Sanity-check the round trip before timing anything. This is an explicit
    # raise rather than an ``assert`` statement so the check is not stripped
    # when Python runs with -O. The decoded copy is a temporary, so it is not
    # kept alive during the timed runs.
    if not (msg == loads(json_data)):
        raise AssertionError("decoded message does not equal the original")

    def seconds_per_call(func, data):
        # autorange() picks the loop count itself and returns (loops, total).
        timer = timeit.Timer(
            "func(data)", setup="", globals={"func": func, "data": data}
        )
        loops, elapsed = timer.autorange()
        return elapsed / loops

    dumps_time = seconds_per_call(dumps, msg)
    loads_time = seconds_per_call(loads, json_data)
    return dumps_time, loads_time
#############################################################################
# msgspec #
#############################################################################
class File(msgspec.Struct, tag="file"):
    """msgspec struct for a file node of the fake filesystem (tag ``"file"``)."""

    # Must contain at least one character (min_length=1).
    name: Annotated[str, msgspec.Meta(min_length=1)]
    created_by: uuid.UUID
    created_at: datetime.datetime
    updated_at: datetime.datetime
    # Must be greater than or equal to zero (ge=0).
    nbytes: Annotated[int, msgspec.Meta(ge=0)]
class Directory(msgspec.Struct, tag="directory"):
    """msgspec struct for a directory node (tag ``"directory"``).

    A directory holds any mix of ``File`` and nested ``Directory`` entries.
    """

    # Must contain at least one character (min_length=1).
    name: Annotated[str, msgspec.Meta(min_length=1)]
    created_by: uuid.UUID
    created_at: datetime.datetime
    updated_at: datetime.datetime
    # Child nodes; the annotation refers back to this class.
    contents: list[Union[File, Directory]]
def bench_msgspec(data):
    """Run ``bench`` with msgspec's JSON encoder/decoder for ``Directory``.

    Args:
        data: Raw filesystem tree, converted to a ``Directory`` with
            ``msgspec.convert`` before timing.

    Returns:
        The result of ``bench`` for the msgspec callables.
    """
    encoder = msgspec.json.Encoder()
    decoder = msgspec.json.Decoder(Directory)
    return bench(
        data,
        encoder.encode,
        decoder.decode,
        lambda raw: msgspec.convert(raw, Directory),
    )
#############################################################################
# pydantic V1 #
#############################################################################
class FileModel(pydantic.BaseModel):
    """Pydantic model for a file node; counterpart of the msgspec ``File``.

    NOTE(review): this copy of the script only exercises the models through
    ``bench_pydantic_v1`` (``.json()`` / ``parse_raw``), so it is presumably
    meant to run with a pydantic 1.x package installed - confirm.
    """

    # Only the literal "file" is accepted; it is also the default.
    type: Literal["file"] = "file"
    # Must contain at least one character (min_length=1).
    name: str = pydantic.Field(min_length=1)
    created_by: uuid.UUID
    created_at: datetime.datetime
    updated_at: datetime.datetime
    nbytes: pydantic.NonNegativeInt
class DirectoryModel(pydantic.BaseModel):
    """Pydantic model for a directory node; counterpart of ``Directory``.

    A directory holds any mix of nested ``DirectoryModel`` and ``FileModel``
    entries.
    """

    # Only the literal "directory" is accepted; it is also the default.
    type: Literal["directory"] = "directory"
    # Must contain at least one character (min_length=1).
    name: str = pydantic.Field(min_length=1)
    created_by: uuid.UUID
    created_at: datetime.datetime
    updated_at: datetime.datetime
    # Child nodes; the annotation refers back to this class.
    contents: list[Union[DirectoryModel, FileModel]]
def bench_pydantic_v1(data):
    """Run ``bench`` with the pydantic V1 JSON API on ``DirectoryModel``.

    Args:
        data: Raw filesystem tree; its keys are passed to ``DirectoryModel``
            as keyword arguments to build the message.

    Returns:
        The result of ``bench`` for the pydantic V1 callables.
    """

    def to_json(model):
        return model.json()

    def build(raw):
        return DirectoryModel(**raw)

    return bench(
        raw_data=data,
        dumps=to_json,
        loads=DirectoryModel.parse_raw,
        convert=build,
    )
if __name__ == "__main__":
    N = 1000
    data = make_filesystem_data(N)

    # msgspec runs first: its timings are the baseline for the ratios below.
    ms_dumps, ms_loads = bench_msgspec(data)
    ms_total = ms_dumps + ms_loads
    baseline = {"dumps": ms_dumps, "loads": ms_loads, "total": ms_total}

    title = f"msgspec {msgspec.__version__}"
    print(title, "-" * len(title), sep="\n")
    for label, seconds in baseline.items():
        print(f"{label}: {seconds * 1e6:.1f} us")

    contenders = [
        (f"pydantic {pydantic.__version__}", bench_pydantic_v1),
    ]
    for title, func in contenders:
        # Print the heading before the (slow) measurement starts.
        print("", title, "-" * len(title), sep="\n")
        dumps, loads = func(data)
        measured = {"dumps": dumps, "loads": loads, "total": dumps + loads}
        for label, seconds in measured.items():
            ratio = seconds / baseline[label]
            print(f"{label}: {seconds * 1e6:.1f} us ({ratio:.1f}x slower)")
Output:
msgspec 0.16.0
--------------
dumps: 178.6 us
loads: 497.8 us
total: 676.3 us
pydantic 1.10.9
---------------
dumps: 18206.4 us (102.0x slower)
loads: 55122.1 us (110.7x slower)
total: 73328.5 us (108.4x slower)
Either way, the main point of the benchmark in this gist was to compare pydantic V2 and msgspec, happy to remove v1 if it's a distraction.
Up to you, just making the observation really.
quick update on the numbers as pydantic v2 became stable:
msgspec 0.16.0
--------------
dumps: 179.3 us
loads: 477.0 us
total: 656.3 us
pydantic 2.0.1
--------------
dumps: 4292.0 us (23.9x slower)
loads: 6666.6 us (14.0x slower)
total: 10958.6 us (16.7x slower)
pydantic 1.10.11
----------------
dumps: 24176.3 us (134.8x slower)
loads: 73471.1 us (154.0x slower)
total: 97647.4 us (148.8x slower)
This bench doesn't work on pydantic-core version 0.33.0: