Skip to content

Instantly share code, notes, and snippets.

@jcrist
Last active January 11, 2024 14:34
Show Gist options
  • Star 13 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jcrist/d62f450594164d284fbea957fd48b743 to your computer and use it in GitHub Desktop.
Save jcrist/d62f450594164d284fbea957fd48b743 to your computer and use it in GitHub Desktop.
A quick benchmark comparing msgspec (https://github.com/jcrist/msgspec), pydantic v1, and pydantic v2
"""A quick benchmark comparing the performance of:
- msgspec: https://github.com/jcrist/msgspec
- pydantic V1: https://docs.pydantic.dev/1.10/
- pydantic V2: https://docs.pydantic.dev/dev-v2/
The benchmark is modified from the one in the msgspec repo here:
https://github.com/jcrist/msgspec/blob/main/benchmarks/bench_validation.py
I make no claims that it's illustrative of all use cases. I wrote this up
mostly to get an understanding of how msgspec's performance compares with that
of pydantic V2.
"""
from __future__ import annotations
import datetime
import random
import string
import timeit
import uuid
from typing import List, Literal, Union, Annotated
import msgspec
import pydantic
import pydantic.v1
def make_filesystem_data(capacity):
    """Generate a tree structure representing a fake filesystem.

    Every random choice (layout, names, timestamps and user UUIDs) is drawn
    from a single ``random.Random(42)`` instance, so two calls with the same
    ``capacity`` return equal trees.

    Args:
        capacity: Total number of nodes (files plus directories, the root
            included) to generate. Values below 1 still yield a lone root
            directory with no contents.

    Returns:
        A nested ``dict`` describing the root directory. Every node has the
        keys ``type``, ``name``, ``created_by``, ``created_at`` and
        ``updated_at`` (ISO 8601 strings in UTC); directories add
        ``contents`` (a list of child nodes) and files add ``nbytes``.
    """
    UTC = datetime.timezone.utc
    DATE_2018 = datetime.datetime(2018, 1, 1, tzinfo=UTC)
    DATE_2023 = datetime.datetime(2023, 1, 1, tzinfo=UTC)

    rand = random.Random(42)
    # Derive the UUIDs from the seeded generator (uuid4() would make every
    # run produce different data).
    UUIDS = [
        str(uuid.UUID(int=rand.getrandbits(128), version=4)) for _ in range(30)
    ]

    def randdt(earliest, latest):
        # randint() needs real integers: float bounds raise TypeError on
        # Python 3.12+.
        ts = rand.randint(int(earliest.timestamp()), int(latest.timestamp()))
        # Build the datetime directly in UTC. Converting to local time and
        # then relabelling it as UTC shifts the value by the local offset,
        # which can push it outside [earliest, latest].
        return datetime.datetime.fromtimestamp(ts, tz=UTC)

    def randstr(min_len, max_len):
        length = rand.randint(min_len, max_len)
        return "".join(rand.choices(string.ascii_letters, k=length))

    def make_node(is_dir):
        nonlocal capacity

        created_at = None
        name = randstr(4, 30)
        created_by = rand.choice(UUIDS)
        created_at = randdt(DATE_2018, DATE_2023)
        updated_at = randdt(created_at, DATE_2023)
        data = {
            "type": "directory" if is_dir else "file",
            "name": name,
            "created_by": created_by,
            "created_at": created_at.isoformat(),
            "updated_at": updated_at.isoformat(),
        }
        if is_dir:
            # Reserve the children's share of the budget before recursing so
            # nested directories cannot overspend it.
            n = min(rand.randint(0, 30), capacity)
            capacity -= n
            data["contents"] = [make_node(rand.random() > 0.9) for _ in range(n)]
        else:
            data["nbytes"] = rand.randint(0, 1000000)
        return data

    capacity -= 1  # the root directory itself
    out = make_node(True)
    # Spend whatever budget is left on extra top-level entries.
    while capacity:
        capacity -= 1
        out["contents"].append(make_node(rand.random() > 0.9))
    return out
def bench(raw_data, dumps, loads, convert):
    """Time JSON encoding and decoding for one library.

    Args:
        raw_data: Plain Python data handed to ``convert``.
        dumps: Callable that serializes a converted message.
        loads: Callable that parses the output of ``dumps`` back into a
            message.
        convert: Callable that turns ``raw_data`` into the library's message
            type.

    Returns:
        A ``(dumps_time, loads_time)`` tuple with the average number of
        seconds per call, as measured by ``timeit.Timer.autorange``.

    Raises:
        AssertionError: If decoding the encoded message does not give back a
            message equal to the original.
    """
    msg = convert(raw_data)
    json_data = dumps(msg)
    # Raised explicitly (instead of a bare `assert`) so the sanity check is
    # not stripped when Python runs with -O.
    if not msg == loads(json_data):
        raise AssertionError("round trip through dumps/loads changed the message")

    def seconds_per_call(func, arg):
        timer = timeit.Timer(
            "func(data)", setup="", globals={"func": func, "data": arg}
        )
        loops, elapsed = timer.autorange()
        return elapsed / loops

    dumps_time = seconds_per_call(dumps, msg)
    loads_time = seconds_per_call(loads, json_data)
    return dumps_time, loads_time
#############################################################################
# msgspec #
#############################################################################
class File(msgspec.Struct, tag="file"):
    """msgspec schema for a file node (declared with ``tag="file"``)."""

    # Must be non-empty.
    name: Annotated[str, msgspec.Meta(min_length=1)]
    created_by: uuid.UUID
    created_at: datetime.datetime
    updated_at: datetime.datetime
    # Must be >= 0.
    nbytes: Annotated[int, msgspec.Meta(ge=0)]
class Directory(msgspec.Struct, tag="directory"):
    """msgspec schema for a directory node (declared with ``tag="directory"``)."""

    # Must be non-empty.
    name: Annotated[str, msgspec.Meta(min_length=1)]
    created_by: uuid.UUID
    created_at: datetime.datetime
    updated_at: datetime.datetime
    # Children may be files or further directories, so the type is recursive.
    contents: List[Union[File, Directory]]
def bench_msgspec(data):
    """Benchmark msgspec on ``data`` and return ``bench``'s result.

    The encoder and decoder are created once, before timing starts, and only
    their ``encode`` / ``decode`` methods are handed to ``bench``.
    """
    enc = msgspec.json.Encoder()
    dec = msgspec.json.Decoder(Directory)

    def convert(data):
        return msgspec.convert(data, Directory)

    return bench(data, enc.encode, dec.decode, convert)
#############################################################################
# pydantic V2 #
#############################################################################
class FileModel(pydantic.BaseModel):
    """pydantic model for a file node."""

    # Only the literal "file" is accepted; it is also the default.
    type: Literal["file"] = "file"
    # Must be non-empty.
    name: str = pydantic.Field(min_length=1)
    created_by: uuid.UUID
    created_at: datetime.datetime
    updated_at: datetime.datetime
    nbytes: pydantic.NonNegativeInt
class DirectoryModel(pydantic.BaseModel):
    """pydantic model for a directory node."""

    # Only the literal "directory" is accepted; it is also the default.
    type: Literal["directory"] = "directory"
    # Must be non-empty.
    name: str = pydantic.Field(min_length=1)
    created_by: uuid.UUID
    created_at: datetime.datetime
    updated_at: datetime.datetime
    # Children may be directories or files, so the type is recursive.
    contents: List[Union[DirectoryModel, FileModel]]
def bench_pydantic_v2(data):
    """Benchmark the pydantic V2 models on ``data`` and return ``bench``'s result.

    Encoding is timed through ``model_dump_json`` and decoding through
    ``DirectoryModel.model_validate_json``.
    """
    return bench(
        data,
        lambda p: p.model_dump_json(),
        DirectoryModel.model_validate_json,
        lambda data: DirectoryModel(**data),
    )
#############################################################################
# pydantic V1 #
#############################################################################
class FileModelV1(pydantic.v1.BaseModel):
    """pydantic V1 (``pydantic.v1``) model for a file node."""

    # Only the literal "file" is accepted; it is also the default.
    type: Literal["file"] = "file"
    # Must be non-empty.
    name: str = pydantic.v1.Field(min_length=1)
    created_by: uuid.UUID
    created_at: datetime.datetime
    updated_at: datetime.datetime
    nbytes: pydantic.v1.NonNegativeInt
class DirectoryModelV1(pydantic.v1.BaseModel):
    """pydantic V1 (``pydantic.v1``) model for a directory node."""

    # Only the literal "directory" is accepted; it is also the default.
    type: Literal["directory"] = "directory"
    # Must be non-empty.
    name: str = pydantic.v1.Field(min_length=1)
    created_by: uuid.UUID
    created_at: datetime.datetime
    updated_at: datetime.datetime
    # Children may be directories or files, so the type is recursive.
    contents: List[Union[DirectoryModelV1, FileModelV1]]
def bench_pydantic_v1(data):
    """Benchmark the pydantic V1 models on ``data`` and return ``bench``'s result.

    Encoding is timed through ``.json()`` and decoding through
    ``DirectoryModelV1.parse_raw``.
    """
    return bench(
        data,
        lambda p: p.json(),
        DirectoryModelV1.parse_raw,
        lambda data: DirectoryModelV1(**data),
    )
if __name__ == "__main__":
    # One generated tree is shared by every library under test.
    N = 1000
    data = make_filesystem_data(N)

    # msgspec runs first; its timings are the baseline for the ratios below.
    ms_dumps, ms_loads = bench_msgspec(data)
    ms_total = ms_dumps + ms_loads
    title = f"msgspec {msgspec.__version__}"
    print(title)
    print("-" * len(title))
    print(f"dumps: {ms_dumps * 1e6:.1f} us")
    print(f"loads: {ms_loads * 1e6:.1f} us")
    print(f"total: {ms_total * 1e6:.1f} us")

    for title, func in [
        (f"pydantic {pydantic.__version__}", bench_pydantic_v2),
        (f"pydantic {pydantic.v1.__version__}", bench_pydantic_v1),
    ]:
        print()
        print(title)
        print("-" * len(title))
        dumps, loads = func(data)
        total = dumps + loads
        print(f"dumps: {dumps * 1e6:.1f} us ({dumps / ms_dumps:.1f}x slower)")
        print(f"loads: {loads * 1e6:.1f} us ({loads / ms_loads:.1f}x slower)")
        print(f"total: {total * 1e6:.1f} us ({total / ms_total:.1f}x slower)")
@samuelcolvin
Copy link

(sorry for the slow reply, I somehow missed the notification about your response)

Can you point out what I'm doing wrong?

Nothing exactly "wrong", but you can remove "return_fields_set": True, since we don't need the info on which fields have been set in this case. That should improve performance a bit.

I'm curious - I thought you were already using serde-json here to skip the extra object allocation? What more are you hoping to achieve here?

Currently we decode the JSON into a Value (I have a custom implementation, but it's very close to serde-json's vanilla Value), then we pass that value to the validator which performs the validation. This means:

  • we have to do all the JSON parsing, and allocate maps and arrays before we can start decoding
  • we don't know anything about the field type when parsing the next JSON object

(I assume, msgspec can avoid these allocations and also use information about the field type to take some shortcuts during validation?)

My plan (very rough at the moment) is to provide an implementation of pydantic-core's Input which is internally parsing JSON as its methods are called, this should avoid allocation (except in the case of Unions) and allow for lots of optimisations in parsing.

I'm also curious about threading and whether I can efficiently separate the tasks of parsing and validation into separate threads to give greater speedup. What do you think?

I think much of this would theoretically be possible with serde-json and its Visitor, but since serde-json doesn't provide line-numbers in a way pydantic-core can use, I think I'll have to build my own library anyway, so I'm putting off further optimisations until I have time to do it properly.

@jcrist
Copy link
Author

jcrist commented Aug 4, 2022

Nothing exactly "wrong", but you can remove "return_fields_set": True, since we don't need the info on which fields have been set in this case. That should improve performance a bit.

Thanks. I've updated the benchmark again, final results (based on pydantic-core commit 8f60642e2767c2b17b8d558500f0650942599ebb):

$ python bench.py
msgspec objects:     504.59 μs
msgspec dicts:       636.33 μs (1.3x slower)
pydantic v2 dicts:   2408.97 μs (4.8x slower)
pydantic v2 objects: 3462.42 μs (6.9x slower)
pydantic v1 objects: 17243.01 μs (34.2x slower)
pydantic v1 dicts:   43185.05 μs (85.6x slower)

(I assume, msgspec can avoid these allocations and also use information about the field type to take some shortcuts during validation?)

This is correct. msgspec does the minimal amount of work required when decoding - at this point most of the bottleneck in any decode step is in cpython allocation costs (sometimes up to 80% of the time). We even have a bunch of hacks to avoid allocating if unnecessary. I don't think we can get much faster here.

I'm also curious about threading and whether I can efficiently separate the tasks of parsing and validation into separate threads to give greater speedup. What do you think?

Given that msgspec is mostly bound by cpython allocation/initialization (which requires the GIL), I'm not sure how much of a benefit threading will get you here. That said, pydantic's needs are different than those of msgspec, perhaps you have more work you need to do which could be parallelized or done in a background thread.

@ZeN220
Copy link

ZeN220 commented May 20, 2023

This benchmark doesn't work with pydantic-core version 0.33.0:

pydantic_core._pydantic_core.SchemaError: Invalid Schema:
list.items_schema
  Input tag 'new-class' found using 'type' does not match any of the expected tags: 'any', 'none', 'bool', 'int', 'float', 'str', 'bytes', 'date', 'time', 'datetime', 'timedelta', 'literal', 'is-instance', 'is-subclass', 'callable', 'list', 'tuple-positional', 'tuple-variable', 'set', 'frozenset', 'generator', 'dict', 'function-after', 'function-before', 'function-wrap', 'function-plain', 'default', 'nullable', 'union', 'tagged-union', 'chain', 'lax-or-strict', 'json-or-python', 'typed-dict', 'model-fields', 'model', 'dataclass-args', 'dataclass', 'arguments', 'call', 'custom-error', 'json', 'url', 'multi-host-url', 'definitions', 'definition-ref' [type=union_tag_invalid, input_value={'type': 'new-class', 'cl...'}}, 'default': None}}}}, input_type=dict]
    For further information visit https://errors.pydantic.dev/0.33.0/v/union_tag_invalid

@jcrist
Copy link
Author

jcrist commented Jun 28, 2023

Since a few people have asked about how msgspec's performance compares to pydantic v2, I've updated the gist above with a benchmark that works with the current pydantic V2 betas. Since pydantic v1 is available as pydantic.v1 this benchmark also compares pydantic v1.

Results:

$ python bench.py 
msgspec 0.16.0
--------------
dumps: 176.2 us
loads: 487.6 us
total: 663.9 us

pydantic 2.0b3
--------------
dumps: 3667.3 us (20.8x slower)
loads: 5763.2 us (11.8x slower)
total: 9430.5 us (14.2x slower)

pydantic 1.10.9
---------------
dumps: 16389.6 us (93.0x slower)
loads: 65533.8 us (134.4x slower)
total: 81923.4 us (123.4x slower)

This benchmark is a modified version of the one in the msgspec repo. In general my benchmarks show pydantic v2 is ~15-30x slower than msgspec at JSON encoding, and ~6-15x slower at JSON decoding. Whether that matters for your specific application is workload dependent. Also note that I'm not a pydantic expert, this was mainly for my own understanding of how these libraries compare. As always, I recommend doing your own benchmarks when making technical decisions.

@samuelcolvin
Copy link

Definitely worth either removing pydantic v1, or using the actual package - the pydantic.v1 code is not compiled with cython, unlike installing pydantic==1.10.9.

@jcrist
Copy link
Author

jcrist commented Jun 28, 2023

Thanks for the feedback Samuel! That's fair, although using the compiled version for V1 seems to have a minimal improvement on this benchmark:

msgspec vs pydantic V1 benchmark (using cython compiled pydantic V1 package)
from __future__ import annotations

import datetime
import random
import string
import timeit
import uuid
from typing import List, Literal, Union, Annotated

import msgspec
import pydantic


def make_filesystem_data(capacity):
    """Generate a tree structure representing a fake filesystem.

    Every random choice (layout, names, timestamps and user UUIDs) is drawn
    from a single ``random.Random(42)`` instance, so two calls with the same
    ``capacity`` return equal trees.

    Args:
        capacity: Total number of nodes (files plus directories, the root
            included) to generate. Values below 1 still yield a lone root
            directory with no contents.

    Returns:
        A nested ``dict`` describing the root directory. Every node has the
        keys ``type``, ``name``, ``created_by``, ``created_at`` and
        ``updated_at`` (ISO 8601 strings in UTC); directories add
        ``contents`` (a list of child nodes) and files add ``nbytes``.
    """
    UTC = datetime.timezone.utc
    DATE_2018 = datetime.datetime(2018, 1, 1, tzinfo=UTC)
    DATE_2023 = datetime.datetime(2023, 1, 1, tzinfo=UTC)

    rand = random.Random(42)
    # Derive the UUIDs from the seeded generator (uuid4() would make every
    # run produce different data).
    UUIDS = [
        str(uuid.UUID(int=rand.getrandbits(128), version=4)) for _ in range(30)
    ]

    def randdt(earliest, latest):
        # randint() needs real integers: float bounds raise TypeError on
        # Python 3.12+.
        ts = rand.randint(int(earliest.timestamp()), int(latest.timestamp()))
        # Build the datetime directly in UTC. Converting to local time and
        # then relabelling it as UTC shifts the value by the local offset,
        # which can push it outside [earliest, latest].
        return datetime.datetime.fromtimestamp(ts, tz=UTC)

    def randstr(min_len, max_len):
        length = rand.randint(min_len, max_len)
        return "".join(rand.choices(string.ascii_letters, k=length))

    def make_node(is_dir):
        nonlocal capacity

        name = randstr(4, 30)
        created_by = rand.choice(UUIDS)
        created_at = randdt(DATE_2018, DATE_2023)
        updated_at = randdt(created_at, DATE_2023)
        data = {
            "type": "directory" if is_dir else "file",
            "name": name,
            "created_by": created_by,
            "created_at": created_at.isoformat(),
            "updated_at": updated_at.isoformat(),
        }
        if is_dir:
            # Reserve the children's share of the budget before recursing so
            # nested directories cannot overspend it.
            n = min(rand.randint(0, 30), capacity)
            capacity -= n
            data["contents"] = [make_node(rand.random() > 0.9) for _ in range(n)]
        else:
            data["nbytes"] = rand.randint(0, 1000000)
        return data

    capacity -= 1  # the root directory itself
    out = make_node(True)
    # Spend whatever budget is left on extra top-level entries.
    while capacity:
        capacity -= 1
        out["contents"].append(make_node(rand.random() > 0.9))
    return out


def bench(raw_data, dumps, loads, convert):
    """Time JSON encoding and decoding for one library.

    Args:
        raw_data: Plain Python data handed to ``convert``.
        dumps: Callable that serializes a converted message.
        loads: Callable that parses the output of ``dumps`` back into a
            message.
        convert: Callable that turns ``raw_data`` into the library's message
            type.

    Returns:
        A ``(dumps_time, loads_time)`` tuple with the average number of
        seconds per call, as measured by ``timeit.Timer.autorange``.

    Raises:
        AssertionError: If decoding the encoded message does not give back a
            message equal to the original.
    """
    msg = convert(raw_data)
    json_data = dumps(msg)
    # Raised explicitly (instead of a bare `assert`) so the sanity check is
    # not stripped when Python runs with -O.
    if not msg == loads(json_data):
        raise AssertionError("round trip through dumps/loads changed the message")

    def seconds_per_call(func, arg):
        timer = timeit.Timer(
            "func(data)", setup="", globals={"func": func, "data": arg}
        )
        loops, elapsed = timer.autorange()
        return elapsed / loops

    dumps_time = seconds_per_call(dumps, msg)
    loads_time = seconds_per_call(loads, json_data)
    return dumps_time, loads_time


#############################################################################
#  msgspec                                                                  #
#############################################################################


class File(msgspec.Struct, tag="file"):
    """msgspec schema for a file node (declared with ``tag="file"``)."""

    # Must be non-empty.
    name: Annotated[str, msgspec.Meta(min_length=1)]
    created_by: uuid.UUID
    created_at: datetime.datetime
    updated_at: datetime.datetime
    # Must be >= 0.
    nbytes: Annotated[int, msgspec.Meta(ge=0)]


class Directory(msgspec.Struct, tag="directory"):
    """msgspec schema for a directory node (declared with ``tag="directory"``)."""

    # Must be non-empty.
    name: Annotated[str, msgspec.Meta(min_length=1)]
    created_by: uuid.UUID
    created_at: datetime.datetime
    updated_at: datetime.datetime
    # Children may be files or further directories, so the type is recursive.
    contents: List[Union[File, Directory]]


def bench_msgspec(data):
    """Benchmark msgspec on ``data`` and return ``bench``'s result.

    The encoder and decoder are created once, before timing starts, and only
    their ``encode`` / ``decode`` methods are handed to ``bench``.
    """
    encoder = msgspec.json.Encoder()
    decoder = msgspec.json.Decoder(Directory)
    return bench(
        data,
        encoder.encode,
        decoder.decode,
        lambda raw: msgspec.convert(raw, Directory),
    )


#############################################################################
#  pydantic V1                                                              #
#############################################################################


class FileModel(pydantic.BaseModel):
    """pydantic model for a file node."""

    # Only the literal "file" is accepted; it is also the default.
    type: Literal["file"] = "file"
    # Must be non-empty.
    name: str = pydantic.Field(min_length=1)
    created_by: uuid.UUID
    created_at: datetime.datetime
    updated_at: datetime.datetime
    nbytes: pydantic.NonNegativeInt


class DirectoryModel(pydantic.BaseModel):
    """pydantic model for a directory node."""

    # Only the literal "directory" is accepted; it is also the default.
    type: Literal["directory"] = "directory"
    # Must be non-empty.
    name: str = pydantic.Field(min_length=1)
    created_by: uuid.UUID
    created_at: datetime.datetime
    updated_at: datetime.datetime
    # Children may be directories or files, so the type is recursive.
    contents: List[Union[DirectoryModel, FileModel]]


def bench_pydantic_v1(data):
    """Benchmark the pydantic models on ``data`` and return ``bench``'s result.

    Encoding is timed through ``.json()`` and decoding through
    ``DirectoryModel.parse_raw``.
    """

    def to_json(model):
        return model.json()

    def from_raw(raw):
        return DirectoryModel(**raw)

    return bench(data, to_json, DirectoryModel.parse_raw, from_raw)


if __name__ == "__main__":
    # One generated tree is shared by every library under test.
    N = 1000
    data = make_filesystem_data(N)

    # msgspec runs first; its timings are the baseline for the ratios below.
    ms_dumps, ms_loads = bench_msgspec(data)
    ms_total = ms_dumps + ms_loads
    title = f"msgspec {msgspec.__version__}"
    print(title, "-" * len(title), sep="\n")
    print(f"dumps: {ms_dumps * 1e6:.1f} us")
    print(f"loads: {ms_loads * 1e6:.1f} us")
    print(f"total: {ms_total * 1e6:.1f} us")

    contenders = [(f"pydantic {pydantic.__version__}", bench_pydantic_v1)]
    for title, func in contenders:
        dumps, loads = func(data)
        total = dumps + loads
        print()
        print(title, "-" * len(title), sep="\n")
        print(f"dumps: {dumps * 1e6:.1f} us ({dumps / ms_dumps:.1f}x slower)")
        print(f"loads: {loads * 1e6:.1f} us ({loads / ms_loads:.1f}x slower)")
        print(f"total: {total * 1e6:.1f} us ({total / ms_total:.1f}x slower)")

Output:

msgspec 0.16.0
--------------
dumps: 178.6 us
loads: 497.8 us
total: 676.3 us

pydantic 1.10.9
---------------
dumps: 18206.4 us (102.0x slower)
loads: 55122.1 us (110.7x slower)
total: 73328.5 us (108.4x slower)

Either way, the main point of the benchmark in this gist was to compare pydantic V2 and msgspec, happy to remove v1 if it's a distraction.

@samuelcolvin
Copy link

Up to you, just making the observation really.

@legraphista
Copy link

quick update on the numbers as pydantic v2 became stable:

msgspec 0.16.0
--------------
dumps: 179.3 us
loads: 477.0 us
total: 656.3 us

pydantic 2.0.1
--------------
dumps: 4292.0 us (23.9x slower)
loads: 6666.6 us (14.0x slower)
total: 10958.6 us (16.7x slower)

pydantic 1.10.11
----------------
dumps: 24176.3 us (134.8x slower)
loads: 73471.1 us (154.0x slower)
total: 97647.4 us (148.8x slower)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment