Skip to content

Instantly share code, notes, and snippets.

@jcrist
Last active January 11, 2024 14:34
Show Gist options
  • Star 13 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jcrist/d62f450594164d284fbea957fd48b743 to your computer and use it in GitHub Desktop.
Save jcrist/d62f450594164d284fbea957fd48b743 to your computer and use it in GitHub Desktop.
A quick benchmark comparing msgspec (https://github.com/jcrist/msgspec), pydantic v1, and pydantic v2
"""A quick benchmark comparing the performance of:
- msgspec: https://github.com/jcrist/msgspec
- pydantic V1: https://docs.pydantic.dev/1.10/
- pydantic V2: https://docs.pydantic.dev/dev-v2/
The benchmark is modified from the one in the msgspec repo here:
https://github.com/jcrist/msgspec/blob/main/benchmarks/bench_validation.py
I make no claims that it's illustrative of all use cases. I wrote this up
mostly to get an understanding of how msgspec's performance compares with that
of pydantic V2.
"""
from __future__ import annotations
import datetime
import random
import string
import timeit
import uuid
from typing import List, Literal, Union, Annotated
import msgspec
import pydantic
import pydantic.v1
def make_filesystem_data(capacity):
    """Generate a tree structure representing a fake filesystem.

    Args:
        capacity: Total number of nodes to generate (files plus directories,
            counting the root directory itself).

    Returns:
        A nested ``dict`` describing the root directory. Every node carries
        ``type``, ``name``, ``created_by``, ``created_at`` and ``updated_at``
        (ISO 8601 strings in UTC); directories add ``contents`` (a list of
        child nodes) and files add ``nbytes``.

    The tree shape, names and timestamps come from a ``random.Random`` seeded
    with 42. The ``created_by`` UUIDs come from ``uuid.uuid4()`` and therefore
    differ between runs.
    """
    UTC = datetime.timezone.utc
    DATE_2018 = datetime.datetime(2018, 1, 1, tzinfo=UTC)
    DATE_2023 = datetime.datetime(2023, 1, 1, tzinfo=UTC)
    UUIDS = [str(uuid.uuid4()) for _ in range(30)]
    rand = random.Random(42)

    def randdt(start, end):
        # ``timestamp()`` returns a float, and ``randint`` rejects floats on
        # Python 3.12+, so truncate to whole seconds first.
        ts = rand.randint(int(start.timestamp()), int(end.timestamp()))
        # Convert directly into UTC. Going through naive local time and then
        # stamping UTC on it would shift the value by the machine's UTC
        # offset and could push it past ``end``.
        return datetime.datetime.fromtimestamp(ts, tz=UTC)

    def randstr(length, upper=None):
        # With ``upper`` given, pick a random length in [length, upper].
        if upper is not None:
            length = rand.randint(length, upper)
        return "".join(rand.choices(string.ascii_letters, k=length))

    def make_node(is_dir):
        nonlocal capacity
        name = randstr(4, 30)
        created_by = rand.choice(UUIDS)
        created_at = randdt(DATE_2018, DATE_2023)
        updated_at = randdt(created_at, DATE_2023)
        data = {
            "type": "directory" if is_dir else "file",
            "name": name,
            "created_by": created_by,
            "created_at": created_at.isoformat(),
            "updated_at": updated_at.isoformat(),
        }
        if is_dir:
            # Reserve the children from the shared budget before building
            # them, so nested directories cannot overspend it.
            n = min(rand.randint(0, 30), capacity)
            capacity -= n
            data["contents"] = [make_node(rand.random() > 0.9) for _ in range(n)]
        else:
            data["nbytes"] = rand.randint(0, 1000000)
        return data

    capacity -= 1  # the root directory itself
    out = make_node(True)
    # Spend whatever budget is left on extra top-level entries.
    while capacity:
        capacity -= 1
        out["contents"].append(make_node(rand.random() > 0.9))
    return out
def bench(raw_data, dumps, loads, convert, *, number=None):
    """Time one library's serialize and deserialize callables.

    Args:
        raw_data: Plain Python data passed to ``convert``.
        dumps: Callable that serializes the converted message.
        loads: Callable that deserializes the output of ``dumps``.
        convert: Callable that turns ``raw_data`` into the message object.
        number: Optional fixed number of calls to time. When ``None`` (the
            default) ``timeit.Timer.autorange`` chooses the loop count.

    Returns:
        A ``(dumps_time, loads_time)`` tuple of seconds per call.

    Raises:
        AssertionError: If ``loads(dumps(msg))`` does not compare equal to
            ``msg``.
        ValueError: If ``number`` is given and is less than 1.
    """
    if number is not None and number < 1:
        raise ValueError("number must be a positive integer")

    msg = convert(raw_data)
    json_data = dumps(msg)
    msg2 = loads(json_data)
    # Raise explicitly rather than using ``assert`` so the sanity check still
    # runs under ``python -O``.
    if not (msg == msg2):
        raise AssertionError("loads(dumps(msg)) did not round-trip to an equal message")
    del msg2

    def time_call(func, arg):
        # Average seconds per ``func(arg)`` call.
        timer = timeit.Timer(
            "func(data)", setup="", globals={"func": func, "data": arg}
        )
        if number is None:
            n, t = timer.autorange()
        else:
            n, t = number, timer.timeit(number)
        return t / n

    dumps_time = time_call(dumps, msg)
    loads_time = time_call(loads, json_data)
    return dumps_time, loads_time
#############################################################################
# msgspec #
#############################################################################
class File(msgspec.Struct, tag="file"):
    """msgspec schema for a file node of the generated filesystem tree."""

    # Must contain at least one character.
    name: Annotated[str, msgspec.Meta(min_length=1)]
    created_by: uuid.UUID
    created_at: datetime.datetime
    updated_at: datetime.datetime
    # Must be >= 0.
    nbytes: Annotated[int, msgspec.Meta(ge=0)]
class Directory(msgspec.Struct, tag="directory"):
    """msgspec schema for a directory node of the generated filesystem tree."""

    # Must contain at least one character.
    name: Annotated[str, msgspec.Meta(min_length=1)]
    created_by: uuid.UUID
    created_at: datetime.datetime
    updated_at: datetime.datetime
    # Children are files or nested directories (self-referential type).
    contents: List[Union[File, Directory]]
def bench_msgspec(data):
    """Benchmark msgspec JSON encoding/decoding of ``data`` as a ``Directory``.

    Returns whatever ``bench`` returns for the msgspec encoder and decoder.
    """
    # Build the encoder and typed decoder once, outside the timed calls.
    enc = msgspec.json.Encoder()
    dec = msgspec.json.Decoder(Directory)

    def convert(data):
        return msgspec.convert(data, Directory)

    return bench(data, enc.encode, dec.decode, convert)
#############################################################################
# pydantic V2 #
#############################################################################
class FileModel(pydantic.BaseModel):
    """pydantic V2 model for a file node of the generated filesystem tree."""

    type: Literal["file"] = "file"
    # Must contain at least one character.
    name: str = pydantic.Field(min_length=1)
    created_by: uuid.UUID
    created_at: datetime.datetime
    updated_at: datetime.datetime
    nbytes: pydantic.NonNegativeInt
class DirectoryModel(pydantic.BaseModel):
    """pydantic V2 model for a directory node of the generated filesystem tree."""

    type: Literal["directory"] = "directory"
    # Must contain at least one character.
    name: str = pydantic.Field(min_length=1)
    created_by: uuid.UUID
    created_at: datetime.datetime
    updated_at: datetime.datetime
    # Children are nested directories or files (self-referential type).
    contents: List[Union[DirectoryModel, FileModel]]
def bench_pydantic_v2(data):
    """Benchmark pydantic V2 JSON dumping/loading of ``data`` as a ``DirectoryModel``.

    Returns whatever ``bench`` returns for the pydantic V2 callables.
    """
    return bench(
        data,
        lambda p: p.model_dump_json(),
        DirectoryModel.model_validate_json,
        lambda data: DirectoryModel(**data),
    )
#############################################################################
# pydantic V1 #
#############################################################################
class FileModelV1(pydantic.v1.BaseModel):
    """pydantic V1 (``pydantic.v1``) model for a file node of the filesystem tree."""

    type: Literal["file"] = "file"
    # Must contain at least one character.
    name: str = pydantic.v1.Field(min_length=1)
    created_by: uuid.UUID
    created_at: datetime.datetime
    updated_at: datetime.datetime
    nbytes: pydantic.v1.NonNegativeInt
class DirectoryModelV1(pydantic.v1.BaseModel):
    """pydantic V1 (``pydantic.v1``) model for a directory node of the filesystem tree."""

    type: Literal["directory"] = "directory"
    # Must contain at least one character.
    name: str = pydantic.v1.Field(min_length=1)
    created_by: uuid.UUID
    created_at: datetime.datetime
    updated_at: datetime.datetime
    # Children are nested directories or files (self-referential type).
    contents: List[Union[DirectoryModelV1, FileModelV1]]
def bench_pydantic_v1(data):
    """Benchmark pydantic V1 JSON dumping/loading of ``data`` as a ``DirectoryModelV1``.

    Returns whatever ``bench`` returns for the pydantic V1 callables.
    """
    return bench(
        data,
        lambda p: p.json(),
        DirectoryModelV1.parse_raw,
        lambda data: DirectoryModelV1(**data),
    )
if __name__ == "__main__":
    # Build one shared dataset so every library is timed on identical input.
    N = 1000
    data = make_filesystem_data(N)

    # msgspec runs first; its timings are the baseline for the "x slower"
    # ratios printed for the other libraries.
    ms_dumps, ms_loads = bench_msgspec(data)
    ms_total = ms_dumps + ms_loads
    title = f"msgspec {msgspec.__version__}"
    print(title)
    print("-" * len(title))
    # Timings are seconds per call; scale to microseconds for display.
    print(f"dumps: {ms_dumps * 1e6:.1f} us")
    print(f"loads: {ms_loads * 1e6:.1f} us")
    print(f"total: {ms_total * 1e6:.1f} us")

    for title, func in [
        (f"pydantic {pydantic.__version__}", bench_pydantic_v2),
        (f"pydantic {pydantic.v1.__version__}", bench_pydantic_v1)
    ]:
        print()
        print(title)
        print("-" * len(title))
        dumps, loads = func(data)
        total = dumps + loads
        print(f"dumps: {dumps * 1e6:.1f} us ({dumps / ms_dumps:.1f}x slower)")
        print(f"loads: {loads * 1e6:.1f} us ({loads / ms_loads:.1f}x slower)")
        print(f"total: {total * 1e6:.1f} us ({total / ms_total:.1f}x slower)")
@jcrist
Copy link
Author

jcrist commented Jul 23, 2022

The benchmark above has been updated and expanded, this comment does not reflect current results.

The most recent benchmark results are here: https://gist.github.com/jcrist/d62f450594164d284fbea957fd48b743?permalink_comment_id=4612531#gistcomment-4612531

If you want to see the old benchmark versions, click the "revisions" tab at the top

This benchmark compares the performance of:

The encoded & decoded results appear correct, but I am not a developer of pydantic_core - there is a chance that I have done something naive or wrong here.

$ python bench_pydantic_v2.py
msgspec.json
  dumps: 259.10 us
  loads: 525.30 us
  total: 784.40 us
msgspec.msgpack
  dumps: 182.03 us
  loads: 516.69 us
  total: 698.72 us
pydantic v2 + orjson
  dumps: 424.24 us
  loads: 3359.00 us
  total: 3783.24 us

For this benchmark, msgspec was roughly 6.4X faster for decode and 1.6X faster for encode.

@samuelcolvin
Copy link

Hi @jcrist, thanks so much for this.

First of all, msgspec looks really impressive, congratulations. Although msgspec and pydantic have different aims and features, it's definitely fair to say pydantic now has a new benchmark to work towards.

A few observations on your benchmarks here:

  • The dumping benchmarks can be kind of ignored since you're not actually running any pydantic-core code when dumping here - just orjson processing dataclasses. Dumping in pydantic V2 will be worked on soon, but until then I think it's best to only consider what you call decoding / loading.
  • When setting up pydantic, it's probably fairer to use an empty class for AddressModel and PersonModel, e.g. just
class AddressModel:
    __slots__ = '__dict__', '__fields_set__'

rather than a dataclass, this will provide the same functionality (for decoding / loading / validating) as dataclasses, but saves ~5%.

  • It looks like msgspec.Struct is the fundamental base type for msgspec which is built in C, the equivalent in pydantic-core is really a dict (e.g. with fields defined via a TypedDict), therefore it could be argued that it's fairer to remove the model-class and just use a plain typed-dict in these benchmarks, that saves ~40%.
  • Lastly (and this is not a reflection on your benchmarks at all), it looks like the JSON decoding is taking a massive amount of the time here for pydantic-core (~75% of load time), I would expect the custom JSON decoder I'm hoping to build once pydantic V2 is released will improve this a lot. Unfortunately it's not possible to compare msgspec and pydantic-core while validating a python object since msgspec obviously only supports JSON and msgpack as inputs.

@jcrist
Copy link
Author

jcrist commented Jul 25, 2022

Thanks @samuelcolvin!

I've definitely put a lot of effort into making msgspec performant and easy to use. I expect over time pydantic-core will close this gap, it's nice to see the speedups you've achieved so far. Normally I wouldn't compare against an unreleased library, but did so since you asked :).

therefore it could be argued that it's fairer to remove the model-class and just use a plain typed-dict in these benchmarks, that saves ~40%.

I've updated and expanded the benchmark above to reflect your suggestions. No sense in having a flawed benchmark hanging around! I'm seeing a mild speedup using a typed-dict for the pydantic v2 benchmarks, but not a 40% speedup. Can you point out what I'm doing wrong?

New results:

$ python bench.py 
msgspec objects:     512.63 μs
msgspec dicts:       632.61 μs (1.2x slower)
pydantic v2 dicts:   3092.02 μs (6.0x slower)
pydantic v2 objects: 3338.06 μs (6.5x slower)
pydantic v1 objects: 17021.38 μs (33.2x slower)

I would expect the custom JSON decoder I'm hoping to build once pydantic V2 is released will improve this a lot.

I'm curious - I thought you were already using serde-json here to skip the extra object allocation? What more are you hoping to achieve here?

@samuelcolvin
Copy link

(sorry for the slow reply, I somehow missed the notification about your response)

Can you point out what I'm doing wrong?

Nothing exactly "wrong", but you can remove "return_fields_set": True, since we don't need the info on which fields have been set in this case. That should improve performance a bit.

I'm curious - I thought you were already using serde-json here to skip the extra object allocation? What more are you hoping to achieve here?

Currently we decode the JSON into a Value (I have a custom implementation, but it's very close to serde-json's vanilla Value), then we pass that value to the validator which performs the validation. This means:

  • we have to do all the JSON parsing, and allocate maps and arrays before we can start decoding
  • we don't know anything about the field type when parsing the next JSON object

(I assume, msgspec can avoid these allocations and also use information about the field type to take some shortcuts during validation?)

My plan (very rough at the moment) is to provide an implementation of pydantic-core's Input which is internally parsing JSON as its methods are called, this should avoid allocation (except in the case of Unions) and allow for lots of optimisations in parsing.

I'm also curious about threading and whether I can efficiently separate the tasks of parsing and validation into separate threads to give greater speedup. What do you think?

I think much of this would theoretically be possible with serde-json and its Visitor, but since serde-json doesn't provide line-numbers in a way pydantic-core can use, I think I'll have to build my own library anyway, so I'm putting off further optimisations until I have time to do it properly.

@jcrist
Copy link
Author

jcrist commented Aug 4, 2022

Nothing exactly "wrong", but you can remove "return_fields_set": True, since we don't need the info on which fields have been set in this case. That should improve performance a bit.

Thanks. I've updated the benchmark again, final results (based on pydantic-core commit 8f60642e2767c2b17b8d558500f0650942599ebb):

$ python bench.py
msgspec objects:     504.59 μs
msgspec dicts:       636.33 μs (1.3x slower)
pydantic v2 dicts:   2408.97 μs (4.8x slower)
pydantic v2 objects: 3462.42 μs (6.9x slower)
pydantic v1 objects: 17243.01 μs (34.2x slower)
pydantic v1 dicts:   43185.05 μs (85.6x slower)

(I assume, msgspec can avoid these allocations and also use information about the field type to take some shortcuts during validation?)

This is correct. msgspec does the minimal amount of work required when decoding - at this point most of the bottleneck in any decode step is in cpython allocation costs (sometimes up to 80% of the time). We even have a bunch of hacks to avoid allocating if unnecessary. I don't think we can get much faster here.

I'm also curious about threading and whether I can efficiently separate the tasks of parsing and validation into separate threads to give greater speedup. What do you think?

Given that msgspec is mostly bound by cpython allocation/initialization (which requires the GIL), I'm not sure how much of a benefit threading will get you here. That said, pydantic's needs are different than those of msgspec, perhaps you have more work you need to do which could be parallelized or done in a background thread.

@ZeN220
Copy link

ZeN220 commented May 20, 2023

This bench doesn't work on pydantic-core version 0.33.0:

pydantic_core._pydantic_core.SchemaError: Invalid Schema:
list.items_schema
  Input tag 'new-class' found using 'type' does not match any of the expected tags: 'any', 'none', 'bool', 'int', 'float', 'str', 'bytes', 'date', 'time', 'datetime', 'timedelta', 'literal', 'is-instance', 'is-subclass', 'callable', 'list', 'tuple-positional', 'tuple-variable', 'set', 'frozenset', 'generator', 'dict', 'function-after', 'function-before', 'function-wrap', 'function-plain', 'default', 'nullable', 'union', 'tagged-union', 'chain', 'lax-or-strict', 'json-or-python', 'typed-dict', 'model-fields', 'model', 'dataclass-args', 'dataclass', 'arguments', 'call', 'custom-error', 'json', 'url', 'multi-host-url', 'definitions', 'definition-ref' [type=union_tag_invalid, input_value={'type': 'new-class', 'cl...'}}, 'default': None}}}}, input_type=dict]
    For further information visit https://errors.pydantic.dev/0.33.0/v/union_tag_invalid

@jcrist
Copy link
Author

jcrist commented Jun 28, 2023

Since a few people have asked about how msgspec's performance compares to pydantic v2, I've updated the gist above with a benchmark that works with the current pydantic V2 betas. Since pydantic v1 is available as pydantic.v1 this benchmark also compares pydantic v1.

Results:

$ python bench.py 
msgspec 0.16.0
--------------
dumps: 176.2 us
loads: 487.6 us
total: 663.9 us

pydantic 2.0b3
--------------
dumps: 3667.3 us (20.8x slower)
loads: 5763.2 us (11.8x slower)
total: 9430.5 us (14.2x slower)

pydantic 1.10.9
---------------
dumps: 16389.6 us (93.0x slower)
loads: 65533.8 us (134.4x slower)
total: 81923.4 us (123.4x slower)

This benchmark is a modified version of the one in the msgspec repo. In general my benchmarks show pydantic v2 is ~15-30x slower than msgspec at JSON encoding, and ~6-15x slower at JSON decoding. Whether that matters for your specific application is workload dependent. Also note that I'm not a pydantic expert, this was mainly for my own understanding of how these libraries compare. As always, I recommend doing your own benchmarks when making technical decisions.

@samuelcolvin
Copy link

Definitely worth either removing pydantic v1, or using the actual package - the pydantic.v1 code is not compiled with cython unlike installing pydantic==1.10.9.

@jcrist
Copy link
Author

jcrist commented Jun 28, 2023

Thanks for the feedback Samuel! That's fair, although using the compiled version for V1 seems to have a minimal improvement on this benchmark:

msgspec vs pydantic V1 benchmark (using cython compiled pydantic V1 package)
from __future__ import annotations

import datetime
import random
import string
import timeit
import uuid
from typing import List, Literal, Union, Annotated

import msgspec
import pydantic


def make_filesystem_data(capacity):
    """Generate a tree structure representing a fake filesystem.

    Args:
        capacity: Total number of nodes to generate (files plus directories,
            counting the root directory itself).

    Returns:
        A nested ``dict`` describing the root directory. Every node carries
        ``type``, ``name``, ``created_by``, ``created_at`` and ``updated_at``
        (ISO 8601 strings in UTC); directories add ``contents`` (a list of
        child nodes) and files add ``nbytes``.

    The tree shape, names and timestamps come from a ``random.Random`` seeded
    with 42. The ``created_by`` UUIDs come from ``uuid.uuid4()`` and therefore
    differ between runs.
    """
    UTC = datetime.timezone.utc
    DATE_2018 = datetime.datetime(2018, 1, 1, tzinfo=UTC)
    DATE_2023 = datetime.datetime(2023, 1, 1, tzinfo=UTC)
    UUIDS = [str(uuid.uuid4()) for _ in range(30)]

    rand = random.Random(42)

    def randdt(start, end):
        # ``timestamp()`` returns a float, and ``randint`` rejects floats on
        # Python 3.12+, so truncate to whole seconds first.
        ts = rand.randint(int(start.timestamp()), int(end.timestamp()))
        # Convert directly into UTC. Going through naive local time and then
        # stamping UTC on it would shift the value by the machine's UTC
        # offset and could push it past ``end``.
        return datetime.datetime.fromtimestamp(ts, tz=UTC)

    def randstr(length, upper=None):
        # With ``upper`` given, pick a random length in [length, upper].
        if upper is not None:
            length = rand.randint(length, upper)
        return "".join(rand.choices(string.ascii_letters, k=length))

    def make_node(is_dir):
        nonlocal capacity

        name = randstr(4, 30)
        created_by = rand.choice(UUIDS)
        created_at = randdt(DATE_2018, DATE_2023)
        updated_at = randdt(created_at, DATE_2023)
        data = {
            "type": "directory" if is_dir else "file",
            "name": name,
            "created_by": created_by,
            "created_at": created_at.isoformat(),
            "updated_at": updated_at.isoformat(),
        }
        if is_dir:
            # Reserve the children from the shared budget before building
            # them, so nested directories cannot overspend it.
            n = min(rand.randint(0, 30), capacity)
            capacity -= n
            data["contents"] = [make_node(rand.random() > 0.9) for _ in range(n)]
        else:
            data["nbytes"] = rand.randint(0, 1000000)
        return data

    capacity -= 1  # the root directory itself
    out = make_node(True)
    # Spend whatever budget is left on extra top-level entries.
    while capacity:
        capacity -= 1
        out["contents"].append(make_node(rand.random() > 0.9))
    return out


def bench(raw_data, dumps, loads, convert, *, number=None):
    """Time one library's serialize and deserialize callables.

    Args:
        raw_data: Plain Python data passed to ``convert``.
        dumps: Callable that serializes the converted message.
        loads: Callable that deserializes the output of ``dumps``.
        convert: Callable that turns ``raw_data`` into the message object.
        number: Optional fixed number of calls to time. When ``None`` (the
            default) ``timeit.Timer.autorange`` chooses the loop count.

    Returns:
        A ``(dumps_time, loads_time)`` tuple of seconds per call.

    Raises:
        AssertionError: If ``loads(dumps(msg))`` does not compare equal to
            ``msg``.
        ValueError: If ``number`` is given and is less than 1.
    """
    if number is not None and number < 1:
        raise ValueError("number must be a positive integer")

    msg = convert(raw_data)
    json_data = dumps(msg)
    msg2 = loads(json_data)
    # Raise explicitly rather than using ``assert`` so the sanity check still
    # runs under ``python -O``.
    if not (msg == msg2):
        raise AssertionError("loads(dumps(msg)) did not round-trip to an equal message")
    del msg2

    def time_call(func, arg):
        # Average seconds per ``func(arg)`` call.
        timer = timeit.Timer(
            "func(data)", setup="", globals={"func": func, "data": arg}
        )
        if number is None:
            n, t = timer.autorange()
        else:
            n, t = number, timer.timeit(number)
        return t / n

    dumps_time = time_call(dumps, msg)
    loads_time = time_call(loads, json_data)
    return dumps_time, loads_time


#############################################################################
#  msgspec                                                                  #
#############################################################################


class File(msgspec.Struct, tag="file"):
    """msgspec schema for a file node of the generated filesystem tree."""
    # NOTE(review): tag="file" presumably matches the "type" key that
    # make_filesystem_data emits - confirm against msgspec's tag-field default.
    # Must contain at least one character.
    name: Annotated[str, msgspec.Meta(min_length=1)]
    created_by: uuid.UUID
    created_at: datetime.datetime
    updated_at: datetime.datetime
    # Must be >= 0.
    nbytes: Annotated[int, msgspec.Meta(ge=0)]


class Directory(msgspec.Struct, tag="directory"):
    """msgspec schema for a directory node of the generated filesystem tree."""
    # NOTE(review): tag="directory" presumably matches the "type" key that
    # make_filesystem_data emits - confirm against msgspec's tag-field default.
    # Must contain at least one character.
    name: Annotated[str, msgspec.Meta(min_length=1)]
    created_by: uuid.UUID
    created_at: datetime.datetime
    updated_at: datetime.datetime
    # Children are files or nested directories (self-referential type).
    contents: List[Union[File, Directory]]


def bench_msgspec(data):
    """Run ``bench`` with msgspec's JSON encoder/decoder for ``Directory``.

    The encoder and typed decoder are created once up front; ``bench`` then
    receives their bound ``encode``/``decode`` methods plus a converter that
    builds a ``Directory`` from the raw data.
    """
    encoder = msgspec.json.Encoder()
    decoder = msgspec.json.Decoder(Directory)
    return bench(
        data,
        encoder.encode,
        decoder.decode,
        lambda raw: msgspec.convert(raw, Directory),
    )


#############################################################################
#  pydantic V1                                                              #
#############################################################################


class FileModel(pydantic.BaseModel):
    """pydantic model for a file node of the generated filesystem tree."""
    # Fixed to the literal "file"; defaults to it when omitted.
    type: Literal["file"] = "file"
    # Must contain at least one character.
    name: str = pydantic.Field(min_length=1)
    created_by: uuid.UUID
    created_at: datetime.datetime
    updated_at: datetime.datetime
    nbytes: pydantic.NonNegativeInt


class DirectoryModel(pydantic.BaseModel):
    """pydantic model for a directory node of the generated filesystem tree."""
    # Fixed to the literal "directory"; defaults to it when omitted.
    type: Literal["directory"] = "directory"
    # Must contain at least one character.
    name: str = pydantic.Field(min_length=1)
    created_by: uuid.UUID
    created_at: datetime.datetime
    updated_at: datetime.datetime
    # Children are nested directories or files (self-referential type).
    contents: List[Union[DirectoryModel, FileModel]]


def bench_pydantic_v1(data):
    """Run ``bench`` with ``DirectoryModel`` through ``.json()`` / ``parse_raw``."""

    def to_json(model):
        # Serialize a model instance to a JSON string.
        return model.json()

    def build(raw):
        # Construct (and validate) the model from the raw dict.
        return DirectoryModel(**raw)

    return bench(data, to_json, DirectoryModel.parse_raw, build)


if __name__ == "__main__":
    # One shared dataset so every library is timed on identical input.
    N = 1000
    data = make_filesystem_data(N)

    # msgspec runs first; its timings are the baseline for the ratios below.
    ms_dumps, ms_loads = bench_msgspec(data)
    ms_total = ms_dumps + ms_loads
    title = f"msgspec {msgspec.__version__}"
    print(title)
    print("-" * len(title))
    # Timings are seconds per call; scale to microseconds for display.
    for label, seconds in (
        ("dumps", ms_dumps),
        ("loads", ms_loads),
        ("total", ms_total),
    ):
        print(f"{label}: {seconds * 1e6:.1f} us")

    contenders = [(f"pydantic {pydantic.__version__}", bench_pydantic_v1)]
    for title, func in contenders:
        print()
        print(title)
        print("-" * len(title))
        dumps, loads = func(data)
        total = dumps + loads
        for label, seconds, baseline in (
            ("dumps", dumps, ms_dumps),
            ("loads", loads, ms_loads),
            ("total", total, ms_total),
        ):
            print(f"{label}: {seconds * 1e6:.1f} us ({seconds / baseline:.1f}x slower)")

Output:

msgspec 0.16.0
--------------
dumps: 178.6 us
loads: 497.8 us
total: 676.3 us

pydantic 1.10.9
---------------
dumps: 18206.4 us (102.0x slower)
loads: 55122.1 us (110.7x slower)
total: 73328.5 us (108.4x slower)

Either way, the main point of the benchmark in this gist was to compare pydantic V2 and msgspec, happy to remove v1 if it's a distraction.

@samuelcolvin
Copy link

Up to you, just making the observation really.

@legraphista
Copy link

quick update on the numbers as pydantic v2 became stable:

msgspec 0.16.0
--------------
dumps: 179.3 us
loads: 477.0 us
total: 656.3 us

pydantic 2.0.1
--------------
dumps: 4292.0 us (23.9x slower)
loads: 6666.6 us (14.0x slower)
total: 10958.6 us (16.7x slower)

pydantic 1.10.11
----------------
dumps: 24176.3 us (134.8x slower)
loads: 73471.1 us (154.0x slower)
total: 97647.4 us (148.8x slower)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment