Skip to content

Instantly share code, notes, and snippets.

@jcrist
Last active January 11, 2024 14:34
Show Gist options
  • Star 13 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jcrist/d62f450594164d284fbea957fd48b743 to your computer and use it in GitHub Desktop.
Save jcrist/d62f450594164d284fbea957fd48b743 to your computer and use it in GitHub Desktop.
A quick benchmark comparing msgspec (https://github.com/jcrist/msgspec), pydantic v1, and pydantic v2
"""A quick benchmark comparing the performance of:
- msgspec: https://github.com/jcrist/msgspec
- pydantic V1: https://docs.pydantic.dev/1.10/
- pydantic V2: https://docs.pydantic.dev/dev-v2/
The benchmark is modified from the one in the msgspec repo here:
https://github.com/jcrist/msgspec/blob/main/benchmarks/bench_validation.py
I make no claims that it's illustrative of all use cases. I wrote this up
mostly to get an understanding of how msgspec's performance compares with that
of pydantic V2.
"""
from __future__ import annotations
import datetime
import random
import string
import timeit
import uuid
from typing import List, Literal, Union, Annotated
import msgspec
import pydantic
import pydantic.v1
def make_filesystem_data(capacity):
"""Generate a tree structure representing a fake filesystem"""
UTC = datetime.timezone.utc
DATE_2018 = datetime.datetime(2018, 1, 1, tzinfo=UTC)
DATE_2023 = datetime.datetime(2023, 1, 1, tzinfo=UTC)
UUIDS = [str(uuid.uuid4()) for _ in range(30)]
rand = random.Random(42)
def randdt(min, max):
ts = rand.randint(min.timestamp(), max.timestamp())
return datetime.datetime.fromtimestamp(ts).replace(tzinfo=UTC)
def randstr(min=None, max=None):
if max is not None:
min = rand.randint(min, max)
return "".join(rand.choices(string.ascii_letters, k=min))
def make_node(is_dir):
nonlocal capacity
name = randstr(4, 30)
created_by = rand.choice(UUIDS)
created_at = randdt(DATE_2018, DATE_2023)
updated_at = randdt(created_at, DATE_2023)
data = {
"type": "directory" if is_dir else "file",
"name": name,
"created_by": created_by,
"created_at": created_at.isoformat(),
"updated_at": updated_at.isoformat(),
}
if is_dir:
n = min(rand.randint(0, 30), capacity)
capacity -= n
data["contents"] = [make_node(rand.random() > 0.9) for _ in range(n)]
else:
data["nbytes"] = rand.randint(0, 1000000)
return data
capacity -= 1
out = make_node(True)
while capacity:
capacity -= 1
out["contents"].append(make_node(rand.random() > 0.9))
return out
def bench(raw_data, dumps, loads, convert):
msg = convert(raw_data)
json_data = dumps(msg)
msg2 = loads(json_data)
assert msg == msg2
del msg2
timer = timeit.Timer("func(data)", setup="", globals={"func": dumps, "data": msg})
n, t = timer.autorange()
dumps_time = t / n
timer = timeit.Timer(
"func(data)", setup="", globals={"func": loads, "data": json_data}
)
n, t = timer.autorange()
loads_time = t / n
return dumps_time, loads_time
#############################################################################
# msgspec #
#############################################################################
class File(msgspec.Struct, tag="file"):
name: Annotated[str, msgspec.Meta(min_length=1)]
created_by: uuid.UUID
created_at: datetime.datetime
updated_at: datetime.datetime
nbytes: Annotated[int, msgspec.Meta(ge=0)]
class Directory(msgspec.Struct, tag="directory"):
name: Annotated[str, msgspec.Meta(min_length=1)]
created_by: uuid.UUID
created_at: datetime.datetime
updated_at: datetime.datetime
contents: List[Union[File, Directory]]
def bench_msgspec(data):
enc = msgspec.json.Encoder()
dec = msgspec.json.Decoder(Directory)
def convert(data):
return msgspec.convert(data, Directory)
return bench(data, enc.encode, dec.decode, convert)
#############################################################################
# pydantic V2 #
#############################################################################
class FileModel(pydantic.BaseModel):
type: Literal["file"] = "file"
name: str = pydantic.Field(min_length=1)
created_by: uuid.UUID
created_at: datetime.datetime
updated_at: datetime.datetime
nbytes: pydantic.NonNegativeInt
class DirectoryModel(pydantic.BaseModel):
type: Literal["directory"] = "directory"
name: str = pydantic.Field(min_length=1)
created_by: uuid.UUID
created_at: datetime.datetime
updated_at: datetime.datetime
contents: List[Union[DirectoryModel, FileModel]]
def bench_pydantic_v2(data):
return bench(
data,
lambda p: p.model_dump_json(),
DirectoryModel.model_validate_json,
lambda data: DirectoryModel(**data),
)
#############################################################################
# pydantic V1 #
#############################################################################
class FileModelV1(pydantic.v1.BaseModel):
type: Literal["file"] = "file"
name: str = pydantic.v1.Field(min_length=1)
created_by: uuid.UUID
created_at: datetime.datetime
updated_at: datetime.datetime
nbytes: pydantic.v1.NonNegativeInt
class DirectoryModelV1(pydantic.v1.BaseModel):
type: Literal["directory"] = "directory"
name: str = pydantic.v1.Field(min_length=1)
created_by: uuid.UUID
created_at: datetime.datetime
updated_at: datetime.datetime
contents: List[Union[DirectoryModelV1, FileModelV1]]
def bench_pydantic_v1(data):
return bench(
data,
lambda p: p.json(),
DirectoryModelV1.parse_raw,
lambda data: DirectoryModelV1(**data),
)
if __name__ == "__main__":
N = 1000
data = make_filesystem_data(N)
ms_dumps, ms_loads = bench_msgspec(data)
ms_total = ms_dumps + ms_loads
title = f"msgspec {msgspec.__version__}"
print(title)
print("-" * len(title))
print(f"dumps: {ms_dumps * 1e6:.1f} us")
print(f"loads: {ms_loads * 1e6:.1f} us")
print(f"total: {ms_total * 1e6:.1f} us")
for title, func in [
(f"pydantic {pydantic.__version__}", bench_pydantic_v2),
(f"pydantic {pydantic.v1.__version__}", bench_pydantic_v1)
]:
print()
print(title)
print("-" * len(title))
dumps, loads = func(data)
total = dumps + loads
print(f"dumps: {dumps * 1e6:.1f} us ({dumps / ms_dumps:.1f}x slower)")
print(f"loads: {loads * 1e6:.1f} us ({loads / ms_loads:.1f}x slower)")
print(f"total: {total * 1e6:.1f} us ({total / ms_total:.1f}x slower)")
@ZeN220
Copy link

ZeN220 commented May 20, 2023

This is bench doesn't work on 0.33.0 version pydantic-core:

pydantic_core._pydantic_core.SchemaError: Invalid Schema:
list.items_schema
  Input tag 'new-class' found using 'type' does not match any of the expected tags: 'any', 'none', 'bool', 'int', 'float', 'str', 'bytes', 'date', 'time', 'datetime', 'timedelta', 'literal', 'is-instance', 'is-subclass', 'callable', 'list', 'tuple-positional', 'tuple-variable', 'set', 'frozenset', 'generator', 'dict', 'function-after', 'function-before', 'function-wrap', 'function-plain', 'default', 'nullable', 'union', 'tagged-union', 'chain', 'lax-or-strict', 'json-or-python', 'typed-dict', 'model-fields', 'model', 'dataclass-args', 'dataclass', 'arguments', 'call', 'custom-error', 'json', 'url', 'multi-host-url', 'definitions', 'definition-ref' [type=union_tag_invalid, input_value={'type': 'new-class', 'cl...'}}, 'default': None}}}}, input_type=dict]
    For further information visit https://errors.pydantic.dev/0.33.0/v/union_tag_invalid```

@jcrist
Copy link
Author

jcrist commented Jun 28, 2023

Since a few people have asked about how msgspec's performance compares to pydantic v2, I've updated the gist above with a benchmark that works with the current pydantic V2 betas. Since pydantic v1 is available as pydantic.v1 this benchmark also compares pydantic v1.

Results:

$ python bench.py 
msgspec 0.16.0
--------------
dumps: 176.2 us
loads: 487.6 us
total: 663.9 us

pydantic 2.0b3
--------------
dumps: 3667.3 us (20.8x slower)
loads: 5763.2 us (11.8x slower)
total: 9430.5 us (14.2x slower)

pydantic 1.10.9
---------------
dumps: 16389.6 us (93.0x slower)
loads: 65533.8 us (134.4x slower)
total: 81923.4 us (123.4x slower)

This benchmark is a modified version of the one in the msgspec repo. In general my benchmarks show pydantic v2 is ~15-30x slower than msgspec at JSON encoding, and ~6-15x slower at JSON decoding. Whether that matters for your specific application is workload dependent. Also note that I'm not a pydantic expert, this was mainly for my own understanding of how these libraries compare. As always, I recommend doing your own benchmarks when making technical decisions.

@samuelcolvin
Copy link

Definitely worth either removing pydantic v1, or using the actual package - the pydantic.v1 code is not complied with cython unlike installing pydantic==1.10.9.

@jcrist
Copy link
Author

jcrist commented Jun 28, 2023

Thanks for the feedback Samuel! That's fair, although using the compiled version for V1 seems to have a minimal improvement on this benchmark:

msgspec vs pydantic V1 benchmark (using cython compiled pydantic V1 package)
from __future__ import annotations

import datetime
import random
import string
import timeit
import uuid
from typing import List, Literal, Union, Annotated

import msgspec
import pydantic


def make_filesystem_data(capacity):
    """Generate a tree structure representing a fake filesystem"""
    UTC = datetime.timezone.utc
    DATE_2018 = datetime.datetime(2018, 1, 1, tzinfo=UTC)
    DATE_2023 = datetime.datetime(2023, 1, 1, tzinfo=UTC)
    UUIDS = [str(uuid.uuid4()) for _ in range(30)]

    rand = random.Random(42)

    def randdt(min, max):
        ts = rand.randint(min.timestamp(), max.timestamp())
        return datetime.datetime.fromtimestamp(ts).replace(tzinfo=UTC)

    def randstr(min=None, max=None):
        if max is not None:
            min = rand.randint(min, max)
        return "".join(rand.choices(string.ascii_letters, k=min))

    def make_node(is_dir):
        nonlocal capacity

        name = randstr(4, 30)
        created_by = rand.choice(UUIDS)
        created_at = randdt(DATE_2018, DATE_2023)
        updated_at = randdt(created_at, DATE_2023)
        data = {
            "type": "directory" if is_dir else "file",
            "name": name,
            "created_by": created_by,
            "created_at": created_at.isoformat(),
            "updated_at": updated_at.isoformat(),
        }
        if is_dir:
            n = min(rand.randint(0, 30), capacity)
            capacity -= n
            data["contents"] = [make_node(rand.random() > 0.9) for _ in range(n)]
        else:
            data["nbytes"] = rand.randint(0, 1000000)
        return data

    capacity -= 1
    out = make_node(True)
    while capacity:
        capacity -= 1
        out["contents"].append(make_node(rand.random() > 0.9))
    return out


def bench(raw_data, dumps, loads, convert):
    msg = convert(raw_data)
    json_data = dumps(msg)
    msg2 = loads(json_data)
    assert msg == msg2
    del msg2

    timer = timeit.Timer("func(data)", setup="", globals={"func": dumps, "data": msg})
    n, t = timer.autorange()
    dumps_time = t / n

    timer = timeit.Timer(
        "func(data)", setup="", globals={"func": loads, "data": json_data}
    )
    n, t = timer.autorange()
    loads_time = t / n
    return dumps_time, loads_time


#############################################################################
#  msgspec                                                                  #
#############################################################################


class File(msgspec.Struct, tag="file"):
    name: Annotated[str, msgspec.Meta(min_length=1)]
    created_by: uuid.UUID
    created_at: datetime.datetime
    updated_at: datetime.datetime
    nbytes: Annotated[int, msgspec.Meta(ge=0)]


class Directory(msgspec.Struct, tag="directory"):
    name: Annotated[str, msgspec.Meta(min_length=1)]
    created_by: uuid.UUID
    created_at: datetime.datetime
    updated_at: datetime.datetime
    contents: List[Union[File, Directory]]


def bench_msgspec(data):
    enc = msgspec.json.Encoder()
    dec = msgspec.json.Decoder(Directory)

    def convert(data):
        return msgspec.convert(data, Directory)

    return bench(data, enc.encode, dec.decode, convert)


#############################################################################
#  pydantic V2                                                              #
#############################################################################


class FileModel(pydantic.BaseModel):
    type: Literal["file"] = "file"
    name: str = pydantic.Field(min_length=1)
    created_by: uuid.UUID
    created_at: datetime.datetime
    updated_at: datetime.datetime
    nbytes: pydantic.NonNegativeInt


class DirectoryModel(pydantic.BaseModel):
    type: Literal["directory"] = "directory"
    name: str = pydantic.Field(min_length=1)
    created_by: uuid.UUID
    created_at: datetime.datetime
    updated_at: datetime.datetime
    contents: List[Union[DirectoryModel, FileModel]]


def bench_pydantic_v1(data):
    return bench(
        data,
        lambda p: p.json(),
        DirectoryModel.parse_raw,
        lambda data: DirectoryModel(**data),
    )


if __name__ == "__main__":
    N = 1000
    data = make_filesystem_data(N)
    ms_dumps, ms_loads = bench_msgspec(data)
    ms_total = ms_dumps + ms_loads
    title = f"msgspec {msgspec.__version__}"
    print(title)
    print("-" * len(title))
    print(f"dumps: {ms_dumps * 1e6:.1f} us")
    print(f"loads: {ms_loads * 1e6:.1f} us")
    print(f"total: {ms_total * 1e6:.1f} us")

    for title, func in [
        (f"pydantic {pydantic.__version__}", bench_pydantic_v1)
    ]:
        print()
        print(title)
        print("-" * len(title))
        dumps, loads = func(data)
        total = dumps + loads
        print(f"dumps: {dumps * 1e6:.1f} us ({dumps / ms_dumps:.1f}x slower)")
        print(f"loads: {loads * 1e6:.1f} us ({loads / ms_loads:.1f}x slower)")
        print(f"total: {total * 1e6:.1f} us ({total / ms_total:.1f}x slower)")

Output:

msgspec 0.16.0
--------------
dumps: 178.6 us
loads: 497.8 us
total: 676.3 us

pydantic 1.10.9
---------------
dumps: 18206.4 us (102.0x slower)
loads: 55122.1 us (110.7x slower)
total: 73328.5 us (108.4x slower)

Either way, the main point of the benchmark in this gist was to compare pydantic V2 and msgspec, happy to remove v1 if it's a distraction.

@samuelcolvin
Copy link

Up to you, just making the observation really.

@legraphista
Copy link

quick update on the numbers as pydantic v2 became stable:

msgspec 0.16.0
--------------
dumps: 179.3 us
loads: 477.0 us
total: 656.3 us

pydantic 2.0.1
--------------
dumps: 4292.0 us (23.9x slower)
loads: 6666.6 us (14.0x slower)
total: 10958.6 us (16.7x slower)

pydantic 1.10.11
----------------
dumps: 24176.3 us (134.8x slower)
loads: 73471.1 us (154.0x slower)
total: 97647.4 us (148.8x slower)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment