Skip to content

Instantly share code, notes, and snippets.

@jcrist
Last active June 18, 2024 08:46
Show Gist options
  • Save jcrist/d62f450594164d284fbea957fd48b743 to your computer and use it in GitHub Desktop.
Save jcrist/d62f450594164d284fbea957fd48b743 to your computer and use it in GitHub Desktop.
A quick benchmark comparing msgspec (https://github.com/jcrist/msgspec), pydantic v1, and pydantic v2
"""A quick benchmark comparing the performance of:
- msgspec: https://github.com/jcrist/msgspec
- pydantic V1: https://docs.pydantic.dev/1.10/
- pydantic V2: https://docs.pydantic.dev/dev-v2/
The benchmark is modified from the one in the msgspec repo here:
https://github.com/jcrist/msgspec/blob/main/benchmarks/bench_validation.py
I make no claims that it's illustrative of all use cases. I wrote this up
mostly to get an understanding of how msgspec's performance compares with that
of pydantic V2.
"""
from __future__ import annotations
import datetime
import random
import string
import timeit
import uuid
from typing import List, Literal, Union, Annotated
import msgspec
import pydantic
import pydantic.v1
def make_filesystem_data(capacity):
    """Generate a tree structure representing a fake filesystem.

    Args:
        capacity: Total number of nodes to generate, counting the root
            directory as well as every nested file and directory.

    Returns:
        A nested ``dict`` describing the root directory. Every node has the
        keys ``type``, ``name``, ``created_by``, ``created_at`` and
        ``updated_at``. Directories additionally carry ``contents`` (a list
        of child nodes); files carry ``nbytes``.

    Names, timestamps, sizes and the tree shape come from a ``Random``
    seeded with 42. The ``created_by`` values are drawn from a pool built
    with ``uuid.uuid4()``, so they differ from call to call.
    """
    UTC = datetime.timezone.utc
    DATE_2018 = datetime.datetime(2018, 1, 1, tzinfo=UTC)
    DATE_2023 = datetime.datetime(2023, 1, 1, tzinfo=UTC)
    UUIDS = [str(uuid.uuid4()) for _ in range(30)]
    rand = random.Random(42)

    def randdt(lo, hi):
        """Return a random UTC datetime between ``lo`` and ``hi`` inclusive."""
        # ``timestamp()`` returns a float, and ``randint`` rejects float
        # arguments on Python 3.12+, so convert to int first.
        ts = rand.randint(int(lo.timestamp()), int(hi.timestamp()))
        # Convert straight to UTC. Going through naive local time and then
        # relabelling it as UTC would shift the value by the machine's UTC
        # offset and could push it past ``hi``.
        return datetime.datetime.fromtimestamp(ts, tz=UTC)

    def randstr(length, max_length=None):
        """Return random ASCII letters.

        With only ``length`` given the string has exactly that many
        characters; with ``max_length`` the size is drawn from
        ``[length, max_length]``.
        """
        if max_length is not None:
            length = rand.randint(length, max_length)
        return "".join(rand.choices(string.ascii_letters, k=length))

    def make_node(is_dir):
        """Build one node; a directory also builds its children."""
        nonlocal capacity
        name = randstr(4, 30)
        created_by = rand.choice(UUIDS)
        created_at = randdt(DATE_2018, DATE_2023)
        # An update can never precede the creation time.
        updated_at = randdt(created_at, DATE_2023)
        data = {
            "type": "directory" if is_dir else "file",
            "name": name,
            "created_by": created_by,
            "created_at": created_at.isoformat(),
            "updated_at": updated_at.isoformat(),
        }
        if is_dir:
            # Reserve the children from the shared budget before creating
            # them, so nested directories cannot overspend it.
            n = min(rand.randint(0, 30), capacity)
            capacity -= n
            data["contents"] = [make_node(rand.random() > 0.9) for _ in range(n)]
        else:
            data["nbytes"] = rand.randint(0, 1000000)
        return data

    # The root directory itself consumes one unit of the budget.
    capacity -= 1
    out = make_node(True)
    # Spend whatever budget is left on extra top-level entries.
    while capacity:
        capacity -= 1
        out["contents"].append(make_node(rand.random() > 0.9))
    return out
def bench(raw_data, dumps, loads, convert):
    """Time one library's serialize and deserialize calls.

    Args:
        raw_data: Plain input data handed to ``convert``.
        dumps: Callable that serializes the converted message.
        loads: Callable that parses the serialized form back into a message.
        convert: Callable that turns ``raw_data`` into the library's message.

    Returns:
        A ``(dumps_time, loads_time)`` tuple of seconds per single call.

    Raises:
        AssertionError: If a dumps/loads round trip does not reproduce the
            converted message.
    """

    def seconds_per_call(func, arg):
        # ``autorange`` picks the loop count itself and reports the total
        # time for that many calls.
        loops, elapsed = timeit.Timer(
            "func(data)", setup="", globals={"func": func, "data": arg}
        ).autorange()
        return elapsed / loops

    typed = convert(raw_data)
    encoded = dumps(typed)
    # Check one round trip before timing; the decoded copy is a temporary
    # and is dropped as soon as the comparison finishes.
    assert typed == loads(encoded)

    dumps_time = seconds_per_call(dumps, typed)
    loads_time = seconds_per_call(loads, encoded)
    return dumps_time, loads_time
#############################################################################
# msgspec #
#############################################################################
class File(msgspec.Struct, tag="file"):
    """msgspec struct for a file node, tagged ``"file"``."""

    # Names must be non-empty.
    name: Annotated[str, msgspec.Meta(min_length=1)]
    created_by: uuid.UUID
    created_at: datetime.datetime
    updated_at: datetime.datetime
    # Sizes may not be negative.
    nbytes: Annotated[int, msgspec.Meta(ge=0)]
class Directory(msgspec.Struct, tag="directory"):
    """msgspec struct for a directory node, tagged ``"directory"``."""

    # Names must be non-empty.
    name: Annotated[str, msgspec.Meta(min_length=1)]
    created_by: uuid.UUID
    created_at: datetime.datetime
    updated_at: datetime.datetime
    # Children are files or further directories, so the type is recursive.
    contents: List[Union[File, Directory]]
def bench_msgspec(data):
    """Run ``bench`` with msgspec's JSON encoder and decoder.

    ``data`` is the raw filesystem tree; it is converted to a ``Directory``
    before timing. Returns whatever ``bench`` returns.
    """
    # Encoder and decoder are built once here and reused for every timed call.
    encoder = msgspec.json.Encoder()
    decoder = msgspec.json.Decoder(Directory)
    return bench(
        data,
        encoder.encode,
        decoder.decode,
        lambda raw: msgspec.convert(raw, Directory),
    )
#############################################################################
# pydantic V2 #
#############################################################################
class FileModel(pydantic.BaseModel):
    """pydantic V2 model for a file node."""

    # Fixed literal marking this node as a file.
    type: Literal["file"] = "file"
    # Names must be non-empty.
    name: str = pydantic.Field(min_length=1)
    created_by: uuid.UUID
    created_at: datetime.datetime
    updated_at: datetime.datetime
    nbytes: pydantic.NonNegativeInt
class DirectoryModel(pydantic.BaseModel):
    """pydantic V2 model for a directory node."""

    # Fixed literal marking this node as a directory.
    type: Literal["directory"] = "directory"
    # Names must be non-empty.
    name: str = pydantic.Field(min_length=1)
    created_by: uuid.UUID
    created_at: datetime.datetime
    updated_at: datetime.datetime
    # Children are directories or files, so the type is recursive.
    contents: List[Union[DirectoryModel, FileModel]]
def bench_pydantic_v2(data):
    """Run ``bench`` with pydantic V2's JSON dump and validate methods.

    ``data`` is the raw filesystem tree; it is turned into a
    ``DirectoryModel`` before timing. Returns whatever ``bench`` returns.
    """

    def to_json(model):
        return model.model_dump_json()

    def from_raw(raw):
        return DirectoryModel(**raw)

    return bench(data, to_json, DirectoryModel.model_validate_json, from_raw)
#############################################################################
# pydantic V1 #
#############################################################################
class FileModelV1(pydantic.v1.BaseModel):
    """pydantic V1 model for a file node."""

    # Fixed literal marking this node as a file.
    type: Literal["file"] = "file"
    # Names must be non-empty.
    name: str = pydantic.v1.Field(min_length=1)
    created_by: uuid.UUID
    created_at: datetime.datetime
    updated_at: datetime.datetime
    nbytes: pydantic.v1.NonNegativeInt
class DirectoryModelV1(pydantic.v1.BaseModel):
    """pydantic V1 model for a directory node."""

    # Fixed literal marking this node as a directory.
    type: Literal["directory"] = "directory"
    # Names must be non-empty.
    name: str = pydantic.v1.Field(min_length=1)
    created_by: uuid.UUID
    created_at: datetime.datetime
    updated_at: datetime.datetime
    # Children are directories or files, so the type is recursive.
    contents: List[Union[DirectoryModelV1, FileModelV1]]
def bench_pydantic_v1(data):
    """Run ``bench`` with pydantic V1's ``json`` and ``parse_raw`` methods.

    ``data`` is the raw filesystem tree; it is turned into a
    ``DirectoryModelV1`` before timing. Returns whatever ``bench`` returns.
    """

    def to_json(model):
        return model.json()

    def from_raw(raw):
        return DirectoryModelV1(**raw)

    return bench(data, to_json, DirectoryModelV1.parse_raw, from_raw)
if __name__ == "__main__":
    N = 1000
    data = make_filesystem_data(N)

    def _print_heading(heading):
        """Print a heading underlined with dashes of the same length."""
        print(heading)
        print("-" * len(heading))

    # msgspec runs first; its timings are the baseline for the ratios
    # reported for the other libraries.
    ms_dumps, ms_loads = bench_msgspec(data)
    ms_total = ms_dumps + ms_loads
    _print_heading(f"msgspec {msgspec.__version__}")
    print(f"dumps: {ms_dumps * 1e6:.1f} us")
    print(f"loads: {ms_loads * 1e6:.1f} us")
    print(f"total: {ms_total * 1e6:.1f} us")

    contenders = (
        (f"pydantic {pydantic.__version__}", bench_pydantic_v2),
        (f"pydantic {pydantic.v1.__version__}", bench_pydantic_v1),
    )
    for title, func in contenders:
        # The heading is printed before the (slow) benchmark starts.
        print()
        _print_heading(title)
        dumps, loads = func(data)
        total = dumps + loads
        print(f"dumps: {dumps * 1e6:.1f} us ({dumps / ms_dumps:.1f}x slower)")
        print(f"loads: {loads * 1e6:.1f} us ({loads / ms_loads:.1f}x slower)")
        print(f"total: {total * 1e6:.1f} us ({total / ms_total:.1f}x slower)")
@legraphista
Copy link

Quick update on the numbers now that pydantic v2 is stable:

msgspec 0.16.0
--------------
dumps: 179.3 us
loads: 477.0 us
total: 656.3 us

pydantic 2.0.1
--------------
dumps: 4292.0 us (23.9x slower)
loads: 6666.6 us (14.0x slower)
total: 10958.6 us (16.7x slower)

pydantic 1.10.11
----------------
dumps: 24176.3 us (134.8x slower)
loads: 73471.1 us (154.0x slower)
total: 97647.4 us (148.8x slower)

@nrbnlulu
Copy link

Fix for Python 3.12: the line at
https://gist.github.com/jcrist/d62f450594164d284fbea957fd48b743#file-bench-py-L38
should be

ts = rand.randint(int(min.timestamp()), int(max.timestamp()))

@nrbnlulu
Copy link

BTW @samuelcolvin, you said that

Although msgspec and pydantic have different aims and features

What are the different aims, if I may ask?

@nrbnlulu
Copy link

Leaving my benchmark results here:

import json
import timeit
from contextlib import contextmanager
from dataclasses import dataclass
from typing import Iterator, TypedDict

import mimesis
import msgspec
import pydantic
from pydantic.type_adapter import TypeAdapter

provider = mimesis.Generic()
def create_user() -> dict:
    """Build one fake user record from the module-level mimesis provider.

    The ``friend`` entry is either ``None`` or another record built by a
    recursive call, chosen by a random boolean.
    """
    user: dict = {}
    user["id"] = provider.person.identifier()
    user["username"] = provider.person.username()
    user["password"] = provider.person.password()
    user["email"] = provider.person.email()
    user["blog"] = provider.internet.url()
    user["first_name"] = provider.person.name()
    user["last_name"] = provider.person.last_name()
    user["is_active"] = provider.development.boolean()
    user["is_staff"] = provider.development.boolean()
    user["is_superuser"] = provider.development.boolean()
    user["date_joined"] = provider.person.birthdate()
    user["last_login"] = provider.person.birthdate()
    # Recurse only when the random boolean comes up true.
    if provider.development.boolean():
        user["friend"] = create_user()
    else:
        user["friend"] = None
    return user



# Shared benchmark input: 100000 generated user records, plus their JSON
# encoding (produced once with msgspec) that both decoders below will parse.
data = [create_user() for _ in range(100000)]
data_raw = msgspec.json.encode(data)


class MsgSpecUser(msgspec.Struct):
    """msgspec model with the same keys as the records built by ``create_user``."""

    id: str
    username: str
    password: str
    email: str
    blog: str
    first_name: str
    last_name: str
    is_active: bool
    is_staff: bool
    is_superuser: bool
    # The two date fields are declared as plain strings here.
    date_joined: str
    last_login: str
    # Optional nested user; quoted because the class refers to itself.
    friend: "MsgSpecUser | None"

class PydanticUser(pydantic.BaseModel):
    """pydantic model with the same fields, in the same order, as ``MsgSpecUser``."""

    id: str
    username: str
    password: str
    email: str
    blog: str
    first_name: str
    last_name: str
    is_active: bool
    is_staff: bool
    is_superuser: bool
    # The two date fields are declared as plain strings here.
    date_joined: str
    last_login: str
    # Optional nested user; quoted because the class refers to itself.
    friend: "PydanticUser | None"

@dataclass
class TimeitResult:
    """Label and elapsed time for one timed task."""

    task: str  # label identifying what was timed
    seconds: float | None = None  # elapsed seconds; None until a value is assigned

@contextmanager
def time_it(task: str) -> Iterator[TimeitResult]:
    """Time the body of a ``with`` block and print how long it took.

    Yields a ``TimeitResult`` labelled ``task``. Its ``seconds`` attribute is
    filled in once the body has finished without raising; nothing is printed
    or recorded if the body raises.
    """
    began = timeit.default_timer()
    result = TimeitResult(task=task)
    yield result
    elapsed = timeit.default_timer() - began
    print(f"{task} took {elapsed:1f} seconds")
    result.seconds = elapsed

def match_precentage(pydantic: float, msgspec: float) -> str:
    """Describe which library was faster and by what percentage.

    Both arguments are elapsed times. The percentage is the gap between the
    two times relative to the faster one. When the times are equal the
    message names MsgSpec with a zero percentage.
    """
    if pydantic < msgspec:
        gain = ((msgspec - pydantic) / pydantic) * 100
        message = f"Pydantic is faster by %{gain:1f}"
    else:
        gain = ((pydantic - msgspec) / msgspec) * 100
        message = f"MsgSpec is faster by %{gain:1f}"
    return message

# ------------ decode ------------
# The decoder and the TypeAdapter are each built outside their timed block,
# so only the decode/validate call itself is measured.
msgspec_decoder = msgspec.json.Decoder(list[MsgSpecUser])

with time_it("msgspec_decode") as msgspec_res:
    msgspec_data = msgspec_decoder.decode(data_raw)

users_ta = TypeAdapter(list[PydanticUser])

with time_it("pydantic_decode") as pydantic_res:
    pydantic_data = users_ta.validate_json(data_raw)

print(f"DECODE: {match_precentage(pydantic_res.seconds, msgspec_res.seconds)}")

# ------------ encode ------------
# Each library re-encodes the objects it decoded above. The result variables
# msgspec_res / pydantic_res are rebound to the encode timings.

msgspec_encoder = msgspec.json.Encoder()


with time_it("msgspec_encode") as msgspec_res:
    msgspec_data_raw = msgspec_encoder.encode(msgspec_data)

with time_it("pydantic_encode") as pydantic_res:
    pydantic_data_raw = users_ta.dump_json(pydantic_data)


print(f"ENCODE: {match_precentage(pydantic_res.seconds, msgspec_res.seconds)}")
msgspec_decode took 0.162186 seconds
pydantic_decode took 1.120969 seconds
DECODE: MsgSpec is faster by %591.163625
msgspec_encode took 0.044265 seconds
pydantic_encode took 0.223537 seconds
ENCODE: MsgSpec is faster by %404.997775

@nrbnlulu
Copy link

nrbnlulu commented Jun 18, 2024

Created another benchmark that uses custom types
https://gist.github.com/nrbnlulu/e983ab23bed5806cff5bb8ba97434d6d

The results are quite surprising:

msgspec_decode took 0.050580 seconds
pydantic_decode took 0.150948 seconds
DECODE: MsgSpec is faster by %198.433165
msgspec_encode took 0.015060 seconds
pydantic_encode took 0.060530 seconds
ENCODE: MsgSpec is faster by %301.920586

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment