# jcrist/bench.py

## bench.py
"""A quick benchmark comparing the performance of:

- msgspec: https://github.com/jcrist/msgspec
- pydantic V1: https://docs.pydantic.dev/1.10/
- pydantic V2: https://docs.pydantic.dev/dev-v2/

The benchmark is modified from the one in the msgspec repo here:
https://github.com/jcrist/msgspec/blob/main/benchmarks/bench_validation.py

I make no claims that it's illustrative of all use cases. I wrote this up
mostly to get an understanding of how msgspec's performance compares with that
of pydantic V2.
"""
from __future__ import annotations

import datetime
import random
import string
import timeit
import uuid
from typing import List, Literal, Union, Annotated

import msgspec
import pydantic
import pydantic.v1


def make_filesystem_data(capacity):
    """Generate a tree structure representing a fake filesystem.

    The tree is made only of dicts, lists, strings and ints. All randomness
    (including the ``created_by`` UUIDs) comes from a single generator seeded
    with 42, so repeated calls with the same ``capacity`` return equal data.

    Args:
        capacity: Total number of nodes to generate, counting the root
            directory. Values below 1 still yield a lone, empty root.

    Returns:
        dict: The root directory node. Every node has ``type``, ``name``,
        ``created_by``, ``created_at`` and ``updated_at`` (ISO 8601 strings
        in UTC, with ``created_at <= updated_at``). Directories also have
        ``contents`` (a list of child nodes); files also have ``nbytes``.
    """
    UTC = datetime.timezone.utc
    DATE_2018 = datetime.datetime(2018, 1, 1, tzinfo=UTC)
    DATE_2023 = datetime.datetime(2023, 1, 1, tzinfo=UTC)

    rand = random.Random(42)

    # Draw the UUIDs from the seeded generator (not uuid.uuid4()) so the
    # generated data set is reproducible from run to run.
    UUIDS = [
        str(uuid.UUID(int=rand.getrandbits(128), version=4)) for _ in range(30)
    ]

    def randdt(lo, hi):
        # randint() only accepts real integers (float bounds raise TypeError
        # on Python 3.12+), so truncate the POSIX timestamps first.
        ts = rand.randint(int(lo.timestamp()), int(hi.timestamp()))
        # Convert straight into UTC; going through naive local time and then
        # relabelling it as UTC would skew the value by the machine's offset.
        return datetime.datetime.fromtimestamp(ts, tz=UTC)

    def randstr(min_len, max_len):
        length = rand.randint(min_len, max_len)
        return "".join(rand.choices(string.ascii_letters, k=length))

    def make_node(is_dir):
        nonlocal capacity

        name = randstr(4, 30)
        created_by = rand.choice(UUIDS)
        created_at = randdt(DATE_2018, DATE_2023)
        updated_at = randdt(created_at, DATE_2023)
        data = {
            "type": "directory" if is_dir else "file",
            "name": name,
            "created_by": created_by,
            "created_at": created_at.isoformat(),
            "updated_at": updated_at.isoformat(),
        }
        if is_dir:
            # Never hand out more children than the remaining node budget.
            n = min(rand.randint(0, 30), capacity)
            capacity -= n
            data["contents"] = [make_node(rand.random() > 0.9) for _ in range(n)]
        else:
            data["nbytes"] = rand.randint(0, 1000000)
        return data

    # The root directory itself uses up one node of the budget.
    capacity -= 1
    out = make_node(True)
    # Whatever budget the recursive build left over is appended to the root.
    while capacity:
        capacity -= 1
        out["contents"].append(make_node(rand.random() > 0.9))
    return out


def bench(raw_data, dumps, loads, convert):
    """Time one library's serialise and deserialise paths.

    ``convert`` turns ``raw_data`` into the library's own message object,
    ``dumps`` serialises that object and ``loads`` parses the serialised form
    back. A round-trip equality check runs once before anything is timed.

    Returns:
        tuple: ``(dumps_time, loads_time)``, the average time per call of
        ``dumps`` and of ``loads`` as measured by ``timeit``.
    """
    typed = convert(raw_data)
    encoded = dumps(typed)

    # Sanity check: decoding the encoded message must give back an equal one.
    decoded = loads(encoded)
    assert typed == decoded
    del decoded

    def per_call(func, arg):
        # autorange() picks the loop count itself and reports
        # (number_of_loops, total_time); divide to get time per call.
        number, elapsed = timeit.Timer(
            "func(data)", setup="", globals={"func": func, "data": arg}
        ).autorange()
        return elapsed / number

    dumps_time = per_call(dumps, typed)
    loads_time = per_call(loads, encoded)
    return dumps_time, loads_time


#############################################################################
#  msgspec                                                                  #
#############################################################################


class File(msgspec.Struct, tag="file"):
    """msgspec schema for a file node of the fake filesystem.

    ``tag="file"`` makes this a tagged struct, which is what lets msgspec
    tell it apart from ``Directory`` in the ``Directory.contents`` union.
    """

    name: Annotated[str, msgspec.Meta(min_length=1)]  # must be non-empty
    created_by: uuid.UUID
    created_at: datetime.datetime
    updated_at: datetime.datetime
    nbytes: Annotated[int, msgspec.Meta(ge=0)]  # must be >= 0


class Directory(msgspec.Struct, tag="directory"):
    """msgspec schema for a directory node of the fake filesystem.

    This is the root type used by ``bench_msgspec``. ``contents`` is
    recursive: each entry is either a ``File`` or another ``Directory``.
    """

    name: Annotated[str, msgspec.Meta(min_length=1)]  # must be non-empty
    created_by: uuid.UUID
    created_at: datetime.datetime
    updated_at: datetime.datetime
    contents: List[Union[File, Directory]]  # child nodes, possibly empty


def bench_msgspec(data):
    """Benchmark msgspec on ``data``.

    A single ``Encoder`` and a single ``Decoder`` typed as ``Directory`` are
    created up front and their bound methods are what gets timed.

    Returns whatever ``bench`` returns: ``(dumps_time, loads_time)``.
    """
    encoder = msgspec.json.Encoder()
    decoder = msgspec.json.Decoder(Directory)
    return bench(
        data,
        encoder.encode,
        decoder.decode,
        lambda raw: msgspec.convert(raw, Directory),
    )


#############################################################################
#  pydantic V2                                                              #
#############################################################################


class FileModel(pydantic.BaseModel):
    """pydantic V2 schema for a file node of the fake filesystem."""

    # Fixed marker; only the literal "file" is accepted for this field.
    type: Literal["file"] = "file"
    name: str = pydantic.Field(min_length=1)  # must be non-empty
    created_by: uuid.UUID
    created_at: datetime.datetime
    updated_at: datetime.datetime
    nbytes: pydantic.NonNegativeInt  # must be >= 0


class DirectoryModel(pydantic.BaseModel):
    """pydantic V2 schema for a directory node of the fake filesystem.

    This is the root type used by ``bench_pydantic_v2``. ``contents`` is
    recursive: each entry is another ``DirectoryModel`` or a ``FileModel``.
    """

    # Fixed marker; only the literal "directory" is accepted for this field.
    type: Literal["directory"] = "directory"
    name: str = pydantic.Field(min_length=1)  # must be non-empty
    created_by: uuid.UUID
    created_at: datetime.datetime
    updated_at: datetime.datetime
    contents: List[Union[DirectoryModel, FileModel]]  # child nodes


def bench_pydantic_v2(data):
    """Benchmark pydantic V2 on ``data``.

    Serialisation is timed through ``model_dump_json`` and parsing through
    ``DirectoryModel.model_validate_json``.

    Returns whatever ``bench`` returns: ``(dumps_time, loads_time)``.
    """

    def to_model(raw):
        return DirectoryModel(**raw)

    def to_json(model):
        return model.model_dump_json()

    return bench(data, to_json, DirectoryModel.model_validate_json, to_model)

#############################################################################
#  pydantic V1                                                              #
#############################################################################


class FileModelV1(pydantic.v1.BaseModel):
    """pydantic V1 schema for a file node of the fake filesystem."""

    # Fixed marker; only the literal "file" is accepted for this field.
    type: Literal["file"] = "file"
    name: str = pydantic.v1.Field(min_length=1)  # must be non-empty
    created_by: uuid.UUID
    created_at: datetime.datetime
    updated_at: datetime.datetime
    nbytes: pydantic.v1.NonNegativeInt  # must be >= 0


class DirectoryModelV1(pydantic.v1.BaseModel):
    """pydantic V1 schema for a directory node of the fake filesystem.

    This is the root type used by ``bench_pydantic_v1``. ``contents`` is
    recursive: each entry is another ``DirectoryModelV1`` or a
    ``FileModelV1``.
    """

    # Fixed marker; only the literal "directory" is accepted for this field.
    type: Literal["directory"] = "directory"
    name: str = pydantic.v1.Field(min_length=1)  # must be non-empty
    created_by: uuid.UUID
    created_at: datetime.datetime
    updated_at: datetime.datetime
    contents: List[Union[DirectoryModelV1, FileModelV1]]  # child nodes


def bench_pydantic_v1(data):
    """Benchmark pydantic V1 (via ``pydantic.v1``) on ``data``.

    Serialisation is timed through ``.json()`` and parsing through
    ``DirectoryModelV1.parse_raw``.

    Returns whatever ``bench`` returns: ``(dumps_time, loads_time)``.
    """

    def to_model(raw):
        return DirectoryModelV1(**raw)

    def to_json(model):
        return model.json()

    return bench(data, to_json, DirectoryModelV1.parse_raw, to_model)


if __name__ == "__main__":
    # Build one data set and run every library against it.
    N = 1000
    data = make_filesystem_data(N)

    # msgspec runs first: its timings are the baseline for the ratios below.
    ms_dumps, ms_loads = bench_msgspec(data)
    ms_total = ms_dumps + ms_loads
    title = f"msgspec {msgspec.__version__}"
    print(title, "-" * len(title), sep="\n")
    # Timings are scaled by 1e6 for display as microseconds.
    print(f"dumps: {ms_dumps * 1e6:.1f} us")
    print(f"loads: {ms_loads * 1e6:.1f} us")
    print(f"total: {ms_total * 1e6:.1f} us")

    contenders = (
        (f"pydantic {pydantic.__version__}", bench_pydantic_v2),
        (f"pydantic {pydantic.v1.__version__}", bench_pydantic_v1),
    )
    for title, func in contenders:
        print()
        print(title, "-" * len(title), sep="\n")
        dumps, loads = func(data)
        total = dumps + loads
        # Each ratio is this library's time divided by msgspec's time.
        print(f"dumps: {dumps * 1e6:.1f} us ({dumps / ms_dumps:.1f}x slower)")
        print(f"loads: {loads * 1e6:.1f} us ({loads / ms_loads:.1f}x slower)")
        print(f"total: {total * 1e6:.1f} us ({total / ms_total:.1f}x slower)")
	"""A quick benchmark comparing the performance of:

	- msgspec: https://github.com/jcrist/msgspec
	- pydantic V1: https://docs.pydantic.dev/1.10/
	- pydantic V2: https://docs.pydantic.dev/dev-v2/

	The benchmark is modified from the one in the msgspec repo here:
	https://github.com/jcrist/msgspec/blob/main/benchmarks/bench_validation.py

	I make no claims that it's illustrative of all use cases. I wrote this up
	mostly to get an understanding of how msgspec's performance compares with that
	of pydantic V2.
	"""
	from __future__ import annotations

	import datetime
	import random
	import string
	import timeit
	import uuid
	from typing import List, Literal, Union, Annotated

	import msgspec
	import pydantic
	import pydantic.v1


	def make_filesystem_data(capacity):
		"""Generate a tree structure representing a fake filesystem.

		The tree is made only of dicts, lists, strings and ints. All
		randomness (including the ``created_by`` UUIDs) comes from a single
		generator seeded with 42, so repeated calls with the same
		``capacity`` return equal data.

		Args:
			capacity: Total number of nodes to generate, counting the root
				directory. Values below 1 still yield a lone, empty root.

		Returns:
			dict: The root directory node. Every node has ``type``, ``name``,
			``created_by``, ``created_at`` and ``updated_at`` (ISO 8601
			strings in UTC, with ``created_at <= updated_at``). Directories
			also have ``contents`` (a list of child nodes); files also have
			``nbytes``.
		"""
		UTC = datetime.timezone.utc
		DATE_2018 = datetime.datetime(2018, 1, 1, tzinfo=UTC)
		DATE_2023 = datetime.datetime(2023, 1, 1, tzinfo=UTC)

		rand = random.Random(42)

		# Draw the UUIDs from the seeded generator (not uuid.uuid4()) so the
		# generated data set is reproducible from run to run.
		UUIDS = [
			str(uuid.UUID(int=rand.getrandbits(128), version=4)) for _ in range(30)
		]

		def randdt(lo, hi):
			# randint() only accepts real integers (float bounds raise
			# TypeError on Python 3.12+), so truncate the timestamps first.
			ts = rand.randint(int(lo.timestamp()), int(hi.timestamp()))
			# Convert straight into UTC; going through naive local time and
			# relabelling it as UTC would skew it by the machine's offset.
			return datetime.datetime.fromtimestamp(ts, tz=UTC)

		def randstr(min_len, max_len):
			length = rand.randint(min_len, max_len)
			return "".join(rand.choices(string.ascii_letters, k=length))

		def make_node(is_dir):
			nonlocal capacity

			name = randstr(4, 30)
			created_by = rand.choice(UUIDS)
			created_at = randdt(DATE_2018, DATE_2023)
			updated_at = randdt(created_at, DATE_2023)
			data = {
				"type": "directory" if is_dir else "file",
				"name": name,
				"created_by": created_by,
				"created_at": created_at.isoformat(),
				"updated_at": updated_at.isoformat(),
			}
			if is_dir:
				# Never hand out more children than the remaining budget.
				n = min(rand.randint(0, 30), capacity)
				capacity -= n
				data["contents"] = [make_node(rand.random() > 0.9) for _ in range(n)]
			else:
				data["nbytes"] = rand.randint(0, 1000000)
			return data

		# The root directory itself uses up one node of the budget.
		capacity -= 1
		out = make_node(True)
		# Whatever budget the recursive build left over goes into the root.
		while capacity:
			capacity -= 1
			out["contents"].append(make_node(rand.random() > 0.9))
		return out


	def bench(raw_data, dumps, loads, convert):
		"""Time one library's serialise and deserialise paths.

		``convert`` turns ``raw_data`` into the library's own message object,
		``dumps`` serialises that object and ``loads`` parses the serialised
		form back. A round-trip equality check runs once before anything is
		timed.

		Returns:
			tuple: ``(dumps_time, loads_time)``, the average time per call of
			``dumps`` and of ``loads`` as measured by ``timeit``.
		"""
		msg = convert(raw_data)
		json_data = dumps(msg)

		# Sanity check: decoding the encoded message must give an equal one.
		msg2 = loads(json_data)
		assert msg == msg2
		del msg2

		# autorange() picks the loop count itself and reports
		# (number_of_loops, total_time); divide to get time per call.
		timer = timeit.Timer(
			"func(data)", setup="", globals={"func": dumps, "data": msg}
		)
		n, t = timer.autorange()
		dumps_time = t / n

		timer = timeit.Timer(
			"func(data)", setup="", globals={"func": loads, "data": json_data}
		)
		n, t = timer.autorange()
		loads_time = t / n
		return dumps_time, loads_time


	#############################################################################
	# msgspec #
	#############################################################################


	class File(msgspec.Struct, tag="file"):
		"""msgspec schema for a file node of the fake filesystem.

		``tag="file"`` makes this a tagged struct, which is what lets msgspec
		tell it apart from ``Directory`` in the ``Directory.contents`` union.
		"""

		name: Annotated[str, msgspec.Meta(min_length=1)]  # must be non-empty
		created_by: uuid.UUID
		created_at: datetime.datetime
		updated_at: datetime.datetime
		nbytes: Annotated[int, msgspec.Meta(ge=0)]  # must be >= 0


	class Directory(msgspec.Struct, tag="directory"):
		"""msgspec schema for a directory node of the fake filesystem.

		This is the root type used by ``bench_msgspec``. ``contents`` is
		recursive: each entry is either a ``File`` or another ``Directory``.
		"""

		name: Annotated[str, msgspec.Meta(min_length=1)]  # must be non-empty
		created_by: uuid.UUID
		created_at: datetime.datetime
		updated_at: datetime.datetime
		contents: List[Union[File, Directory]]  # child nodes, possibly empty


	def bench_msgspec(data):
		"""Benchmark msgspec on ``data``.

		A single ``Encoder`` and a single ``Decoder`` typed as ``Directory``
		are created up front and their bound methods are what gets timed.

		Returns whatever ``bench`` returns: ``(dumps_time, loads_time)``.
		"""
		enc = msgspec.json.Encoder()
		dec = msgspec.json.Decoder(Directory)

		def convert(data):
			return msgspec.convert(data, Directory)

		return bench(data, enc.encode, dec.decode, convert)


	#############################################################################
	# pydantic V2 #
	#############################################################################


	class FileModel(pydantic.BaseModel):
		"""pydantic V2 schema for a file node of the fake filesystem."""

		# Fixed marker; only the literal "file" is accepted for this field.
		type: Literal["file"] = "file"
		name: str = pydantic.Field(min_length=1)  # must be non-empty
		created_by: uuid.UUID
		created_at: datetime.datetime
		updated_at: datetime.datetime
		nbytes: pydantic.NonNegativeInt  # must be >= 0


	class DirectoryModel(pydantic.BaseModel):
		"""pydantic V2 schema for a directory node of the fake filesystem.

		This is the root type used by ``bench_pydantic_v2``. ``contents`` is
		recursive: each entry is another ``DirectoryModel`` or a
		``FileModel``.
		"""

		# Fixed marker; only the literal "directory" is accepted here.
		type: Literal["directory"] = "directory"
		name: str = pydantic.Field(min_length=1)  # must be non-empty
		created_by: uuid.UUID
		created_at: datetime.datetime
		updated_at: datetime.datetime
		contents: List[Union[DirectoryModel, FileModel]]  # child nodes


	def bench_pydantic_v2(data):
		"""Benchmark pydantic V2 on ``data``.

		Serialisation is timed through ``model_dump_json`` and parsing
		through ``DirectoryModel.model_validate_json``.

		Returns whatever ``bench`` returns: ``(dumps_time, loads_time)``.
		"""
		return bench(
			data,
			lambda p: p.model_dump_json(),
			DirectoryModel.model_validate_json,
			lambda data: DirectoryModel(**data),
		)

	#############################################################################
	# pydantic V1 #
	#############################################################################


	class FileModelV1(pydantic.v1.BaseModel):
		"""pydantic V1 schema for a file node of the fake filesystem."""

		# Fixed marker; only the literal "file" is accepted for this field.
		type: Literal["file"] = "file"
		name: str = pydantic.v1.Field(min_length=1)  # must be non-empty
		created_by: uuid.UUID
		created_at: datetime.datetime
		updated_at: datetime.datetime
		nbytes: pydantic.v1.NonNegativeInt  # must be >= 0


	class DirectoryModelV1(pydantic.v1.BaseModel):
		"""pydantic V1 schema for a directory node of the fake filesystem.

		This is the root type used by ``bench_pydantic_v1``. ``contents`` is
		recursive: each entry is another ``DirectoryModelV1`` or a
		``FileModelV1``.
		"""

		# Fixed marker; only the literal "directory" is accepted here.
		type: Literal["directory"] = "directory"
		name: str = pydantic.v1.Field(min_length=1)  # must be non-empty
		created_by: uuid.UUID
		created_at: datetime.datetime
		updated_at: datetime.datetime
		contents: List[Union[DirectoryModelV1, FileModelV1]]  # child nodes


	def bench_pydantic_v1(data):
		"""Benchmark pydantic V1 (via ``pydantic.v1``) on ``data``.

		Serialisation is timed through ``.json()`` and parsing through
		``DirectoryModelV1.parse_raw``.

		Returns whatever ``bench`` returns: ``(dumps_time, loads_time)``.
		"""
		return bench(
			data,
			lambda p: p.json(),
			DirectoryModelV1.parse_raw,
			lambda data: DirectoryModelV1(**data),
		)


	if __name__ == "__main__":
		# Build one data set and run every library against it.
		N = 1000
		data = make_filesystem_data(N)

		# msgspec runs first: its timings are the baseline for the ratios.
		ms_dumps, ms_loads = bench_msgspec(data)
		ms_total = ms_dumps + ms_loads
		title = f"msgspec {msgspec.__version__}"
		print(title)
		print("-" * len(title))
		# Timings are scaled by 1e6 for display as microseconds.
		print(f"dumps: {ms_dumps * 1e6:.1f} us")
		print(f"loads: {ms_loads * 1e6:.1f} us")
		print(f"total: {ms_total * 1e6:.1f} us")

		for title, func in [
			(f"pydantic {pydantic.__version__}", bench_pydantic_v2),
			(f"pydantic {pydantic.v1.__version__}", bench_pydantic_v1),
		]:
			print()
			print(title)
			print("-" * len(title))
			dumps, loads = func(data)
			total = dumps + loads
			# Each ratio is this library's time divided by msgspec's time.
			print(f"dumps: {dumps * 1e6:.1f} us ({dumps / ms_dumps:.1f}x slower)")
			print(f"loads: {loads * 1e6:.1f} us ({loads / ms_loads:.1f}x slower)")
			print(f"total: {total * 1e6:.1f} us ({total / ms_total:.1f}x slower)")