Skip to content

Instantly share code, notes, and snippets.

@frobnitzem
Last active June 28, 2024 19:50
Show Gist options
  • Save frobnitzem/6b38e562cf751938c5c1e2c67402f87c to your computer and use it in GitHub Desktop.
Save frobnitzem/6b38e562cf751938c5c1e2c67402f87c to your computer and use it in GitHub Desktop.
Serialized tensor sizes.
# What are the size overheads for serializing tensors?
#
import io
import sys

import numpy as np

# https://huggingface.co/docs/safetensors/index
#from safetensors.torch import save_file
# https://huggingface.co/docs/safetensors/api/numpy
from safetensors.numpy import save

# Grid sizes come from the first three command-line arguments.
# Fail with a usage line instead of an opaque tuple-unpacking error.
if len(sys.argv) < 4:
    sys.exit(f"usage: {sys.argv[0]} nx ny nz")
nx, ny, nz = map(int, sys.argv[1:4])
print(nx, ny, nz)

# Squared sample coordinates: x spans [0, 4), y and z span [0, 5).
x2 = (np.arange(nx)*4/nx)**2
y2 = (np.arange(ny)*5/ny)**2
z2 = (np.arange(nz)*5/nz)**2

# Two Gaussian bumps with peak value A:
#   x -- float32, shape (nx, ny)
#   y -- int16,   shape (nx, nz); the cast truncates small tail values to 0
A = 10000
x = (A*np.exp(-0.5*(x2[:,None] + y2[None,:]))).astype('float32')
y = (A*np.exp(-0.5*(x2[:,None] + z2[None,:]))).astype('int16')

# Baseline: raw payload size in bytes.
print(x.nbytes + y.nbytes)
# Same, but counting only the nonzero entries of y (bytes in nonzeros).
print(x.nbytes + np.count_nonzero(y)*y.itemsize)

# Named tensors handed to every serializer below.
data = {"x": x, "y": y}

# Serialize to an in-memory .npz archive and report its size.
# The dict must be unpacked into keyword arguments: passed positionally,
# NumPy wraps it in a single 0-d object array ("arr_0") and pickles it,
# which measures pickle overhead rather than the .npz layout and makes
# the archive unreadable without allow_pickle=True.
with io.BytesIO() as f:
    np.savez(f, **data)
    sz = f.getbuffer().nbytes
print(f"np.savez: {sz}")

# safetensors.numpy.save returns the serialized bytes directly,
# so no intermediate buffer is needed to measure the size.
blob = save(data)
sz = len(blob)
print(f"safetensors: {sz}")

import torch
# Copy each array into a torch tensor under the same name.
tdata = {name: torch.tensor(arr) for name, arr in data.items()}
# Raw tensor payload in bytes, for comparison with the numpy total.
print(sum(t.nbytes for t in tdata.values()))
# Size of the torch.save output, written to an in-memory buffer.
with io.BytesIO() as f:
    torch.save(tdata, f)
    sz = len(f.getbuffer())
print(f"torch.save: {sz}")

import h5py
# HDF5 via h5py, with the byte-shuffle filter enabled on every dataset:
# https://docs.h5py.org/en/stable/high/dataset.html#shuffle-filter
# Alternative codec to try here: "gzip".
compression = "lzf"
with io.BytesIO() as f:
    with h5py.File(f, "w") as h5:
        for name in data:
            h5.create_dataset(
                name,
                data=data[name],
                shuffle=True,
                compression=compression,
            )
    # Measure only after the HDF5 file has been closed, so that everything
    # has been written to the buffer.
    sz = len(f.getbuffer())
print(f"hdf5: {sz}")

import zfpy # github.com/llnl/zfp
# zfp compresses bare arrays: it stores no tensor names and does not take
# int16 input, so y is widened to int32 first. To keep the comparison
# fair, the total is charged for a small name/offset index as well.
index_overhead = len(b'{"x":____,"y": ____}')
x_blob = zfpy.compress_numpy(x, write_header=True)
y_blob = zfpy.compress_numpy(y.astype('int32'), write_header=True)
sz = len(x_blob) + len(y_blob) + index_overhead
print(f"zfpy: {sz}")
% python3 sizes.py 100 1000 1
100 1000 1
400200
400200
np.savez: 400802
safetensors: 400344
400200
torch.save: 401560
hdf5: 280551
zfpy: 195340

% python3 sizes.py 50 50 30
50 50 30
13000
12192
np.savez: 13604
safetensors: 13136
13000
torch.save: 14360
hdf5: 16784
zfpy: 8356

% python3 sizes.py 500 500 30
500 500 30
1030000
1021746
np.savez: 1030607
safetensors: 1030144
1030000
torch.save: 1031384
hdf5: 704563
zfpy: 418020
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment