Skip to content

Instantly share code, notes, and snippets.

@zklaus
Created March 27, 2023 11:20
Show Gist options
  • Save zklaus/1310993342d7784b9ace071f4d10b468 to your computer and use it in GitHub Desktop.
Save zklaus/1310993342d7784b9ace071f4d10b468 to your computer and use it in GitHub Desktop.
Test script for compression with netcdf4
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import logging
import os
import sys
import traceback
from time import perf_counter as timer
from humanfriendly import format_size
from netCDF4 import Dataset
import numpy as np
import pandas as pd
# Show INFO-level messages so the per-method "Trying compression ..." lines
# logged by store() are visible while the benchmark runs.
logging.basicConfig(level=logging.INFO)
def create_data(shape, seed=None):
    """Generate random test data for the compression benchmark.

    Values are drawn from a normal distribution with mean 300.0 and
    standard deviation 10.0.

    Args:
        shape: Shape of the array to generate (int or tuple of ints).
        seed: Optional seed forwarded to ``numpy.random.default_rng``.
            The default ``None`` draws fresh entropy on every call (the
            previous behaviour); pass a fixed value to make benchmark
            runs reproducible.

    Returns:
        A ``numpy.ndarray`` of samples with the requested shape.
    """
    rng = np.random.default_rng(seed)
    return rng.normal(300.0, 10.0, shape)
def load_data(path):
    """Read the ``ta`` variable from a netCDF file into memory.

    Args:
        path: Sequence of file paths; only the first entry is opened
            (``main`` passes ``sys.argv[1:]`` here).

    Returns:
        The complete contents of the ``ta`` variable, as produced by
        slicing the netCDF4 variable with ``[:]``.
    """
    # The [:] slice is evaluated before the ``with`` block exits, so the
    # values are already in memory when the dataset is closed. Closing it
    # here avoids leaking the open file handle.
    with Dataset(path[0]) as ds:
        return ds["ta"][:]
def _chunksizes_for(shape):
    """Return chunk sizes for *shape*: 10 on the first axis, 100 on the last two, 1 elsewhere, clamped to each axis length."""
    ndim = len(shape)
    target = [1] * ndim
    if ndim:
        target[0] = 10
    # The last two axes take precedence over the first-axis rule, which
    # reproduces (10, 100, 100) for 3-D and (10, 1, 100, 100) for 4-D input.
    for axis in range(max(ndim - 2, 0), ndim):
        target[axis] = 100
    # A chunk may not be longer than its (fixed-size) dimension, and must be
    # at least 1 element long.
    return tuple(max(1, min(want, length)) for want, length in zip(target, shape))


def store(data, compression, complevel):
    """Write *data* to a netCDF4 file and return the resulting file size.

    The output file is ``uncompressed.nc`` when *compression* is ``None``
    and ``{compression}_{complevel}.nc`` otherwise. The data is written to
    an ``f4`` variable named ``var`` with quantization enabled
    (``significant_digits=5``, ``GranularBitRound``).

    Args:
        data: Array to store; one dimension ``dim_<i>`` is created per axis.
        compression: Compression name handed to ``createVariable``, or
            ``None`` for no compression.
        complevel: Compression level handed to ``createVariable``.

    Returns:
        Size of the written file in bytes, or ``None`` if creating or
        filling the variable raised ``RuntimeError`` (the traceback is
        printed in that case).
    """
    if compression is None:
        filename = "uncompressed.nc"
    else:
        filename = f"{compression}_{complevel}.nc"

    success = False
    with Dataset(filename, "w", format="NETCDF4") as root:
        for i, length in enumerate(data.shape):
            root.createDimension(f"dim_{i}", length)
        try:
            logging.info("Trying compression %s", compression)
            v = root.createVariable(
                "var",
                "f4",
                tuple(f"dim_{i}" for i in range(data.ndim)),
                fill_value=1.0e20,
                significant_digits=5,
                quantize_mode="GranularBitRound",
                compression=compression,
                complevel=complevel,
                # Derived from the data shape so inputs that are not 3-D/4-D,
                # or that have axes shorter than the nominal chunk, still
                # work. 0-d data has no axes, so fall back to the default.
                chunksizes=_chunksizes_for(data.shape) or None,
            )
            v[:] = data
            success = True
        except RuntimeError:
            # print_exc() prints the active exception and, unlike the
            # single-argument print_exception(e), also works before 3.10.
            traceback.print_exc()

    if not success:
        return None
    # Stat only after the dataset has been closed so that everything has
    # been flushed to disk and the size is final.
    return os.stat(filename).st_size
def _format_filesize(size):
    """Format a byte count for the summary table; missing sizes (failed runs) are shown as "failed"."""
    if pd.isna(size):
        return "failed"
    return format_size(int(size))


def main():
    """Benchmark several netCDF4 compression settings and print a summary.

    Input data is loaded from the file named by the first command-line
    argument if any argument is given; otherwise a random
    ``(1000, 100, 100)`` array is generated. Each compression method is
    timed while writing the data with ``store``. The printed table lists
    file size, elapsed time and the compression ratio relative to the first
    (uncompressed) entry.
    """
    if len(sys.argv) > 1:
        data = load_data(sys.argv[1:])
    else:
        data = create_data((1000, 100, 100))

    methods = (
        (None, 0),
        ("zlib", 1),
        ("szip", 4),
        # bzip2 is left out because it is too slow.
        ("zstd", -4),
        ("zstd", 12),
        ("blosc_zstd", 4),
        ("blosc_zlib", 4),
    )

    results = {}
    for compression, complevel in methods:
        start = timer()
        size = store(data, compression, complevel)
        elapsed = timer() - start
        results[f"{compression} {complevel}"] = (size, elapsed)

    info = pd.DataFrame.from_dict(
        results, orient="index", columns=["Filesize", "Time"]
    )
    # store() returns None for methods that failed; coerce those to NaN so
    # the column is numeric regardless of how many runs failed.
    info["Filesize"] = pd.to_numeric(info["Filesize"], errors="coerce")
    # The index holds string labels, so the baseline row must be selected by
    # position explicitly; plain [0] on such a Series is deprecated.
    baseline = info["Filesize"].iloc[0]
    info["Compression ratio"] = baseline / info["Filesize"]
    print(info.to_string(formatters={"Filesize": _format_filesize}))


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment