Create a gist now

Instantly share code, notes, and snippets.

What would you like to do?
Test for comparing HDF compression libs through pandas
import os
from time import time
import pandas as pd
from memory_profiler import memory_usage
# Base name (no extension) for the HDF5 files written by the benchmark; the
# compression library name and an '.hdf' suffix are appended per run.
FILENAME='compressed_df'
def get_size(flnm) -> float:
    """Return the size of the file at ``flnm`` in mebibytes.

    The byte count reported by ``os.path.getsize`` is divided by 1024*1024
    and rounded to two decimal places.

    Raises:
        OSError: if the file does not exist or is inaccessible (propagated
            from ``os.path.getsize``).
    """
    bytes_per_mib = 1024 * 1024
    return round(os.path.getsize(flnm) / bytes_per_mib, 2)
def store_df(original_df: pd.DataFrame, flnm: str, clib: str) -> None:
    """Write ``original_df`` to the HDF5 file ``flnm`` compressed with ``clib``.

    The frame is stored under the key ``'df'`` at compression level 9.

    Args:
        original_df: DataFrame to serialise.
        flnm: Path of the HDF5 file to create.
        clib: Compression library name passed to ``to_hdf`` as ``complib``.
    """
    # mode='w' truncates any existing file. The pandas default (mode='a')
    # reopens an existing file, and rewriting a key in place generally does
    # not give the old space back, so a repeated write would inflate the
    # on-disk size that the benchmark measures afterwards.
    original_df.to_hdf(flnm, key='df', mode='w', complib=clib, complevel=9)
# Compression libraries exercised by default, in the order they are run.
_DEFAULT_COMPLIBS = (
    'zlib', 'lzo', 'bzip2', 'blosc', 'blosc:blosclz', 'blosc:lz4',
    'blosc:lz4hc', 'blosc:snappy', 'blosc:zlib', 'blosc:zstd',
)


def benchmark(original_df: pd.DataFrame, complibs=_DEFAULT_COMPLIBS) -> dict:
    """Store ``original_df`` once per compression library and collect metrics.

    For every library name in ``complibs`` the frame is written to
    ``f'{FILENAME}_{clib}.hdf'`` via ``store_df`` while ``memory_usage``
    samples the process at a 1-second interval.

    Args:
        original_df: DataFrame to write.
        complibs: Iterable of compression library names to test. Defaults to
            the ten libraries the benchmark originally hard-coded.

    Returns:
        A dict keyed by library name. Each value is a dict with:
        ``'time [s]'`` - seconds spent inside ``store_df``;
        ``'size [MB]'`` - resulting file size as reported by ``get_size``;
        ``'memory_usage'`` - whatever ``memory_usage`` returned for the run.
    """
    # Function-scope import keeps the block self-contained; perf_counter is a
    # monotonic, high-resolution clock intended for measuring intervals.
    from time import perf_counter

    results = {}
    for clib in complibs:
        flnm = f'{FILENAME}_{clib}.hdf'
        durations = []

        def timed_store(flnm=flnm, clib=clib, durations=durations):
            # Time only the write itself, so profiler start-up and sampling
            # overhead is excluded from the reported duration.
            started = perf_counter()
            store_df(original_df, flnm, clib)
            durations.append(perf_counter() - started)

        memus = memory_usage(timed_store, interval=1)
        results[clib] = {
            # NOTE(review): memory_profiler may invoke the callable more than
            # once when it gathers too few samples; the last invocation is
            # reported - confirm against the installed version.
            'time [s]': durations[-1],
            'size [MB]': get_size(flnm),
            'memory_usage': memus,
        }
    return results
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment