Skip to content

Instantly share code, notes, and snippets.

@bbhavsar
Created May 6, 2020 16:00
Show Gist options
  • Save bbhavsar/14bf2f0e87d681492b0249839f631e57 to your computer and use it in GitHub Desktop.
Save bbhavsar/14bf2f0e87d681492b0249839f631e57 to your computer and use it in GitHub Desktop.
# Modified version of Todd Lipcon's codec-test.py: made Python 3 compatible, with some formatting of the output.
# https://github.infra.cloudera.com/raw/todd/experiments/master/kudu/codec-test.py
import pyfastpfor
import numpy as np
import pandas as pd
from timeit import Timer
import bitshuffle
import sys
from prettytable import PrettyTable
# Benchmark input: every value of the CSV file named by the first command-line
# argument, parsed as uint32 and flattened into a single 1-D array.
if len(sys.argv) < 2:
    # Without this guard a missing argument surfaces as a bare IndexError.
    sys.exit("usage: {} <csv-file-of-uint32-values>".format(sys.argv[0]))
ints = pd.read_csv(sys.argv[1], dtype=np.uint32).values.ravel()
# Scratch array with the same shape and dtype as the input; Codec uses it for
# intermediate data.
tmp_buf = np.empty_like(ints)
# Shared destination for encoded output: twice the input length plus 1024
# extra uint32 words, zero-initialised and C-ordered.
comp = np.zeros(len(ints) * 2 + 1024, dtype=np.uint32, order='C')
class Bitshuffle:
    """Benchmark adapter for bitshuffle + LZ4.

    Exposes the same ``compress()`` / ``decomp()`` pair as ``Codec`` and always
    works on the module-level ``ints`` array.
    """

    def compress(self):
        """Return ``ints`` compressed with ``bitshuffle.compress_lz4``."""
        return bitshuffle.compress_lz4(ints)

    def decomp(self, compressed_data):
        """Decompress *compressed_data* using the shape and dtype of ``ints``."""
        shape, dtype = ints.shape, ints.dtype
        return bitshuffle.decompress_lz4(compressed_data, shape, dtype)
class Codec(object):
    """Benchmark wrapper around a codec object with pyfastpfor's interface.

    The wrapped object must provide ``encodeArray(src, src_len, dst, dst_len)``
    and ``decodeArray(src, src_len, dst, dst_len)``.  All data lives in the
    module-level arrays ``ints`` (input), ``tmp_buf`` (scratch) and ``comp``
    (encoded output).

    When ``for_pagesize`` is set, the input is split into pages of that many
    values, each page's minimum is subtracted before encoding, and the minimums
    are stored in a header in front of the encoded data:

        comp[0]              number of pages
        comp[1:1 + pages]    per-page minimum
        comp[1 + pages:]     encoded deltas
    """

    def __init__(self, codec, for_pagesize=None):
        """Store the codec and the optional frame-of-reference page size.

        Args:
            codec: object exposing ``encodeArray`` / ``decodeArray``.
            for_pagesize: values per page, or ``None`` to encode ``ints`` as is.
        """
        self.codec = codec
        self.for_pagesize = for_pagesize

    def compress(self):
        """Encode ``ints`` into ``comp`` and return the used prefix of ``comp``.

        Returns:
            A view (not a copy) of ``comp`` covering the header, if any, plus
            the encoded words; it is overwritten by the next ``compress`` call.
        """
        if self.for_pagesize:
            page_size = self.for_pagesize
            src_buf = tmp_buf
            num_pages = 0
            for page_start in range(0, len(ints), page_size):
                page_end = page_start + page_size
                page = ints[page_start:page_end]
                page_min = page.min()
                num_pages += 1
                # Header slot 0 holds the page count, so minimums start at 1.
                comp[num_pages] = page_min
                src_buf[page_start:page_end] = page - page_min
            comp[0] = num_pages
            header_size = num_pages + 1
        else:
            src_buf = ints
            header_size = 0
        enc_dst = comp[header_size:]
        enc_len = self.codec.encodeArray(src_buf, len(src_buf), enc_dst, len(enc_dst))
        return comp[0:enc_len + header_size]

    def decomp(self, comp_buf):
        """Decode *comp_buf* (as produced by ``compress``) and return the values.

        The output goes to the scratch array ``tmp_buf`` rather than to
        ``ints``: decoding over the benchmark input would let one failing or
        incorrect codec corrupt the data used for every codec measured after it.

        Returns:
            ``tmp_buf`` holding the decoded values; it is reused by later
            ``compress`` / ``decomp`` calls, so copy it if it must be kept.
        """
        if self.for_pagesize:
            num_pages = int(comp_buf[0])
            mins = comp_buf[1:1 + num_pages]
            enc_buf = comp_buf[1 + num_pages:]
        else:
            enc_buf = comp_buf
        out = tmp_buf
        self.codec.decodeArray(enc_buf, len(enc_buf), out, len(out))
        if self.for_pagesize:
            page_size = self.for_pagesize
            # Add each page's minimum back; the slice end may run past the
            # array on the last (short) page, which slicing clips.
            for page_index in range(num_pages):
                page_start = page_index * page_size
                out[page_start:page_start + page_size] += mins[page_index]
        return out
# Benchmark subjects, keyed by the name shown in the results table.
codecs = {'bitshuffle': Bitshuffle()}
for codec_name in pyfastpfor.getCodecList():
    # Skip the 8b_rle codecs (they seem to crash on some data) and the
    # optpfor ones.
    if '8b_rle' in codec_name or 'optpfor' in codec_name:
        continue
    # NOTE(review): the indentation of this script was lost; this assumes all
    # three registrations below apply only to simdbinarypacking codecs - confirm.
    if not codec_name.startswith('simdbinarypacking'):
        continue
    # Plain codec first, then the two frame-of-reference variants, each with
    # its own codec instance.
    codecs[codec_name] = Codec(pyfastpfor.getCodec(codec_name))
    for suffix, page_size in (("_for128", 128), ("_for256", 256)):
        codecs[codec_name + suffix] = Codec(pyfastpfor.getCodec(codec_name), for_pagesize=page_size)
# Results table.  float_format is assigned through the property instead of the
# constructor: some PrettyTable releases keep the constructor value as a bare
# string rather than applying it to each column, so it had no effect there.
t = PrettyTable(['codec', 'comp_time', 'decom_time', 'bits_per_int'])
t.float_format = "4.4"
t.align = 'l'
# Benchmark at most the first 40 codecs in name order.
for codec_name, codec in sorted(codecs.items())[:40]:
    try:
        # One untimed run yields the compressed size and the decompression input.
        compressed_data = codec.compress()
        compressed_bytes = len(compressed_data) * compressed_data.dtype.itemsize
        bits_per_int = float(compressed_bytes) / len(ints) * 8
        # Each figure is the total for 10 runs, in milliseconds.
        comp_ms = Timer(codec.compress).timeit(number=10) * 1000
        decomp_ms = Timer(lambda: codec.decomp(compressed_data)).timeit(number=10) * 1000
        t.add_row([codec_name, comp_ms, decomp_ms, bits_per_int])
    except Exception as e:
        # A failing codec is reported and skipped so the remaining ones still run.
        print(codec_name, "FAIL", e)
print(t)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment