Created
May 6, 2020 16:00
-
-
Save bbhavsar/14bf2f0e87d681492b0249839f631e57 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Modified version of Todd Lipcon's codec-test.py: made Python 3 compatible, with tidier output formatting.
# https://github.infra.cloudera.com/raw/todd/experiments/master/kudu/codec-test.py
import pyfastpfor | |
import numpy as np | |
import pandas as pd | |
from timeit import Timer | |
import bitshuffle | |
import sys | |
from prettytable import PrettyTable | |
# The input CSV path is the first command-line argument. Exit with a usage
# message instead of an IndexError when it is missing.
if len(sys.argv) < 2:
    sys.exit("usage: %s <csv-file-of-uint32-values>" % sys.argv[0])

# Every cell of the CSV, parsed as uint32 and flattened to one 1-D array.
# This is the data set each codec compresses.
ints = pd.read_csv(sys.argv[1], dtype=np.uint32).values.ravel()

# Scratch array with the same shape/dtype as `ints`; Codec.compress() writes
# the per-page residuals here when frame-of-reference paging is enabled.
tmp_buf = np.empty_like(ints)

# Shared output buffer for Codec.compress(): room for twice the input plus
# 1024 extra uint32 slots (header and codec overhead).
comp = np.zeros(len(ints) * 2 + 1024, dtype=np.uint32, order='C')
class Bitshuffle:
    """Benchmark adapter that runs bitshuffle+LZ4 over the module-level ``ints``."""

    def compress(self):
        """Return ``ints`` compressed with ``bitshuffle.compress_lz4``."""
        return bitshuffle.compress_lz4(ints)

    def decomp(self, compressed_data):
        """Decompress ``compressed_data`` into a new array shaped and typed like ``ints``."""
        shape, dtype = ints.shape, ints.dtype
        return bitshuffle.decompress_lz4(compressed_data, shape, dtype)
class Codec(object):
    """Benchmark adapter around a codec exposing ``encodeArray``/``decodeArray``.

    The data to compress is the module-level ``ints`` array; compressed output
    is written into the module-level ``comp`` buffer.

    When ``for_pagesize`` is set, the input is split into pages of that many
    values, each page's minimum is subtracted from its values, and the
    residuals are what the wrapped codec encodes.  The compressed layout is
    then::

        [num_pages, min_0, ..., min_{num_pages-1}, <codec output>]

    Without ``for_pagesize`` the buffer holds only the codec output.
    """

    def __init__(self, codec, for_pagesize=None):
        """Wrap ``codec``; ``for_pagesize`` (values per page) enables paging."""
        self.codec = codec
        self.for_pagesize = for_pagesize
        # Private decode target.  Decoding must not write into `ints`: that is
        # the input every other codec is measured on, and a codec that decodes
        # incorrectly (or raises midway) would corrupt all later measurements.
        # Allocated here so the allocation is not part of any timed call.
        self._dec_buf = np.empty_like(ints)

    def compress(self):
        """Compress ``ints`` and return the used prefix of ``comp``.

        The returned array is a view into the shared ``comp`` buffer, so it is
        overwritten by the next ``compress()`` call on any ``Codec``.
        """
        if self.for_pagesize:
            src_buf = tmp_buf
            num_pages = 0
            for page_start in range(0, len(ints), self.for_pagesize):
                page_end = page_start + self.for_pagesize
                page = ints[page_start:page_end]
                num_pages += 1
                # Header slot 0 is the page count, so minimums start at slot 1.
                comp[num_pages] = page.min()
                src_buf[page_start:page_end] = page - comp[num_pages]
            comp[0] = num_pages
            header_size = num_pages + 1
        else:
            src_buf = ints
            header_size = 0
        enc_dst = comp[header_size:]
        enc_len = self.codec.encodeArray(src_buf, len(src_buf), enc_dst, len(enc_dst))
        return comp[0:enc_len + header_size]

    def decomp(self, comp_buf):
        """Decode a buffer produced by ``compress()`` and return the values.

        The result is this instance's internal decode buffer: it is reused
        (and overwritten) by the next ``decomp()`` call on the same instance.
        The module-level ``ints`` array is left untouched.
        """
        if self.for_pagesize:
            # Plain int so the slice bounds below are ordinary Python ints.
            num_pages = int(comp_buf[0])
            mins = comp_buf[1:1 + num_pages]
            enc_buf = comp_buf[1 + num_pages:]
        else:
            num_pages = 0
            mins = None
            enc_buf = comp_buf
        out = self._dec_buf
        self.codec.decodeArray(enc_buf, len(enc_buf), out, len(out))
        # Add each page's minimum back; slicing clips the final short page.
        for i in range(num_pages):
            page_start = i * self.for_pagesize
            out[page_start:page_start + self.for_pagesize] += mins[i]
        return out
# Registry of benchmark candidates, keyed by display name.
codecs = {'bitshuffle': Bitshuffle()}

# Name fragments of codecs that are skipped: these seem to crash on some data.
_SKIPPED_FRAGMENTS = ('8b_rle', 'optpfor')

for codec_name in pyfastpfor.getCodecList():
    if any(fragment in codec_name for fragment in _SKIPPED_FRAGMENTS):
        continue
    # NOTE(review): the original indentation was lost; all three registrations
    # below are assumed to sit under the startswith() check -- confirm.
    if not codec_name.startswith('simdbinarypacking'):
        continue
    # Plain codec, then frame-of-reference variants with 128- and 256-value
    # pages.  Each entry gets its own codec object from getCodec().
    codecs[codec_name] = Codec(pyfastpfor.getCodec(codec_name))
    for page_size in (128, 256):
        variant_name = codec_name + "_for" + str(page_size)
        codecs[variant_name] = Codec(pyfastpfor.getCodec(codec_name), for_pagesize=page_size)
# Result table.  The float format is assigned through the attribute after
# construction rather than passed to the constructor: passed to the
# constructor it was observed not to take effect, whereas the attribute
# setter applies it to every column.  "4.4" is rendered as "%4.4f".
t = PrettyTable(['codec', 'comp_time', 'decom_time', 'bits_per_int'])
t.float_format = "4.4"
t.align = 'l'

# Benchmark at most the first 40 codecs, in name order.
for codec_name, codec in sorted(codecs.items())[:40]:
    try:
        # One untimed run to obtain the compressed buffer and its size.
        compressed_data = codec.compress()
        compressed_bytes = len(compressed_data) * compressed_data.dtype.itemsize
        bits_per_int = float(compressed_bytes) / len(ints) * 8
        # Each timing is the total for 10 runs, converted to milliseconds.
        comp_ms = Timer(codec.compress).timeit(number=10) * 1000
        decomp_ms = Timer(lambda: codec.decomp(compressed_data)).timeit(number=10) * 1000
        t.add_row([codec_name, comp_ms, decomp_ms, bits_per_int])
    except Exception as e:
        # A failing codec must not abort the whole run; report it and move on.
        print(codec_name, "FAIL", e)
print(t)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment