Skip to content

Instantly share code, notes, and snippets.

@jd
Last active November 11, 2021 13:30
Show Gist options
  • Star 7 You must be signed in to star a gist
  • Fork 3 You must be signed in to fork a gist
  • Save jd/b0aa5cbfa42f4eb23eb9 to your computer and use it in GitHub Desktop.
Save jd/b0aa5cbfa42f4eb23eb9 to your computer and use it in GitHub Desktop.
# -*- encoding: utf-8 -*-
#
# Copyright © 2016 Red Hat, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
import struct
import bitarray
def double_to_int(f):
"""Convert a 64 bits float to a 64 bits integer."""
return struct.unpack("!Q", struct.pack("!d", f))[0]
def int_to_double(i):
"""Convert an integer to a 64 bits float."""
return struct.unpack("!d", struct.pack("!Q", i))[0]
def to_binary64(v):
"""Convert value to a binary representation on 64 bits as a string."""
return "{:064b}".format(v)
def count_lead_and_trail_zeroes(d):
"""Count the number of leading and trailing zeroes in an integer."""
as_str = "{:64b}".format(d)
try:
leading_zeroes = as_str.index("1")
trailing_zeroes = 63 - as_str.rindex("1")
except ValueError:
return 64, 64
return leading_zeroes, trailing_zeroes
def encode_xor_floats(floats):
"""Encode a list of floats using XOR encoding.
Encoding stolen from paper:
"Gorilla: A Fast, Scalable, In-Memory Time Series Database"
"""
if len(floats) == 0:
return b""
prev = double_to_int(floats[0])
encoded = bitarray.bitarray(to_binary64(prev))
prev_lz = prev_tz = 0
for v in floats[1:]:
v = double_to_int(v)
xor = prev ^ v
if xor == 0:
encoded.append(0)
else:
encoded.append(1)
lz, tz = count_lead_and_trail_zeroes(xor)
if lz >= 32:
lz = 31
if prev_lz != 0 and lz >= prev_lz and tz >= prev_tz:
encoded.append(0)
encoded.extend(
to_binary64(xor >> prev_tz)[- (64 - prev_lz - prev_tz):])
else:
prev_lz = lz
prev_tz = tz
encoded.append(1)
encoded.extend(to_binary64(lz)[-5:])
sigbits = 64 - lz - tz
encoded.extend(to_binary64(sigbits)[-6:])
encoded.extend(to_binary64(xor >> tz)[-sigbits:])
prev = v
return encoded.tobytes()
_ARRAY_3_ZERO = bitarray.bitarray('000')
_ARRAY_2_ZERO = bitarray.bitarray('00')
def decode_xor_floats(encoded, number_of_values):
if number_of_values == 0:
return []
array = bitarray.bitarray()
array.frombytes(encoded.tobytes())
results = []
# Read first value
results.append(struct.unpack("!d", array[:64].tobytes())[0])
index = 64
for _ in range(number_of_values - 1):
index += 1
if array[index - 1]:
index += 1
if array[index - 1]:
bits = _ARRAY_3_ZERO + array[index:index + 5]
leading = struct.unpack("!B", bits.tobytes())[0]
bits = _ARRAY_2_ZERO + array[index + 5:index + 11]
trailing = 64 - leading - struct.unpack(
"!B", bits.tobytes())[0]
index += 11
sigbits = 64 - leading - trailing
bits = bitarray.bitarray(
'0' * (64 - sigbits)) + array[index:index + sigbits]
num = struct.unpack("!Q", bits.tobytes())[0]
index += sigbits
vbits = double_to_int(results[-1])
vbits ^= num << trailing
results.append(int_to_double(vbits))
else:
results.append(results[-1])
return results
@BastianHavers
Copy link

Hello!
I am playing around with different compression schemes and happened upon your Python implementation of the Gorilla float compression. Nice that you made one :-)

I think I found a minor bug:

If the number of significant bits is 64 (the maximum), in line 82 it will be encoded as '000000' ('1000000' = 64, and then only the last 6 bits).
In line 114, where you decode the number of significant bits, this will then be read as 0 (zero) significant bits, so the decoding will be faulty.
This happened to me during my testing, but not on all files 64 significant bits are encountered, so it can be hard to spot.

During decoding, one should use
sigbits = 64 if sigbits == 0 else sigbits
e.g. by inserting it in line 117 (or maybe there is a more elegant way...).

Cheers!

@jd
Copy link
Author

jd commented Jun 7, 2021

Thank you @BastianHavers!

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment