Skip to content

Instantly share code, notes, and snippets.

@matteobertozzi
Created November 19, 2023 14:14
Show Gist options
  • Save matteobertozzi/ab9fc242b75d79593bed56469e85bcf2 to your computer and use it in GitHub Desktop.
Save matteobertozzi/ab9fc242b75d79593bed56469e85bcf2 to your computer and use it in GitHub Desktop.
Encode an Array of numbers using a bitmap + int-pack4 (good for arrays with mostly zeros)
// Encode an Array of numbers using a bitmap + int-pack4
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import io.github.matteobertozzi.rednaco.bytes.encoding.IntDecoder;
import io.github.matteobertozzi.rednaco.bytes.encoding.IntEncoder;
import io.github.matteobertozzi.rednaco.bytes.encoding.IntUtil;
public final class MostlyZeroEncoder {
private MostlyZeroEncoder() {
// no-op
}
public static byte[] encode(final int[] data) throws Exception {
final int bitmapLength = (data.length + 7) >> 3;
try (ByteArrayOutputStream stream = new ByteArrayOutputStream(bitmapLength + data.length)) {
IntEncoder.LITTLE_ENDIAN.writeFixed32(stream, data.length);
writeBitmap(stream, data);
writeIntPack4(stream, data);
return stream.toByteArray();
}
}
private static void writeBitmap(final ByteArrayOutputStream stream, final int[] data) {
for (int i = 0; i < data.length;) {
int bitmapByte = 0;
for (int bitmapBitIndex = 7; i < data.length && bitmapBitIndex >= 0; ++i, --bitmapBitIndex) {
if (data[i] != 0) {
bitmapByte |= (byte)(1 << bitmapBitIndex);
}
}
stream.write(bitmapByte & 0xff);
}
}
private static void writeIntPack4(final ByteArrayOutputStream stream, final int[] data) throws Exception {
// 6 4 2 0
// | 11 | 11 | 11 | 11 |
int head = 0;
int blockCount = 0;
final int[] block = new int[4];
for (int i = 0; i < data.length; ++i) {
if (data[i] == 0) continue;
final int size = IntUtil.size(data[i]);
head |= (size - 1) << (6 - (blockCount << 1));
block[blockCount++] = data[i];
if (blockCount == 4) {
writeIntPack4(stream, head, blockCount, block);
blockCount = 0;
head = 0;
}
}
if (blockCount != 0) {
writeIntPack4(stream, head, blockCount, block);
}
}
private static void writeIntPack4(final ByteArrayOutputStream stream, final int head, final int blockCount, final int[] block) throws Exception {
stream.write(head);
for (int k = 0; k < blockCount; ++k) {
final int bytesWidth = 1 + ((head >> (6 - (k << 1))) & 3);
IntEncoder.LITTLE_ENDIAN.writeFixed(stream, block[k], bytesWidth);
}
}
public static int[] decode(final byte[] enc) throws IOException {
if (enc == null) return null;
final int length = IntDecoder.LITTLE_ENDIAN.readFixed32(enc, 0);
if (length == 0) return new int[0];
final int bitmapLength = (length + 7) >> 3;
int nonZero = 0;
for (int i = 0; i < bitmapLength; ++i) {
nonZero += Integer.bitCount(enc[4 + i] & 0xff);
}
int valueIndex = 0;
final int[] values = new int[nonZero];
int valuesOffset = 4 + bitmapLength;
while (valuesOffset < enc.length) {
final int head = enc[valuesOffset++] & 0xff;
for (int k = 0; k < 4 && valueIndex < nonZero; ++k) {
final int bytesWidth = 1 + ((head >> (6 - (k << 1))) & 3);
values[valueIndex++] = (int) IntDecoder.LITTLE_ENDIAN.readFixed(enc, valuesOffset, bytesWidth);
valuesOffset += bytesWidth;
}
}
if (valueIndex != values.length) {
throw new IllegalStateException();
}
if (valuesOffset != enc.length) {
throw new IllegalStateException();
}
valueIndex = 0;
final int[] data = new int[length];
for (int i = 0, index = 0; i < bitmapLength; ++i) {
final int bitmapByte = enc[4 + i] & 0xff;
for (int k = 7; k >= 0 && index < length; --k) {
if ((bitmapByte & (1 << k)) != 0) {
data[index++] = values[valueIndex++];
} else {
data[index++] = 0;
}
}
}
return data;
}
}
# Encode an Array of numbers using a bitmap + int-pack4
import io
def int_bytes_width(v: int) -> int:
return (v.bit_length() + 7) // 8 if v != 0 else 1
def _write_bitmap(stream: io.BytesIO, data: list[int]):
length = len(data)
index = 0
while index < length:
bitmap_byte = 0
bitmap_bit_index = 7
while index < length and bitmap_bit_index >= 0:
if data[index] != 0:
bitmap_byte |= 1 << bitmap_bit_index
bitmap_bit_index -= 1
index += 1
stream.write(bytes([bitmap_byte]))
assert index == len(data)
def _write_int_pack4_block(stream: io.BytesIO, head: int, block: list[int]):
stream.write(bytes([head]))
for k in range(len(block)):
bytes_width = 1 + ((head >> (6 - (k << 1))) & 3)
stream.write(block[k].to_bytes(bytes_width, 'little'))
def _write_int_pack4(stream: io.BytesIO, data: list[int]):
# 6 4 2 0
# | 11 | 11 | 11 | 11 |
head = 0
block = []
for v in data:
if v == 0: continue
size = int_bytes_width(v)
head |= (size - 1) << (6 - (len(block) << 1))
block.append(v)
if len(block) == 4:
_write_int_pack4_block(stream, head, block)
block.clear()
head = 0
if block:
_write_int_pack4_block(stream, head, block)
def data_encode(data: list[int]) -> bytes:
stream = io.BytesIO()
stream.write(len(data).to_bytes(4, 'little'))
_write_bitmap(stream, data)
_write_int_pack4(stream, data)
return stream.getvalue()
def data_decode(enc: bytes) -> list[int]:
length = int.from_bytes(enc[0:4], 'little')
if length == 0: return []
bitmap_length = ((length + 7) >> 3)
non_zero = 0
for i in range(bitmap_length):
non_zero += (enc[4 + i] & 0xff).bit_count()
values = []
values_offset = 4 + bitmap_length
eof_offset = len(enc)
while values_offset < eof_offset:
head = enc[values_offset] & 0xff
values_offset += 1
for k in range(4):
if len(values) >= non_zero:
break
bytes_width = 1 + ((head >> (6 - (k << 1))) & 3)
values.append(int.from_bytes(enc[values_offset:values_offset+bytes_width], 'little'))
values_offset += bytes_width
assert len(values) == non_zero
assert values_offset == eof_offset
data = []
value_index = 0;
for i in range(bitmap_length):
bitmap_byte = enc[4 + i] & 0xff
for k in range(7, -1, -1):
if len(data) >= length:
break
if ((bitmap_byte & (1 << k)) != 0):
data.append(values[value_index])
value_index += 1
else:
data.append(0)
assert len(data) == length
assert value_index == non_zero
return data
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment