/*
 * StreamVByte encode/decode macros (branchless, scalar/SWAR).
 * Source: GitHub gist by @aqrit (f666e05362bc75376ac59a34879fad4f),
 * last active April 10, 2020.
 */
#include <stddef.h>
#include <stdint.h>
/// Encoding option flags for STREAMVBYTE_ENCODE / STREAMVBYTE_DECODE.
/// Combine with bitwise OR; decode must use the same flags as encode.
#define SVB_NORMAL 0           /* 1..4 data bytes per value (keys 0b00..0b11) */
#define SVB_SPECIALCASEZERO 1  /* 0/1/2/4 data bytes; key 0b00 stores nothing */
#define SVB_DELTAASCENDING 2   /* delta-encode a mostly ascending sequence */
#define SVB_DELTADESCENDING 4  /* delta-encode a mostly descending sequence */
#define SVB_VARSIGNEXTEND 8    /* truncate redundant sign-extension bytes */
#define SVB_ZIGZAG 16          /* zigzag-map signed values before sizing */
/*
SVB_NORMAL:
| key  | cb | ranges        | level shifted ranges   |
|------|----|---------------|------------------------|
| 0b00 | 1  | 0..0x000000FF | 0xFFFFFF80..0x0000007F |
| 0b01 | 2  | 0..0x0000FFFF | 0xFFFF8000..0x00007FFF |
| 0b10 | 3  | 0..0x00FFFFFF | 0xFF800000..0x007FFFFF |
| 0b11 | 4  | 0..0xFFFFFFFF | 0x80000000..0x7FFFFFFF |
SVB_SPECIALCASEZERO:
| key  | cb | ranges        | level shifted ranges   |
|------|----|---------------|------------------------|
| 0b00 | 0  | 0..0x00000000 | 0x00000000..0x00000000 |
| 0b01 | 1  | 0..0x000000FF | 0xFFFFFF80..0x0000007F |
| 0b10 | 2  | 0..0x0000FFFF | 0xFFFF8000..0x00007FFF |
| 0b11 | 4  | 0..0xFFFFFFFF | 0x80000000..0x7FFFFFFF |
("cb" = count of data bytes stored per value; "level shifted" is the
signed range representable when SVB_VARSIGNEXTEND or SVB_ZIGZAG is set.)
SVB_DELTAASCENDING:
delta encoding for input that is mostly ordered
from the smallest to the largest number
SVB_DELTADESCENDING:
delta encoding for input that is mostly ordered
from the largest to the smallest number
SVB_VARSIGNEXTEND:
level-shifts the ranges by storing only the bytes below the first bit
flip nearest the MSB, i.e. redundant sign-extension bytes are truncated
(similar in effect to zigzag encoding but it uses a different strategy)
SVB_ZIGZAG:
level-shifts the ranges by interleaving signed values into unsigned
ones (0 -> 0, -1 -> 1, 1 -> 2, -2 -> 3, ...) before sizing
*/
/**
 * STREAMVBYTE_ENCODE - encode `count` uint32 values into a StreamVByte stream.
 *
 * Output layout: a key block of SVB_KEY_BLOCK_SIZE(count) bytes (four 2-bit
 * keys per byte) immediately followed by the variable-sized data block.
 *
 * Macro pitfalls:
 * - All [in,out] arguments must NOT be an expression; each is read and
 *   written back (dest/src are advanced, prev is updated).
 * - `flags` should be an integer constant expression so the unused
 *   branches fold away at compile time.
 * - Values are written with unconditional 4-byte stores that are then
 *   advanced by only the encoded length, so the output buffer needs slack
 *   after the final element — NOTE(review): looks like up to 3 bytes
 *   (up to 4 with SVB_SPECIALCASEZERO, where a length may be 0); confirm
 *   against callers before sizing buffers.
 *
 * @param [in,out] dest  - uint8_t array; advanced past the encoded stream
 * @param [in,out] src   - const uint32_t array; advanced past the input
 * @param [in]     count - number of elements in src array
 * @param [in]     flags - bitwise OR of the SVB_* flags above
 * @param [in,out] prev  - u32 base to start delta ops (if delta flag);
 *                         left holding the last source value
 */
#define STREAMVBYTE_ENCODE(dest, src, count, flags, prev) \
do { \
uint8_t* key_ptr = dest; \
size_t cnt = (count); \
const uint32_t* end; \
dest += SVB_KEY_BLOCK_SIZE(cnt); /* start of data block */ \
\
/* encode 4 elements and produce 1 key byte per iteration */ \
for (end = &src[(cnt | 3) ^ 3]; src != end; src += 4) { \
uint32_t v0, v1, v2, v3; /* values */ \
uint32_t k0, k1, k2, k3; /* keys */ \
uint32_t l0, l1, l2, l3; /* lengths */ \
\
if (SVB_DELTAASCENDING & (flags)) { \
v0 = src[0] - prev; \
v1 = src[1] - src[0]; \
v2 = src[2] - src[1]; \
v3 = src[3] - src[2]; \
prev = src[3]; \
} else { \
if (SVB_DELTADESCENDING & (flags)) { \
v0 = prev; \
v1 = src[0]; \
v2 = src[1]; \
v3 = src[2]; \
prev = src[3]; \
\
v0 -= v1; \
v1 -= v2; \
v2 -= v3; \
v3 -= prev; \
} else { \
v0 = src[0]; \
v1 = src[1]; \
v2 = src[2]; \
v3 = src[3]; \
} \
} \
\
if (SVB_ZIGZAG & (flags)) { \
v0 = (v0 + v0) ^ -(v0 >> 31); \
v1 = (v1 + v1) ^ -(v1 >> 31); \
v2 = (v2 + v2) ^ -(v2 >> 31); \
v3 = (v3 + v3) ^ -(v3 >> 31); \
} \
k0 = v0; \
k1 = v1; \
k2 = v2; \
k3 = v3; \
\
if (SVB_VARSIGNEXTEND & (flags)) { \
/* find where the "extended" sign bits can be truncated */ \
/* aka. find the bit flip nearest to the MSB */ \
/* aka. the first part of __builtin_clrsb() */ \
k0 ^= k0 + k0; \
k1 ^= k1 + k1; \
k2 ^= k2 + k2; \
k3 ^= k3 + k3; \
} \
\
if (SVB_SPECIALCASEZERO & (flags)) { \
/* smear left... before SVB_KEY op */ \
k0 |= (k0 << 8); \
k1 |= (k1 << 8); \
k2 |= (k2 << 8); \
k3 |= (k3 << 8); \
} \
SVB_KEY(k0); \
SVB_KEY(k1); \
SVB_KEY(k2); \
SVB_KEY(k3); \
if (SVB_SPECIALCASEZERO & (flags)) { \
l0 = k0 + (k0 >= 3); /* (k*5 + 1) >> 2 */ \
l1 = k1 + (k1 >= 3); /* ((k >> 1) & k) + k */ \
l2 = k2 + (k2 >= 3); \
l3 = k3 + (k3 >= 3); \
} else { \
l0 = k0 + 1; \
l1 = k1 + 1; \
l2 = k2 + 1; \
l3 = k3 + 1; \
} \
\
SVB_PUT_LE32(dest, v0); \
dest += l0; \
SVB_PUT_LE32(dest, v1); \
dest += l1; \
SVB_PUT_LE32(dest, v2); \
dest += l2; \
SVB_PUT_LE32(dest, v3); \
dest += l3; \
\
*key_ptr++ = (k3 << 6) | (k2 << 4) | ((k1 * 4) + k0); \
} \
\
/* tail loop */ \
cnt &= 3; \
if (cnt != 0) { \
uint32_t key = 0; \
uint32_t shift = 0; \
end = &src[cnt]; \
do { \
uint32_t v; \
if (SVB_DELTAASCENDING & (flags)) { \
v = src[0] - prev; \
prev = src[0]; \
} else { \
if (SVB_DELTADESCENDING & (flags)) { \
v = prev; \
prev = src[0]; \
v -= prev; \
} else { \
v = src[0]; \
} \
} \
if (SVB_ZIGZAG & (flags)) v = (v + v) ^ -(v >> 31); \
SVB_PUT_LE32(dest, v); \
if (SVB_VARSIGNEXTEND & (flags)) v ^= v + v; \
if (SVB_SPECIALCASEZERO & (flags)) v |= (v << 8); \
SVB_KEY(v); \
if (SVB_SPECIALCASEZERO & (flags)) { \
dest += v + (v >= 3); \
} else { \
dest += v + 1; \
} \
key |= (v << shift); \
shift += 2; \
} while (++src != end); \
*key_ptr = key; \
} \
} while (0)
/**
 * STREAMVBYTE_DECODE - decode `count` uint32 values from a StreamVByte stream.
 *
 * Macro pitfalls:
 * - All [in,out] arguments must NOT be an expression; each is read and
 *   written back (dest/src are advanced, prev is updated).
 * - `flags` should be an integer constant expression, and must match the
 *   flags the stream was encoded with.
 * - The fast path (count > 12) backs `src` up by 4 and performs overlapping
 *   4-byte reads, using the key block as padding so reads never fall before
 *   the start of the stream (hence the count > 12 gate: it guarantees at
 *   least 4 key bytes). The tail loop's 3-byte case likewise reads one byte
 *   before the current data position, which is covered by the key block.
 *
 * @param [in,out] dest  - uint32_t array; advanced past the decoded output
 * @param [in,out] src   - const uint8_t array; advanced past the stream
 * @param [in]     count - number of elements encoded into src stream
 * @param [in]     flags - bitwise OR of the SVB_* flags (same as at encode)
 * @param [in,out] prev  - u32 base to start delta ops (if delta flag)
 */
#define STREAMVBYTE_DECODE(dest, src, count, flags, prev) \
do { \
const uint8_t* key_ptr = src; \
size_t cnt = (count); \
src += SVB_KEY_BLOCK_SIZE(cnt); /* start of data block */ \
\
if (cnt > 12) { /* use key chunk as padding */ \
const uint32_t* end = &dest[(cnt | 3) ^ 3]; \
src -= 4; \
do { \
const uint32_t mask = 0x401041; \
uint32_t v0, v1, v2, v3; /* values */ \
uint32_t s0, s1, s2, s3; /* shift amounts */ \
uint32_t l0, l1, l2, l3; /* lengths */ \
uint32_t k = *key_ptr++; /* key bits */ \
\
/* calc packed sizes and shift amounts (using swar) */ \
if (SVB_SPECIALCASEZERO & (flags)) { \
if (SVB_VARSIGNEXTEND & (flags)) { \
k = (k * 0x00101004) & mask*12; \
k = k*5 + mask*4; \
k = ((k + mask*4) & mask*24) | ((k >> 4) & mask*7);\
} else { \
k = k * 0x00040401 & mask*3; \
k = (k*9 + mask*5 ) & mask*0x38; \
k = (mask*32 - k) | (k >> 3); \
} \
} else { \
k = k * 0x00040401 & mask*3; \
k = (k*9 + mask) ^ mask*24; \
} \
\
/* extract */ \
l0 = k & 7; \
l1 = (k >> 12) & 7; \
l2 = (k >> 22) & 7; \
l3 = (k >> 6) & 7; \
s0 = k & 0x38; \
s1 = (k >> 12) & 0x38; \
s2 = (k >> 22) & 0x38; \
s3 = (k >> 6) & 0x38; \
\
/* overlapping reads */ \
src += l0; \
SVB_GET_LE32(src, v0); \
src += l1; \
SVB_GET_LE32(src, v1); \
src += l2; \
SVB_GET_LE32(src, v2); \
src += l3; \
SVB_GET_LE32(src, v3); \
\
if (SVB_SPECIALCASEZERO & (flags)) { \
if (SVB_VARSIGNEXTEND & (flags)) { \
/* if length is zero then zero the upper bits */ \
v0 &= -l0 | 7; \
v1 &= -l1 | 7; \
v2 &= -l2 | 7; \
v3 &= -l3 | 7; \
} \
} \
\
/* shift-off unwanted bytes */ \
if (SVB_VARSIGNEXTEND & (flags)) { \
/* implementation-defined behavior (sticky-sign-bit) */ \
v0 = (int32_t)v0 >> s0; \
v1 = (int32_t)v1 >> s1; \
v2 = (int32_t)v2 >> s2; \
v3 = (int32_t)v3 >> s3; \
} else { \
if (SVB_SPECIALCASEZERO & (flags)) { \
/* may shift right by 32 to get a zero value */ \
v0 = (uint64_t)v0 >> s0; \
v1 = (uint64_t)v1 >> s1; \
v2 = (uint64_t)v2 >> s2; \
v3 = (uint64_t)v3 >> s3; \
} else { \
v0 >>= s0; \
v1 >>= s1; \
v2 >>= s2; \
v3 >>= s3; \
} \
} \
\
/* recover */ \
if (SVB_ZIGZAG & (flags)) { \
v0 = (v0 >> 1) ^ -(v0 & 1); \
v1 = (v1 >> 1) ^ -(v1 & 1); \
v2 = (v2 >> 1) ^ -(v2 & 1); \
v3 = (v3 >> 1) ^ -(v3 & 1); \
} \
if (SVB_DELTAASCENDING & (flags)) { \
v0 += prev; \
v1 += v0; \
v2 += v1; \
v3 += v2; \
prev = v3; \
} else { \
if (SVB_DELTADESCENDING & (flags)) { \
prev -= v0; v0 = prev; \
prev -= v1; v1 = prev; \
prev -= v2; v2 = prev; \
prev -= v3; v3 = prev; \
} \
} \
\
/* store */ \
dest[0] = v0; \
dest[1] = v1; \
dest[2] = v2; \
dest[3] = v3; \
dest += 4; \
} while (dest != end); \
\
cnt &= 3; \
src += 4; \
} \
\
/* tail loop */ \
if (cnt != 0) { \
uint32_t keys = key_ptr[0]; \
if (cnt > 4) { \
keys |= key_ptr[1] << 8; \
if (cnt > 8) { \
keys |= key_ptr[2] << 16; \
} \
} \
do { \
uint32_t v; \
uint32_t k = keys & 3; \
keys >>= 2; \
if (SVB_SPECIALCASEZERO & (flags)) { \
if (SVB_VARSIGNEXTEND & (flags)) { \
switch (k) { \
case 3: \
SVB_GET_LE32(src, v); \
src += 4; \
break; \
case 2: \
SVB_GET_LE16(src, v); \
v = (int16_t)v; \
src += 2; \
break; \
case 1: \
v = (int8_t)*src++; \
break; \
case 0: v = 0; \
} \
} else { \
switch (k) { \
case 3: \
SVB_GET_LE32(src, v); \
src += 4; \
break; \
case 2: \
SVB_GET_LE16(src, v); \
src += 2; \
break; \
case 1: \
v = *src++; \
break; \
case 0: \
v = 0; \
} \
} \
} else { \
if (SVB_VARSIGNEXTEND & (flags)) { \
switch (k) { \
case 3: \
SVB_GET_LE32(src, v); \
src += 4; \
break; \
case 2: \
SVB_GET_LE32(src - 1, v); \
v = (int32_t)v >> 8; \
src += 3; \
break; \
case 1: \
SVB_GET_LE16(src, v); \
v = (int16_t)v; \
src += 2; \
break; \
case 0: \
v = (int8_t)*src++; \
} \
} else { \
switch (k) { \
case 3: \
SVB_GET_LE32(src, v); \
src += 4; \
break; \
case 2: \
SVB_GET_LE32(src - 1, v); \
v >>= 8; \
src += 3; \
break; \
case 1: \
SVB_GET_LE16(src, v); \
src += 2; \
break; \
case 0: \
v = *src++; \
} \
} \
} \
\
if (SVB_ZIGZAG & (flags)) v = (v >> 1) ^ -(v & 1); \
\
if (SVB_DELTAASCENDING & (flags)) { \
prev += v; v = prev; \
} else { \
if (SVB_DELTADESCENDING & (flags)) { \
prev -= v; v = prev; \
} \
} \
\
*dest++ = v; \
} while(--cnt); \
} \
} while (0)
/**
 * Number of bytes occupied by the key block for `count` encoded elements.
 *
 * Every encoded element is represented by a 2-bit key. All keys are
 * packed into a block that precedes the variable sized data block.
 * The key block is padded out to a byte boundary.
 *
 * Fix: `count` is now fully parenthesized, so the macro also evaluates
 * correctly when the argument is an expression (previously `count >> 2`
 * and `count & 3` bound the surrounding operators incorrectly, e.g. for
 * `a + b`). Note: `count` is still evaluated more than once, so it must
 * not have side effects.
 */
#define SVB_KEY_BLOCK_SIZE(count) (((count) >> 2) + (((count) & 3) != 0))
/**
 * SVB_KEY(v): replace `v` (in place) with the index of its highest set
 * byte, 0..3. (If no bytes above byte 0 are set, the result is zero.)
 *
 * If the cpu supports a FAST way to count leading zero bits
 * then it should be used. Otherwise, emulating a __builtin_clz()
 * operation is slower than necessary because only byte precision
 * is required.
 *
 * Note: BSR is slow on AMD, LZCNT requires BMI1, and who knows w/ARM.
 *
 * Fix: the fallback previously expanded to TWO statements, so using it
 * as the body of an unbraced `if` would silently execute the second
 * statement unconditionally. It is now wrapped in do/while(0), and the
 * argument is parenthesized in both variants. `v` must be an lvalue of
 * type uint32_t without side effects (it is evaluated more than once).
 */
#if HAVE_FAST_CLZ && defined(__GNUC__) && (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4))
# define SVB_KEY(v) \
((v) = (((uint32_t)__builtin_clz((v) | 1) ^ 31) >> 3))
#else
# define SVB_KEY(v) \
do { \
(v) >>= 8; /* only byte precision is needed */ \
(v) = (((((v) + 0xFFFF00) | 0xFFFFFF) + (v)) | ((v) + 0xFF0000)) >> 24; \
} while (0)
#endif
/// Load 16 bits from an unaligned location in little-endian byte order.
/// `ptr` may legitimately point to const data (the decoder passes its
/// `const uint8_t*` stream pointer), so the local pointer is now
/// const-qualified instead of casting the qualifier away.
/// `ptr` and `v` are each evaluated exactly once.
#define SVB_GET_LE16(ptr, v) \
do { \
const uint8_t* p_ = (const uint8_t*)(ptr); \
uint16_t v_ = (((uint16_t)p_[0])) | (((uint16_t)p_[1]) << 8); \
(v) = v_; \
} while (0)
/// Load 32 bits from an unaligned location in little-endian byte order.
/// `ptr` may legitimately point to const data (the decoder passes its
/// `const uint8_t*` stream pointer), so the local pointer is now
/// const-qualified instead of casting the qualifier away.
/// `ptr` and `v` are each evaluated exactly once.
#define SVB_GET_LE32(ptr, v) \
do { \
const uint8_t* p_ = (const uint8_t*)(ptr); \
uint32_t v_ = \
(((uint32_t)p_[0])) | \
(((uint32_t)p_[1]) << 8) | \
(((uint32_t)p_[2]) << 16) | \
(((uint32_t)p_[3]) << 24); \
(v) = v_; \
} while (0)
/// Store 16 bits to an unaligned location in little-endian byte order.
/// `v` and `ptr` are each evaluated exactly once.
#define SVB_PUT_LE16(ptr, v) \
do { \
uint16_t word_ = (v); \
uint8_t* out_ = (uint8_t*)(ptr); \
out_[0] = (uint8_t)(word_ & 0xFF); \
out_[1] = (uint8_t)(word_ >> 8); \
} while (0)
/// Store 32 bits to an unaligned location in little-endian byte order.
/// `v` and `ptr` are each evaluated exactly once.
#define SVB_PUT_LE32(ptr, v) \
do { \
uint32_t word_ = (v); \
uint8_t* out_ = (uint8_t*)(ptr); \
int i_; \
for (i_ = 0; i_ < 4; i_++) { \
out_[i_] = (uint8_t)(word_ >> (8 * i_)); \
} \
} while (0)
/* (GitHub gist page footer removed.) */