Skip to content

Instantly share code, notes, and snippets.

@aqrit
Created July 28, 2019 23:29
Show Gist options
  • Save aqrit/ef84d284cebe861d9e57db4129bcafc3 to your computer and use it in GitHub Desktop.
Save aqrit/ef84d284cebe861d9e57db4129bcafc3 to your computer and use it in GitHub Desktop.
#include <stdint.h> // uint8_t, uint16_t, uint32_t, uint64_t
#include <string.h> // memcpy
#include <tmmintrin.h> // SSSE3
// worst case: 17 bits per uint16_t
uint64_t short_compress_bound (uint32_t count) {
uint64_t n = count;
return (n << 1) + ((n + 7) >> 3);
}
const uint8_t* short_dec (uint16_t *restrict out, const uint8_t *restrict in, uint32_t count, uint16_t previous)
{
const uint8_t *restrict data = &in[((uint64_t)count + 7) >> 3];
if (count >= 64) {
const uint64_t kx01 = 0x0101010101010101;
const uint64_t kmul = kx01 | 0x80;
const uint64_t kx88 = kx01 * 0x88;
const __m128i idx = _mm_cvtsi64_si128(0x0F0E0D0C0B0A0908);
const __m128i roll = _mm_set1_epi16(128);
__m128i prev = _mm_insert_epi16(_mm_undefined_si128(), previous, 7);
data -= 16;
do {
uint64_t keys;
memcpy(&keys, in, 8); // unaligned load
in += 8;
for( int i = 0; i < 8; i++) {
uint64_t rank = (keys & kx01) * kmul; // prefix sum & put keys in sign bits
keys >>= 1;
data += 8 + ((rank + rank) >> 57); // length
__m128i shuf = _mm_unpacklo_epi8(idx, _mm_cvtsi64_si128(kx88 - rank)); // invert sign bits, get indices
__m128i src = _mm_loadu_si128((__m128i*)data);
src = _mm_shuffle_epi8(src, shuf);
// naive? delta decode
src = _mm_add_epi16(src, _mm_sub_epi16(_mm_srli_si128(prev, 14), roll));
src = _mm_add_epi16(src, _mm_slli_si128(src, 2));
src = _mm_add_epi16(src, _mm_slli_si128(src, 4));
src = _mm_add_epi16(src, _mm_slli_si128(src, 8));
prev = src;
_mm_storeu_si128((__m128i*)out, src);
out += 8;
}
count -= 64;
} while (count >= 64);
data += 16;
previous = _mm_extract_epi16(prev, 7);
}
// tail loop
if (count != 0) {
uint64_t keys = 0;
memcpy(&keys, in, (count + 7) >> 3);
data -= 2;
for (uint32_t i = 0; i < count; i++) {
uint64_t k = keys & 1;
keys >>= 1;
data += 1 + k;
uint16_t w;
memcpy(&w, data, 2);
w >>= ((k*8) ^ 8);
previous += (w - 128);
out[i] = previous;
}
data += 2;
}
return data;
}
uint8_t* short_enc (uint8_t *restrict out, const uint16_t *restrict in, uint32_t count, uint16_t previous)
{
uint8_t *restrict data = &out[((uint64_t)count + 7) >> 3];
if (count >= 64) {
const __m128i separate = _mm_set_epi8(
14, 12, 10, 8, 6, 4, 2, 0,
15, 13, 11, 9, 7, 5, 3, 1
);
const __m128i roll = _mm_set1_epi16(128);
// bits[4:0] = index -> ((trit_d * 9) + (trit_c * 3) + (trit_b * 1) + (trit_a * 0))
// bits[15:7] = popcnt
const __m128i sadmask = _mm_cvtsi64_si128(0x8989838381818080);
const __m128i neg1 = _mm_cmpeq_epi8(sadmask, sadmask); // used for bitwise-not
static const uint64_t table[27] = { // shuffle control indices
0x0000000000000001, 0x0000000000000103, 0x0000000000010203, 0x0000000000000105,
0x0000000000010305, 0x0000000001020305, 0x0000000000010405, 0x0000000001030405,
0x0000000102030405, 0x0000000000000107, 0x0000000000010307, 0x0000000001020307,
0x0000000000010507, 0x0000000001030507, 0x0000000102030507, 0x0000000001040507,
0x0000000103040507, 0x0000010203040507, 0x0000000000010607, 0x0000000001030607,
0x0000000102030607, 0x0000000001050607, 0x0000000103050607, 0x0000010203050607,
0x0000000104050607, 0x0000010304050607, 0x0001020304050607
};
__m128i prev = _mm_insert_epi16(_mm_undefined_si128(), previous, 7);
do{
__m128i keys = _mm_setzero_si128();
for (int i = 0; i < 8; i++) {
__m128i src = _mm_loadu_si128((__m128i*)in);
in += 8;
__m128i delta = _mm_sub_epi16(_mm_add_epi16(roll, src), _mm_alignr_epi8(src, prev, 14));
prev = src;
delta = _mm_shuffle_epi8(delta, separate);
__m128i mask = _mm_cmpeq_epi8(_mm_setzero_si128(), delta);
keys = _mm_avg_epu8(keys, mask);
__m128i pack = _mm_or_si128(_mm_and_si128(_mm_slli_epi16(delta, 8), mask), delta); // reduce to 27 possibilities
uint64_t desc = _mm_cvtsi128_si64(_mm_sad_epu8(_mm_and_si128(mask, sadmask), sadmask)); // get index & popcnt
__m128i shuf = _mm_cvtsi64_si128(table[desc & 0x1F]);
_mm_storel_epi64((__m128i*)data, _mm_shuffle_epi8(pack, shuf));
data += desc >> 7;
_mm_storeh_pi((__m64*)data, _mm_castsi128_ps(delta));
data += 8;
}
_mm_storel_epi64((__m128i*)out, _mm_xor_si128(keys, neg1));
out += 8;
count -= 64;
} while (count >= 64);
previous = _mm_extract_epi16(prev, 7);
}
// tail loop
if (count != 0) {
uint64_t keys = 0;
for (uint32_t i = 0; i < count; i++) {
uint16_t w = in[i];
uint16_t d = (w + 128) - previous;
previous = w;
uint64_t k = !!(d > 255);
memcpy(data, &d, 2);
data += 1 + k;
keys |= k << i;
}
memcpy(out, &keys, (count + 7) >> 3);
}
return data;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment