-
-
Save aqrit/ef84d284cebe861d9e57db4129bcafc3 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdint.h> // uint8_t, uint16_t, uint32_t, uint64_t | |
#include <string.h> // memcpy | |
#include <tmmintrin.h> // SSSE3 | |
// worst case: 17 bits per uint16_t | |
uint64_t short_compress_bound (uint32_t count) { | |
uint64_t n = count; | |
return (n << 1) + ((n + 7) >> 3); | |
} | |
const uint8_t* short_dec (uint16_t *restrict out, const uint8_t *restrict in, uint32_t count, uint16_t previous) | |
{ | |
const uint8_t *restrict data = &in[((uint64_t)count + 7) >> 3]; | |
if (count >= 64) { | |
const uint64_t kx01 = 0x0101010101010101; | |
const uint64_t kmul = kx01 | 0x80; | |
const uint64_t kx88 = kx01 * 0x88; | |
const __m128i idx = _mm_cvtsi64_si128(0x0F0E0D0C0B0A0908); | |
const __m128i roll = _mm_set1_epi16(128); | |
__m128i prev = _mm_insert_epi16(_mm_undefined_si128(), previous, 7); | |
data -= 16; | |
do { | |
uint64_t keys; | |
memcpy(&keys, in, 8); // unaligned load | |
in += 8; | |
for( int i = 0; i < 8; i++) { | |
uint64_t rank = (keys & kx01) * kmul; // prefix sum & put keys in sign bits | |
keys >>= 1; | |
data += 8 + ((rank + rank) >> 57); // length | |
__m128i shuf = _mm_unpacklo_epi8(idx, _mm_cvtsi64_si128(kx88 - rank)); // invert sign bits, get indices | |
__m128i src = _mm_loadu_si128((__m128i*)data); | |
src = _mm_shuffle_epi8(src, shuf); | |
// naive? delta decode | |
src = _mm_add_epi16(src, _mm_sub_epi16(_mm_srli_si128(prev, 14), roll)); | |
src = _mm_add_epi16(src, _mm_slli_si128(src, 2)); | |
src = _mm_add_epi16(src, _mm_slli_si128(src, 4)); | |
src = _mm_add_epi16(src, _mm_slli_si128(src, 8)); | |
prev = src; | |
_mm_storeu_si128((__m128i*)out, src); | |
out += 8; | |
} | |
count -= 64; | |
} while (count >= 64); | |
data += 16; | |
previous = _mm_extract_epi16(prev, 7); | |
} | |
// tail loop | |
if (count != 0) { | |
uint64_t keys = 0; | |
memcpy(&keys, in, (count + 7) >> 3); | |
data -= 2; | |
for (uint32_t i = 0; i < count; i++) { | |
uint64_t k = keys & 1; | |
keys >>= 1; | |
data += 1 + k; | |
uint16_t w; | |
memcpy(&w, data, 2); | |
w >>= ((k*8) ^ 8); | |
previous += (w - 128); | |
out[i] = previous; | |
} | |
data += 2; | |
} | |
return data; | |
} | |
uint8_t* short_enc (uint8_t *restrict out, const uint16_t *restrict in, uint32_t count, uint16_t previous) | |
{ | |
uint8_t *restrict data = &out[((uint64_t)count + 7) >> 3]; | |
if (count >= 64) { | |
const __m128i separate = _mm_set_epi8( | |
14, 12, 10, 8, 6, 4, 2, 0, | |
15, 13, 11, 9, 7, 5, 3, 1 | |
); | |
const __m128i roll = _mm_set1_epi16(128); | |
// bits[4:0] = index -> ((trit_d * 9) + (trit_c * 3) + (trit_b * 1) + (trit_a * 0)) | |
// bits[15:7] = popcnt | |
const __m128i sadmask = _mm_cvtsi64_si128(0x8989838381818080); | |
const __m128i neg1 = _mm_cmpeq_epi8(sadmask, sadmask); // used for bitwise-not | |
static const uint64_t table[27] = { // shuffle control indices | |
0x0000000000000001, 0x0000000000000103, 0x0000000000010203, 0x0000000000000105, | |
0x0000000000010305, 0x0000000001020305, 0x0000000000010405, 0x0000000001030405, | |
0x0000000102030405, 0x0000000000000107, 0x0000000000010307, 0x0000000001020307, | |
0x0000000000010507, 0x0000000001030507, 0x0000000102030507, 0x0000000001040507, | |
0x0000000103040507, 0x0000010203040507, 0x0000000000010607, 0x0000000001030607, | |
0x0000000102030607, 0x0000000001050607, 0x0000000103050607, 0x0000010203050607, | |
0x0000000104050607, 0x0000010304050607, 0x0001020304050607 | |
}; | |
__m128i prev = _mm_insert_epi16(_mm_undefined_si128(), previous, 7); | |
do{ | |
__m128i keys = _mm_setzero_si128(); | |
for (int i = 0; i < 8; i++) { | |
__m128i src = _mm_loadu_si128((__m128i*)in); | |
in += 8; | |
__m128i delta = _mm_sub_epi16(_mm_add_epi16(roll, src), _mm_alignr_epi8(src, prev, 14)); | |
prev = src; | |
delta = _mm_shuffle_epi8(delta, separate); | |
__m128i mask = _mm_cmpeq_epi8(_mm_setzero_si128(), delta); | |
keys = _mm_avg_epu8(keys, mask); | |
__m128i pack = _mm_or_si128(_mm_and_si128(_mm_slli_epi16(delta, 8), mask), delta); // reduce to 27 possibilities | |
uint64_t desc = _mm_cvtsi128_si64(_mm_sad_epu8(_mm_and_si128(mask, sadmask), sadmask)); // get index & popcnt | |
__m128i shuf = _mm_cvtsi64_si128(table[desc & 0x1F]); | |
_mm_storel_epi64((__m128i*)data, _mm_shuffle_epi8(pack, shuf)); | |
data += desc >> 7; | |
_mm_storeh_pi((__m64*)data, _mm_castsi128_ps(delta)); | |
data += 8; | |
} | |
_mm_storel_epi64((__m128i*)out, _mm_xor_si128(keys, neg1)); | |
out += 8; | |
count -= 64; | |
} while (count >= 64); | |
previous = _mm_extract_epi16(prev, 7); | |
} | |
// tail loop | |
if (count != 0) { | |
uint64_t keys = 0; | |
for (uint32_t i = 0; i < count; i++) { | |
uint16_t w = in[i]; | |
uint16_t d = (w + 128) - previous; | |
previous = w; | |
uint64_t k = !!(d > 255); | |
memcpy(data, &d, 2); | |
data += 1 + k; | |
keys |= k << i; | |
} | |
memcpy(out, &keys, (count + 7) >> 3); | |
} | |
return data; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment