/*
 * streamvbyte unrolled encoder trial
 *
 * Source: GitHub gist aqrit/8fb615a05586d023e07cbd997cfcb6f9.
 *
 * Note from the hosting page: this file contains bidirectional Unicode text
 * that may be interpreted or compiled differently than it appears. To review,
 * open the file in an editor that reveals hidden Unicode characters.
 */
#include <inttypes.h> // PRIu64
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h> // srand, rand, EXIT_SUCCESS, EXIT_FAILURE
#include <string.h>

#include <smmintrin.h> // sse41

#include "streamvbyte_shuffle_tables.h"
/*
 * RDTSC_START(cycles) - take the opening time-stamp-counter reading of a
 * timed region (x86-64, GCC/Clang inline asm).
 *
 * The asm runs a cpuid/rdtsc pair three times and keeps only the EDX:EAX
 * value produced by the last rdtsc; the earlier pairs are presumably a
 * warm-up so the final pair has a steadier cost. cpuid is placed before
 * rdtsc to act as a serializing fence, so earlier instructions cannot
 * drift into the timed region.
 *
 * `cycles` must be an lvalue able to hold a uint64_t; it receives
 * (EDX << 32) | EAX. The clobber list names every register cpuid writes,
 * and "memory" keeps the compiler from moving memory accesses across the
 * measurement point.
 */
#define RDTSC_START(cycles)                                     \
    do {                                                        \
        register unsigned cyc_high, cyc_low;                    \
        __asm volatile("cpuid\n\t"                              \
                       "rdtsc\n\t"                              \
                       "cpuid\n\t"                              \
                       "rdtsc\n\t"                              \
                       "cpuid\n\t"                              \
                       "rdtsc\n\t"                              \
                       "mov %%edx, %0\n\t"                      \
                       "mov %%eax, %1\n\t"                      \
                       : "=r"(cyc_high), "=r"(cyc_low)          \
                       ::"%rax", "%rbx", "%rcx", "%rdx", "memory"); \
        (cycles) = ((uint64_t)cyc_high << 32) | cyc_low;        \
    } while (0)
/*
 * RDTSC_FINAL(cycles) - take the closing time-stamp-counter reading of a
 * timed region (x86-64, GCC/Clang inline asm).
 *
 * rdtscp reads the counter into EDX:EAX (and also writes ECX); the two
 * halves are copied out before the trailing cpuid overwrites those
 * registers. The cpuid after the read acts as a fence so that later
 * instructions are not started before the counter has been sampled.
 *
 * `cycles` must be an lvalue able to hold a uint64_t; it receives
 * (EDX << 32) | EAX. The clobber list names every register written by
 * rdtscp/cpuid, and "memory" keeps the compiler from moving memory accesses
 * across the measurement point.
 */
#define RDTSC_FINAL(cycles)                                     \
    do {                                                        \
        register unsigned cyc_high, cyc_low;                    \
        __asm volatile("rdtscp\n\t"                             \
                       "mov %%edx, %0\n\t"                      \
                       "mov %%eax, %1\n\t"                      \
                       "cpuid\n\t"                              \
                       : "=r"(cyc_high), "=r"(cyc_low)          \
                       ::"%rax", "%rbx", "%rcx", "%rdx", "memory"); \
        (cycles) = ((uint64_t)cyc_high << 32) | cyc_low;        \
    } while (0)
size_t svb_encode16(uint32_t *in, uint32_t count, uint8_t *out) { | |
uint8_t *restrict keyPtr = out; | |
uint32_t keyLen = (count + 3) >> 2; // 2-bits rounded to full byte | |
uint8_t *restrict dataPtr = keyPtr + keyLen; // variable lenght byte data after all keys | |
const __m128i zero = _mm_setzero_si128(); | |
const __m128i hash = _mm_set1_epi32(0x7F7F7F7F); | |
const __m128i translate = _mm_set_epi32(0x01000000,0,0,0x0203); | |
const __m128i pack = _mm_set1_epi32(0x10400104); | |
const __m128i hsum = _mm_set1_epi32(0x01010101); | |
for(uint32_t* end = &in[(count & ~15)]; in != end; in += 16){ | |
__m128i in0 = _mm_loadu_si128((__m128i*)&in[0]); | |
__m128i in1 = _mm_loadu_si128((__m128i*)&in[4]); | |
__m128i in2 = _mm_loadu_si128((__m128i*)&in[8]); | |
__m128i in3 = _mm_loadu_si128((__m128i*)&in[12]); | |
__m128i t0 = _mm_cmpeq_epi8(zero, in0); | |
__m128i t1 = _mm_cmpeq_epi8(zero, in1); | |
__m128i t2 = _mm_cmpeq_epi8(zero, in2); | |
__m128i t3 = _mm_cmpeq_epi8(zero, in3); | |
t0 = _mm_packs_epi16(t0, t1); | |
t2 = _mm_packs_epi16(t2, t3); | |
t0 = _mm_srai_epi16(t0, 7); | |
t2 = _mm_srai_epi16(t2, 7); | |
t0 = _mm_packs_epi16(t0, t2); | |
t0 = _mm_subs_epu8(t0, hash); | |
t1 = _mm_shuffle_epi8(translate, t0); | |
t2 = _mm_madd_epi16(t1, pack); | |
t1 = _mm_madd_epi16(t1, hsum); | |
const size_t key0 = (size_t)_mm_extract_epi8(t2, 1); | |
const size_t key1 = (size_t)_mm_extract_epi8(t2, 5); | |
const size_t key2 = (size_t)_mm_extract_epi8(t2, 9); | |
const size_t key3 = (size_t)_mm_extract_epi8(t2, 13); | |
const uint8_t* tbl = (uint8_t*)encodingShuffleTable; | |
const uint8_t* shuf0 = tbl + key0 * 16; | |
const uint8_t* shuf1 = tbl + key1 * 16; | |
const uint8_t* shuf2 = tbl + key2 * 16; | |
const uint8_t* shuf3 = tbl + key3 * 16; | |
const size_t len0 = 4 + (size_t)_mm_extract_epi8(t1, 1); | |
const size_t len1 = 4 + (size_t)_mm_extract_epi8(t1, 5); | |
const size_t len2 = 4 + (size_t)_mm_extract_epi8(t1, 9); | |
const size_t len3 = 4 + (size_t)_mm_extract_epi8(t1, 13); | |
_mm_storeu_si128((__m128i *)dataPtr, _mm_shuffle_epi8(in0, _mm_loadu_si128((__m128i*)shuf0))); | |
_mm_storeu_si128((__m128i *)(dataPtr + len0), _mm_shuffle_epi8(in1, _mm_loadu_si128((__m128i*)shuf1))); | |
_mm_storeu_si128((__m128i *)(dataPtr + len0 + len1), _mm_shuffle_epi8(in2, _mm_loadu_si128((__m128i*)shuf2))); | |
_mm_storeu_si128((__m128i *)(dataPtr + len0 + len1 + len2), _mm_shuffle_epi8(in3, _mm_loadu_si128((__m128i*)shuf3))); | |
dataPtr += len0 + len1 + len2 + len3; | |
keyPtr[0] = (uint8_t)key0; | |
keyPtr[1] = (uint8_t)key1; | |
keyPtr[2] = (uint8_t)key2; | |
keyPtr[3] = (uint8_t)key3; | |
keyPtr += 4; | |
} | |
// todo: remaining dwords... | |
return dataPtr - out; | |
} | |
size_t svb_encode4(uint32_t *in, uint32_t count, uint8_t *out) { | |
uint8_t *restrict keyPtr = out; | |
uint32_t keyLen = (count + 3) >> 2; // 2-bits rounded to full byte | |
uint8_t *restrict dataPtr = keyPtr + keyLen; // variable lenght byte data after all keys | |
const __m128i Ones = _mm_set1_epi32(0x01010101); | |
const __m128i GatherBits = _mm_set1_epi32(0x02040001); | |
const __m128i CodeTable = _mm_set_epi32(0, 0, 0x03030303, 0x02020100); | |
const __m128i GatherBytes = _mm_set_epi32(0, 0, 0x0D090501, 0x0D090501); | |
const __m128i Aggregators = _mm_set_epi32(0, 0, 0x01010101, 0x10400104); | |
for(uint32_t* end = &in[(count & ~3)]; in != end; in += 4){ | |
__m128i in0 = _mm_loadu_si128((__m128i*)in); | |
__m128i m0, m1; | |
m0 = _mm_min_epu8(in0, Ones); // set byte to 1 if it is not zero | |
m0 = _mm_madd_epi16(m0, GatherBits); // gather bits 8,16,24 to bits 8,9,10 | |
m1 = _mm_shuffle_epi8(CodeTable, m0); // translate to a 2-bit encoded symbol | |
m1 = _mm_shuffle_epi8(m1, GatherBytes); // gather bytes holding symbols; 2 copies | |
m1 = _mm_madd_epi16(m1, Aggregators); // sum dword_1, pack dword_0 | |
size_t code = (size_t)_mm_extract_epi8(m1, 1); | |
size_t length = 4 + (size_t)_mm_extract_epi8(m1, 5); | |
__m128i* shuf = (__m128i*)(((uint8_t*)encodingShuffleTable) + code * 16); | |
__m128i out = _mm_shuffle_epi8(in0, _mm_loadu_si128(shuf)); // todo: aligned access | |
_mm_storeu_si128((__m128i *)dataPtr, out); | |
dataPtr += length; | |
*keyPtr = (uint8_t)code; | |
keyPtr++; | |
} | |
// todo: remaining dwords... | |
return dataPtr - out; | |
} | |
#define BUF_SIZE 0x800000

/* Signature shared by the encoders under test. */
typedef size_t (*svb_encode_fn)(uint32_t *in, uint32_t count, uint8_t *out);

/*
 * Time `encode` over 100 runs on the same input and print the smallest
 * RDTSC_START..RDTSC_FINAL difference observed, prefixed by `label`.
 */
static void bench_encoder(const char *label, svb_encode_fn encode,
                          uint32_t *in, uint32_t count, uint8_t *out)
{
    uint64_t min_diff = UINT64_MAX;

    printf("%24s: ", label);
    for (int i = 0; i < 100; i++) {
        uint64_t tm1, tm2;
        RDTSC_START(tm1);
        encode(in, count, out);
        RDTSC_FINAL(tm2);
        const uint64_t elapsed = tm2 - tm1;
        if (elapsed < min_diff) {
            min_diff = elapsed;
        }
    }
    printf("%10" PRIu64 "\n", min_diff);
}

/*
 * Test/benchmark driver: fills a buffer with pseudo-random bytes (about three
 * quarters of them forced to zero), checks that svb_encode4 and svb_encode16
 * produce identical output, then reports minimum timings for memcpy and both
 * encoders. Returns EXIT_FAILURE if the two encoders disagree.
 */
int main(void)
{
    // uint32_t / uint8_t storage so the buffers have the element types (and
    // alignment) the encoders' parameters declare.
    static uint32_t buf[BUF_SIZE / 4];
    static uint8_t out[BUF_SIZE * 2];
    static uint8_t out2[BUF_SIZE * 2];
    const uint32_t count = BUF_SIZE >> 2;

    printf("gen...\n");
    srand(0);
    uint8_t *bytes = (uint8_t *)buf;
    for (int i = 0; i < BUF_SIZE; i++) {
        const uint16_t c = (uint16_t)rand();
        bytes[i] = (c & 0x0300) ? 0 : (uint8_t)c; // add lots of zeros...
    }

    memset(out, 0, BUF_SIZE * 2);
    memset(out2, 0, BUF_SIZE * 2);
    const size_t r1 = svb_encode4(buf, count, out);
    const size_t r2 = svb_encode16(buf, count, out2);

    // %zX: r1/r2 are size_t
    printf("%08zX %08zX -- ", r1, r2);
    const int ok = (r1 == r2) && (0 == memcmp(out, out2, BUF_SIZE * 2));
    if (ok) {
        printf("encode: ok\n");
    } else {
        printf("encode: FAIL!\n");
    }

    {
        // Baseline: plain copy of the input buffer.
        printf("%24s: ", "memcpy");
        uint64_t tm1, tm2;
        uint64_t min_diff = UINT64_MAX;
        for (int i = 0; i < 100; i++) {
            RDTSC_START(tm1);
            memcpy(out, buf, BUF_SIZE);
            RDTSC_FINAL(tm2);
            const uint64_t elapsed = tm2 - tm1;
            if (elapsed < min_diff) {
                min_diff = elapsed;
            }
        }
        printf("%10" PRIu64 "\n", min_diff);
    }

    // Each encoder is measured twice, interleaved.
    bench_encoder("encode16", svb_encode16, buf, count, out);
    bench_encoder("encode4", svb_encode4, buf, count, out);
    bench_encoder("encode16", svb_encode16, buf, count, out);
    bench_encoder("encode4", svb_encode4, buf, count, out);

    return ok ? EXIT_SUCCESS : EXIT_FAILURE;
}
/*
 * Hosting-page footer: "Sign up for free to join this conversation on GitHub.
 * Already have an account? Sign in to comment."
 */