Skip to content

Instantly share code, notes, and snippets.

@aqrit

aqrit/svb_test.c Secret

Last active February 20, 2018 21:57
Show Gist options
  • Save aqrit/8fb615a05586d023e07cbd997cfcb6f9 to your computer and use it in GitHub Desktop.
Save aqrit/8fb615a05586d023e07cbd997cfcb6f9 to your computer and use it in GitHub Desktop.
streamvbyte unrolled encoder trial
#include <inttypes.h> // PRIu64
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h> // srand, rand
#include <string.h>
#include <smmintrin.h> // sse41
#include "streamvbyte_shuffle_tables.h"
/*
 * RDTSC_START(cycles): read the timestamp counter and assign the 64-bit
 * result to `cycles`.
 *
 * The asm issues the cpuid/rdtsc pair three times; only the values left in
 * EDX:EAX by the last rdtsc are kept. EDX is copied to cyc_high and EAX to
 * cyc_low, which are then combined as (cyc_high << 32) | cyc_low.
 * rax, rbx, rcx, rdx and memory are declared as clobbered.
 *
 * NOTE(review): cpuid is presumably used as a serializing instruction and the
 * first two rounds as warm-up -- confirm against the Intel benchmarking
 * guidance this pattern appears to be based on.
 */
#define RDTSC_START(cycles) \
do{ \
register unsigned cyc_high, cyc_low; \
__asm volatile("cpuid\n\t" \
"rdtsc\n\t" \
"cpuid\n\t" \
"rdtsc\n\t" \
"cpuid\n\t" \
"rdtsc\n\t" \
"mov %%edx, %0\n\t" \
"mov %%eax, %1\n\t" \
: "=r"(cyc_high), "=r"(cyc_low) \
::"%rax", "%rbx", "%rcx", "%rdx", "memory"); \
(cycles) = ((uint64_t)cyc_high << 32) | cyc_low; \
}while(0)
/*
 * RDTSC_FINAL(cycles): read the timestamp counter at the end of a measured
 * region and assign the 64-bit result to `cycles`.
 *
 * The asm executes rdtscp, copies EDX to cyc_high and EAX to cyc_low, and only
 * then executes cpuid. The two halves are combined as
 * (cyc_high << 32) | cyc_low. rax, rbx, rcx, rdx and memory are declared as
 * clobbered.
 *
 * NOTE(review): the trailing cpuid presumably keeps later instructions from
 * being reordered ahead of the rdtscp read -- confirm against the Intel
 * benchmarking guidance.
 */
#define RDTSC_FINAL(cycles) \
do{ \
register unsigned cyc_high, cyc_low; \
__asm volatile("rdtscp\n\t" \
"mov %%edx, %0\n\t" \
"mov %%eax, %1\n\t" \
"cpuid\n\t" \
: "=r"(cyc_high), "=r"(cyc_low) \
::"%rax", "%rbx", "%rcx", "%rdx", "memory"); \
(cycles) = ((uint64_t)cyc_high << 32) | cyc_low; \
}while(0)
/*
 * Encode `count` 32-bit values from `in` into `out`, 16 values per SIMD
 * iteration.
 *
 * Output layout: (count + 3) / 4 key bytes, followed by the variable-length
 * data bytes. Each value contributes a 2-bit code to its key byte
 * (code = number of data bytes - 1; a zero value still takes one byte), with
 * the first value of each group of four in the two low bits.
 *
 * Values that do not fill a whole group of 16 are encoded by a scalar tail
 * loop, so every input value is represented in the output.
 *
 * Returns the total number of bytes produced (keys + data).
 *
 * Each SIMD store writes a full 16 bytes regardless of how many are kept, so
 * `out` must have room for up to 12 bytes beyond the returned size.
 */
size_t svb_encode16(uint32_t *in, uint32_t count, uint8_t *out) {
    uint8_t *restrict keyPtr = out;
    // 2 bits per value, rounded up to a full byte (written so it cannot wrap)
    const uint32_t keyLen = (count >> 2) + ((count & 3u) != 0);
    uint8_t *restrict dataPtr = keyPtr + keyLen; // variable length byte data after all keys

    const __m128i zero = _mm_setzero_si128();
    const __m128i hash = _mm_set1_epi32(0x7F7F7F7F);
    // lookup: index 0 -> 3, index 1 -> 2, index 15 -> 1, anything else -> 0
    const __m128i translate = _mm_set_epi32(0x01000000, 0, 0, 0x0203);
    // madd multipliers: byte 1 of each dword becomes c0 | c1<<2 | c2<<4 | c3<<6
    const __m128i pack = _mm_set1_epi32(0x10400104);
    // madd multipliers: byte 1 of each dword becomes c0 + c1 + c2 + c3
    const __m128i hsum = _mm_set1_epi32(0x01010101);

    for (uint32_t *end = &in[(count & ~15u)]; in != end; in += 16) {
        __m128i in0 = _mm_loadu_si128((const __m128i *)&in[0]);
        __m128i in1 = _mm_loadu_si128((const __m128i *)&in[4]);
        __m128i in2 = _mm_loadu_si128((const __m128i *)&in[8]);
        __m128i in3 = _mm_loadu_si128((const __m128i *)&in[12]);

        // 0xFF for every input byte that is zero
        __m128i t0 = _mm_cmpeq_epi8(zero, in0);
        __m128i t1 = _mm_cmpeq_epi8(zero, in1);
        __m128i t2 = _mm_cmpeq_epi8(zero, in2);
        __m128i t3 = _mm_cmpeq_epi8(zero, in3);

        // Two saturating narrowing steps reduce the four zero-flags of each
        // dword to a single byte: one byte per input value, 16 in total.
        t0 = _mm_packs_epi16(t0, t1);
        t2 = _mm_packs_epi16(t2, t3);
        t0 = _mm_srai_epi16(t0, 7);
        t2 = _mm_srai_epi16(t2, 7);
        t0 = _mm_packs_epi16(t0, t2);

        // Map that byte to a shuffle index, then to the 2-bit code.
        t0 = _mm_subs_epu8(t0, hash);
        t1 = _mm_shuffle_epi8(translate, t0);

        t2 = _mm_madd_epi16(t1, pack); // byte 1 of each dword: packed key
        t1 = _mm_madd_epi16(t1, hsum); // byte 1 of each dword: sum of codes

        const size_t key0 = (size_t)_mm_extract_epi8(t2, 1);
        const size_t key1 = (size_t)_mm_extract_epi8(t2, 5);
        const size_t key2 = (size_t)_mm_extract_epi8(t2, 9);
        const size_t key3 = (size_t)_mm_extract_epi8(t2, 13);

        // one 16-byte shuffle mask per key value
        const uint8_t *tbl = (const uint8_t *)encodingShuffleTable;
        const uint8_t *shuf0 = tbl + key0 * 16;
        const uint8_t *shuf1 = tbl + key1 * 16;
        const uint8_t *shuf2 = tbl + key2 * 16;
        const uint8_t *shuf3 = tbl + key3 * 16;

        // four values use (code + 1) bytes each => 4 + sum of codes
        const size_t len0 = 4 + (size_t)_mm_extract_epi8(t1, 1);
        const size_t len1 = 4 + (size_t)_mm_extract_epi8(t1, 5);
        const size_t len2 = 4 + (size_t)_mm_extract_epi8(t1, 9);
        const size_t len3 = 4 + (size_t)_mm_extract_epi8(t1, 13);

        // Each store writes 16 bytes; the next store overwrites the unused part.
        _mm_storeu_si128((__m128i *)dataPtr,
                         _mm_shuffle_epi8(in0, _mm_loadu_si128((const __m128i *)shuf0)));
        _mm_storeu_si128((__m128i *)(dataPtr + len0),
                         _mm_shuffle_epi8(in1, _mm_loadu_si128((const __m128i *)shuf1)));
        _mm_storeu_si128((__m128i *)(dataPtr + len0 + len1),
                         _mm_shuffle_epi8(in2, _mm_loadu_si128((const __m128i *)shuf2)));
        _mm_storeu_si128((__m128i *)(dataPtr + len0 + len1 + len2),
                         _mm_shuffle_epi8(in3, _mm_loadu_si128((const __m128i *)shuf3)));
        dataPtr += len0 + len1 + len2 + len3;

        keyPtr[0] = (uint8_t)key0;
        keyPtr[1] = (uint8_t)key1;
        keyPtr[2] = (uint8_t)key2;
        keyPtr[3] = (uint8_t)key3;
        keyPtr += 4;
    }

    // Scalar tail for the (count & 15) values the SIMD loop did not consume.
    // Assumes encodingShuffleTable keeps the low-order bytes of each value in
    // little-endian order, as the tail does -- TODO confirm against the header.
    const uint32_t tail = count & 15u;
    uint32_t key = 0;
    unsigned shift = 0;
    for (uint32_t i = 0; i < tail; i++) {
        const uint32_t value = in[i];
        const uint32_t code = (value > 0xFFu) + (value > 0xFFFFu) + (value > 0xFFFFFFu);
        for (uint32_t b = 0; b <= code; b++) {
            *dataPtr++ = (uint8_t)(value >> (8 * b));
        }
        key |= code << shift;
        shift += 2;
        if (shift == 8) {
            *keyPtr++ = (uint8_t)key;
            key = 0;
            shift = 0;
        }
    }
    if (shift != 0) {
        *keyPtr++ = (uint8_t)key; // partially filled key byte, upper bits zero
    }

    return (size_t)(dataPtr - out);
}
/*
 * Encode `count` 32-bit values from `in` into `out`, 4 values per SIMD
 * iteration.
 *
 * Output layout: (count + 3) / 4 key bytes, followed by the variable-length
 * data bytes. Each value contributes a 2-bit code to its key byte
 * (code = number of data bytes - 1; a zero value still takes one byte), with
 * the first value of each group of four in the two low bits.
 *
 * Up to three trailing values that do not fill a group of four are encoded by
 * a scalar tail loop, so every input value is represented in the output.
 *
 * Returns the total number of bytes produced (keys + data).
 *
 * Each SIMD store writes a full 16 bytes regardless of how many are kept, so
 * `out` must have room for up to 12 bytes beyond the returned size.
 */
size_t svb_encode4(uint32_t *in, uint32_t count, uint8_t *out) {
    uint8_t *restrict keyPtr = out;
    // 2 bits per value, rounded up to a full byte (written so it cannot wrap)
    const uint32_t keyLen = (count >> 2) + ((count & 3u) != 0);
    uint8_t *restrict dataPtr = keyPtr + keyLen; // variable length byte data after all keys

    const __m128i Ones = _mm_set1_epi32(0x01010101);
    const __m128i GatherBits = _mm_set1_epi32(0x02040001);
    const __m128i CodeTable = _mm_set_epi32(0, 0, 0x03030303, 0x02020100);
    const __m128i GatherBytes = _mm_set_epi32(0, 0, 0x0D090501, 0x0D090501);
    const __m128i Aggregators = _mm_set_epi32(0, 0, 0x01010101, 0x10400104);

    for (uint32_t *end = &in[(count & ~3u)]; in != end; in += 4) {
        const __m128i values = _mm_loadu_si128((const __m128i *)in);
        __m128i m0, m1;

        m0 = _mm_min_epu8(values, Ones);        // set byte to 1 if it is not zero
        m0 = _mm_madd_epi16(m0, GatherBits);    // gather bits 8,16,24 to bits 8,9,10
        m1 = _mm_shuffle_epi8(CodeTable, m0);   // translate to a 2-bit encoded symbol
        m1 = _mm_shuffle_epi8(m1, GatherBytes); // gather bytes holding symbols; 2 copies
        m1 = _mm_madd_epi16(m1, Aggregators);   // sum dword_1, pack dword_0

        const size_t code = (size_t)_mm_extract_epi8(m1, 1);
        // four values use (code + 1) bytes each => 4 + sum of codes
        const size_t length = 4 + (size_t)_mm_extract_epi8(m1, 5);

        // one 16-byte shuffle mask per key value
        const __m128i *shuf =
            (const __m128i *)(((const uint8_t *)encodingShuffleTable) + code * 16);
        const __m128i packed = _mm_shuffle_epi8(values, _mm_loadu_si128(shuf)); // todo: aligned access
        _mm_storeu_si128((__m128i *)dataPtr, packed);

        dataPtr += length;
        *keyPtr = (uint8_t)code;
        keyPtr++;
    }

    // Scalar tail for the (count & 3) values the SIMD loop did not consume.
    // Assumes encodingShuffleTable keeps the low-order bytes of each value in
    // little-endian order, as the tail does -- TODO confirm against the header.
    const uint32_t tail = count & 3u;
    if (tail != 0) {
        uint32_t key = 0;
        for (uint32_t i = 0; i < tail; i++) {
            const uint32_t value = in[i];
            const uint32_t code = (value > 0xFFu) + (value > 0xFFFFu) + (value > 0xFFFFFFu);
            for (uint32_t b = 0; b <= code; b++) {
                *dataPtr++ = (uint8_t)(value >> (8 * b));
            }
            key |= code << (2 * i);
        }
        *keyPtr++ = (uint8_t)key; // partially filled key byte, upper bits zero
    }

    return (size_t)(dataPtr - out);
}
#define BUF_SIZE 0x800000

/*
 * Print `label`, execute `call` 100 times between RDTSC_START/RDTSC_FINAL and
 * print the smallest timestamp-counter difference observed.
 */
#define BENCH_MIN_CYCLES(label, call) \
    do { \
        printf("%24s: ", (label)); \
        uint64_t best = UINT64_MAX; \
        for (int rep = 0; rep < 100; rep++) { \
            uint64_t t_start, t_stop; \
            RDTSC_START(t_start); \
            call; \
            RDTSC_FINAL(t_stop); \
            const uint64_t elapsed = t_stop - t_start; \
            if (elapsed < best) best = elapsed; \
        } \
        printf("%10" PRIu64 "\n", best); \
    } while (0)

/*
 * Test driver: fills a buffer with pseudo-random bytes (most of them forced
 * to zero), checks that svb_encode4 and svb_encode16 produce identical output,
 * then prints minimum cycle counts for memcpy and for both encoders.
 */
int main(void)
{
    // Backed by uint32_t so the encoders receive a correctly aligned pointer;
    // the generator fills it byte-wise through `buf`.
    static uint32_t words[BUF_SIZE / 4];
    static uint8_t out[BUF_SIZE * 2];
    static uint8_t out2[BUF_SIZE * 2];
    uint8_t *const buf = (uint8_t *)words;

    printf("gen...\n");
    srand(0);
    for (size_t i = 0; i < BUF_SIZE; i++) {
        const uint16_t c = (uint16_t)rand();
        buf[i] = (uint8_t)c;
        if (c & 0x0300) buf[i] = 0; // add lots of zeros...
    }

    memset(out, 0, sizeof out);
    memset(out2, 0, sizeof out2);

    const size_t r1 = svb_encode4(words, BUF_SIZE >> 2, out);
    const size_t r2 = svb_encode16(words, BUF_SIZE >> 2, out2);

    printf("%08zX %08zX -- ", r1, r2); // size_t needs the z length modifier
    if ((r1 == r2) && (0 == memcmp(out, out2, sizeof out))) {
        printf("encode: ok\n");
    } else {
        printf("encode: FAIL!\n");
    }

    // The encoders are measured twice, alternating.
    BENCH_MIN_CYCLES("memcpy", memcpy(out, buf, BUF_SIZE));
    BENCH_MIN_CYCLES("encode16", svb_encode16(words, BUF_SIZE >> 2, out));
    BENCH_MIN_CYCLES("encode4", svb_encode4(words, BUF_SIZE >> 2, out));
    BENCH_MIN_CYCLES("encode16", svb_encode16(words, BUF_SIZE >> 2, out));
    BENCH_MIN_CYCLES("encode4", svb_encode4(words, BUF_SIZE >> 2, out));

    return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment