Skip to content

Instantly share code, notes, and snippets.

Created June 19, 2021 18:21
Show Gist options
  • Save rygorous/e4991ed243a3c7ffa58ab0d74c266baa to your computer and use it in GitHub Desktop.
Save rygorous/e4991ed243a3c7ffa58ab0d74c266baa to your computer and use it in GitHub Desktop.
Bytewise remapping
#include <immintrin.h>
#include <stdint.h>
// Short aliases for the built-in unsigned integer types.
// The numeric suffix presumably names the intended bit width; the underlying
// built-in types only guarantee minimum widths, and nothing here enforces it.
typedef unsigned char U8;
typedef unsigned short U16;
typedef unsigned int U32;
typedef unsigned long long U64;
// Signed integer able to hold a pointer value (intptr_t); used in this file
// for byte counts and loop indices.
typedef intptr_t SINTa;
// Arguments shared by the remap kernels in this file (remap_ref, remap_avx2).
struct KernelState
{
    U8 *output;      // destination buffer; the kernels write elements [0, count)
    const U8 *input; // source bytes; the kernels read elements [0, count)
    const U8 *lut;   // lookup table indexed by an input byte; remap_avx2 loads
                     // bytes [0, 256) of it up front
    SINTa count;     // number of bytes to remap
};
// Simple reference version
//
// Remaps every input byte through the lookup table:
//     output[i] = lut[input[i]]   for i in [0, count)
//
// s: kernel arguments; output, input, lut and count are read from it.
// Returns true unconditionally.
static bool remap_ref(KernelState *s)
{
    // The output pointer is __restrict-qualified: the loop is written on the
    // promise that the output buffer does not overlap input or lut.
    U8 * __restrict outp = s->output;
    const U8 *inp = s->input;
    const U8 *lut = s->lut;
    SINTa count = s->count;

    for (SINTa i = 0; i < count; i++)
    {
        outp[i] = lut[inp[i]];
    }

    return true;
}
// AVX2 version of the byte remap: output[i] = lut[input[i]] for i in
// [0, count), processed 32 bytes per iteration with in-register 16-entry
// byte shuffles instead of per-byte memory lookups.
//
// The 256-byte LUT is treated as two halves (entries 0..127 and 128..255),
// each made of eight 16-byte rows. The rows are stored delta-coded:
//     remap_tab[k] = row[k] ^ row[k-1]     (row[-1] taken as zero)
// so XORing the shuffle results of rows 0..k telescopes to row[k].
//
// s: kernel arguments; reads lut[0..255] and input[0..count-1], writes
//    output[0..count-1].
// Returns true unconditionally.
static bool remap_avx2(KernelState *s)
{
    // Set up remapping table
    __m128i cur0 = _mm_setzero_si128();
    __m128i cur8 = _mm_setzero_si128();
    __m128i remap_tab[16];

    for (int i = 0; i < 8; i++)
    {
        __m128i b0 = _mm_loadu_si128((const __m128i *) (s->lut + 0 + i*16));
        __m128i b8 = _mm_loadu_si128((const __m128i *) (s->lut + 128 + i*16));

        // Store each row as the XOR difference from the previous row.
        remap_tab[i + 0] = _mm_xor_si128(cur0, b0);
        remap_tab[i + 8] = _mm_xor_si128(cur8, b8);
        cur0 = b0;
        cur8 = b8;
    }

    // Perform the remap
    U8 * __restrict outp = s->output;
    const U8 *inp = s->input;
    const U8 *lut = s->lut;
    SINTa count = s->count;
    SINTa i = 0;

    // Vector loop: only whole 32-byte blocks, so no load or store ever
    // touches memory beyond element count-1.
    for (; count - i >= 32; i += 32)
    {
        __m256i inds0 = _mm256_loadu_si256((const __m256i *) (inp + i));
        // Flip the top bit: bytes 128..255 become 0..127 (indices into the
        // high half), bytes 0..127 get their top bit set.
        __m256i inds8 = _mm256_xor_si256(inds0, _mm256_set1_epi8(-0x80));

        // First pass
        // The byte shuffle yields zero for any index whose top bit is set and
        // otherwise selects by the low 4 bits, so each byte contributes to
        // exactly one of out0 / out8.
        __m256i out0 = _mm256_shuffle_epi8(_mm256_broadcastsi128_si256(remap_tab[0]), inds0);
        __m256i out8 = _mm256_shuffle_epi8(_mm256_broadcastsi128_si256(remap_tab[8]), inds8);

        // Each pass moves the indices down by one 16-entry row. The subtract
        // is signed-saturating, so an index that has gone negative stays
        // negative and keeps contributing zero in every later pass.
#define PASS(n) \
        inds0 = _mm256_subs_epi8(inds0, _mm256_set1_epi8(0x10)); \
        inds8 = _mm256_subs_epi8(inds8, _mm256_set1_epi8(0x10)); \
        out0 = _mm256_xor_si256(out0, _mm256_shuffle_epi8(_mm256_broadcastsi128_si256(remap_tab[n+0]), inds0)); \
        out8 = _mm256_xor_si256(out8, _mm256_shuffle_epi8(_mm256_broadcastsi128_si256(remap_tab[n+8]), inds8))

        // Remaining passes (rows 1..7 of each half)
        PASS(1);
        PASS(2);
        PASS(3);
        PASS(4);
        PASS(5);
        PASS(6);
        PASS(7);

#undef PASS

        // At most one of out0 / out8 is non-zero per byte; OR merges them.
        __m256i result = _mm256_or_si256(out0, out8);
        _mm256_storeu_si256((__m256i *) (outp + i), result);
    }

    // Tail: remaining 0..31 bytes are remapped with the plain table lookup.
    for (; i < count; i++)
    {
        outp[i] = lut[inp[i]];
    }

    return true;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment