Skip to content

Instantly share code, notes, and snippets.

@BrettRToomey
Last active June 24, 2022 21:39
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save BrettRToomey/dda02ca7c46bd889bb3760c8e9e2f9ac to your computer and use it in GitHub Desktop.
Save BrettRToomey/dda02ca7c46bd889bb3760c8e9e2f9ac to your computer and use it in GitHub Desktop.
// An example showing how to trim an array of integers against a bound using SSE SIMD instructions.
// Credit for this algorithm goes to @pervognsen.
// I just took his pseudocode and turned it into C
//
// NOTE: this code is not "production safe" and makes assumptions about
// the input (like 'count' being divisible by 4).
#include <tmmintrin.h>
#include <emmintrin.h>
#include <popcntintrin.h>
// SLOT(n) expands to the four consecutive byte indices occupied by 32-bit
// lane 'n' of a 128-bit register.
#define SLOT(n) ((n)*4+0), ((n)*4+1), ((n)*4+2), ((n)*4+3)
// ZERO4 expands to four shuffle-control bytes with bit 7 set; 'pshufb' writes
// a zero byte for each of them. Spelled -128 (bit pattern 0x80) so the value
// fits a signed 'char' without an out-of-range conversion.
#define ZERO4 -128, -128, -128, -128
// This table maps a 4-bit 'movmskps' mask (bit i set => 32-bit lane i is
// kept) to a 'pshufb' control that packs the kept lanes, in their original
// order, into the low lanes and zeroes the remaining lanes.
// NOTE(review): nothing in view writes to this table; consider making it
// 'static const' if no other translation unit references it.
char sse_compact_lut[16][16] = {
    /* 0b0000 */ { ZERO4,   ZERO4,   ZERO4,   ZERO4   },
    /* 0b0001 */ { SLOT(0), ZERO4,   ZERO4,   ZERO4   },
    /* 0b0010 */ { SLOT(1), ZERO4,   ZERO4,   ZERO4   },
    /* 0b0011 */ { SLOT(0), SLOT(1), ZERO4,   ZERO4   },
    /* 0b0100 */ { SLOT(2), ZERO4,   ZERO4,   ZERO4   },
    /* 0b0101 */ { SLOT(0), SLOT(2), ZERO4,   ZERO4   },
    /* 0b0110 */ { SLOT(1), SLOT(2), ZERO4,   ZERO4   },
    /* 0b0111 */ { SLOT(0), SLOT(1), SLOT(2), ZERO4   },
    /* 0b1000 */ { SLOT(3), ZERO4,   ZERO4,   ZERO4   },
    /* 0b1001 */ { SLOT(0), SLOT(3), ZERO4,   ZERO4   },
    /* 0b1010 */ { SLOT(1), SLOT(3), ZERO4,   ZERO4   },
    /* 0b1011 */ { SLOT(0), SLOT(1), SLOT(3), ZERO4   },
    /* 0b1100 */ { SLOT(2), SLOT(3), ZERO4,   ZERO4   },
    /* 0b1101 */ { SLOT(0), SLOT(2), SLOT(3), ZERO4   },
    /* 0b1110 */ { SLOT(1), SLOT(2), SLOT(3), ZERO4   },
    /* 0b1111 */ { SLOT(0), SLOT(1), SLOT(2), SLOT(3) },
};
#undef SLOT
#undef ZERO4

// Copies every element of 'src' that is strictly less than 'bound' (signed
// comparison) to 'dst', preserving the original order.
//
//   src    input array of 'count' ints
//   count  number of input elements; any value is accepted (a count that is
//          not a multiple of 4 is finished by a scalar loop, and a count
//          <= 0 does nothing)
//   dst    output array; must have room for 'count' ints, because each
//          vector step stores a full 16 bytes (kept elements followed by
//          zeroed lanes) even when fewer than four elements are kept
//   bound  exclusive upper limit for the elements that are kept
//
// The number of elements written is not reported to the caller.
//
// The target attribute lets this function build even when the translation
// unit is not compiled with -mssse3 -mpopcnt; the CPU it runs on must still
// support SSSE3 and POPCNT.
#if defined(__GNUC__)
__attribute__((target("ssse3,popcnt")))
#endif
void trim_using_lower_bound_sse( int *src, int count, int *dst, int bound )
{
    if (count <= 0) {
        return;
    }

    const __m128i limit = _mm_set1_epi32(bound);
    // Largest multiple of 4 not exceeding 'count': the part handled 4-wide.
    const int vec_count = count - (count % 4);
    int i = 0;

    for (; i < vec_count; i += 4) {
        __m128i nums = _mm_loadu_si128((const __m128i *) (src + i));
        // One mask bit per lane: set where nums[lane] < bound. The explicit
        // cast is required because 'movmskps' takes a float vector.
        int mask = _mm_movemask_ps(_mm_castsi128_ps(_mm_cmplt_epi32(nums, limit)));
        // Unaligned load: the table is a plain char array with no 16-byte
        // alignment guarantee.
        __m128i shuf = _mm_loadu_si128((const __m128i *) sse_compact_lut[mask]);
        // Pack the kept lanes to the front; the rest of the store is zero
        // and is overwritten by the next step's output.
        _mm_storeu_si128((__m128i *) dst, _mm_shuffle_epi8(nums, shuf));
        dst += _mm_popcnt_u32((unsigned) mask);
    }

    // Scalar tail for the final (count % 4) elements, so no read or write
    // ever goes past 'count' elements.
    for (; i < count; i++) {
        if (src[i] < bound) {
            *dst++ = src[i];
        }
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment