Skip to content

Instantly share code, notes, and snippets.

@rygorous
Last active July 22, 2023 03:41
Show Gist options
  • Save rygorous/c95069ed169993e5c26f to your computer and use it in GitHub Desktop.
U32->F32 using SSE2 intrinsics.
// ---- Straightforward:
// Convert 4 unsigned 32-bit ints to floats, SSE2 only.
// _mm_cvtepi32_ps is a *signed* conversion, so values >= 2^31 would come out
// negative; instead, split each value into 16-bit halves, convert each half
// exactly, and recombine. Only the final add rounds, so the result is the
// correctly rounded float for the full 32-bit value.
static inline __m128 u32_to_f32_basic(__m128i in)
{
    __m128i lo_int = _mm_and_si128(_mm_set1_epi32(0xffff), in); // low 16 bits of all vals
    __m128i hi_int = _mm_srli_epi32(in, 16); // high 16 bits of all vals
    __m128 lo_flt = _mm_cvtepi32_ps(lo_int); // exact (all 16 bit ints = machine numbers)
    __m128 hi_flt = _mm_cvtepi32_ps(hi_int); // exact
    __m128 hi_scl = _mm_mul_ps(hi_flt, _mm_set1_ps(65536.0f)); // exact (just exponent change)
    return _mm_add_ps(hi_scl, lo_flt); // this is the only step that rounds.
}
// same approach also works with FMA where available.
// Alternative spin on same idea: do conversion using IEEE magic values.
// Replaces hi_flt calculation with an "or", hi_scl calculation with a "sub".
// Rest is the same. Not sure which is faster.
// ---- Magic constant version 1:
static inline __m128 u32_to_f32_magic1(__m128i in)
{
    const float magic_scaled16 = 549755813888.0f; // 2**(16 + 23)
    __m128i lo_int = _mm_and_si128(_mm_set1_epi32(0xffff), in); // low 16 bits
    __m128i hi_int = _mm_srli_epi32(in, 16); // high 16 bits
    __m128 lo_flt = _mm_cvtepi32_ps(lo_int); // exact (16-bit ints = machine numbers)
    // OR the high half into the mantissa of 2**39: the result's value is
    // exactly 2**39 + hi*2**16 (the 16 payload bits land below the implicit 1).
    __m128 hi_flt = _mm_or_ps(_mm_castsi128_ps(hi_int), _mm_set1_ps(magic_scaled16));
    __m128 hi_scl = _mm_sub_ps(hi_flt, _mm_set1_ps(magic_scaled16)); // exact: yields hi*2**16
    return _mm_add_ps(hi_scl, lo_flt); // the only rounding step
}
// ---- And if you want to get really tricky, you can convert *both* ints to floats
// via magic constants. This is cheaper than one would expect because the two bias
// subtractions can be folded into one and still be exact for this problem:
static inline __m128 u32_to_f32_magic2(__m128i in)
{
    const float magic_scaled16 = 549755813888.0f; // 2**(16 + 23)
    const float magic_unscaled = 8388608.0f; // 2**23
    __m128i lo_int = _mm_and_si128(_mm_set1_epi32(0xffff), in); // low 16 bits
    __m128i hi_int = _mm_srli_epi32(in, 16); // high 16 bits
    // lo_flt's value is exactly 2**23 + lo; hi_flt's is exactly 2**39 + hi*2**16.
    __m128 lo_flt = _mm_or_ps(_mm_castsi128_ps(lo_int), _mm_set1_ps(magic_unscaled));
    __m128 hi_flt = _mm_or_ps(_mm_castsi128_ps(hi_int), _mm_set1_ps(magic_scaled16));
    // Folded bias: hi_flt - (2**39 + 2**23) = hi*2**16 - 2**23, which is a
    // multiple of 2**16 with < 24 significant bits, hence exact.
    __m128 hi_scl = _mm_sub_ps(hi_flt, _mm_set1_ps(magic_scaled16 + magic_unscaled));
    // The stray -2**23 cancels against lo_flt's +2**23; only this add rounds.
    return _mm_add_ps(hi_scl, lo_flt);
}
// Needs 4 constants, though.
// ---- One option that works everywhere and reduces the number of constant loads
// to two is using shuffles:
static inline __m128 u32_to_f32_shuf(__m128i in)
{
    const float magic_scaled16 = 549755813888.0f; // 2**(16 + 23)
    const float magic_unscaled = 8388608.0f; // 2**23
    // Top 16 bits of both magic floats packed into one dword:
    // low word = high half of 2**23 (0x4B00, note "<< 7" == "<< 23 >> 16"),
    // high word = high half of 2**39 (0x5300).
    // (Fixed: original had an unbalanced extra ')' here.)
    const uint32_t pack_magic = (uint32_t)(((127 + (16 + 23)) << 23) | ((127 + ( 0 + 23)) << 7));
    __m128i magic = _mm_set1_epi32((int)pack_magic);
    // Interleaving 16-bit words puts each input half directly below the proper
    // magic-exponent word, building the biased floats without any AND/OR.
    __m128 ileave_hi = _mm_castsi128_ps(_mm_unpackhi_epi16(in, magic));
    __m128 ileave_lo = _mm_castsi128_ps(_mm_unpacklo_epi16(in, magic));
    __m128 hi_flt = _mm_shuffle_ps(ileave_lo, ileave_hi, 0xdd); // odd dwords: 2**39 + hi*2**16
    __m128 lo_flt = _mm_shuffle_ps(ileave_lo, ileave_hi, 0x88); // even dwords: 2**23 + lo
    __m128 hi_scl = _mm_sub_ps(hi_flt, _mm_set1_ps(magic_scaled16 + magic_unscaled)); // exact
    return _mm_add_ps(hi_scl, lo_flt); // the only rounding step
}
// Makes it very shuffle-heavy though. Not a win in a tight loop that batch-converts,
// but when you have a bunch of computation following it might be the right choice.
// ---- With SSE 4.1, you can reduce the constants to three without any penalties,
// by reducing the AND/OR for "lo_flt" to a single PBLENDW:
// (target attribute lets this compile without building the whole TU with -msse4.1)
__attribute__((target("sse4.1")))
static inline __m128 u32_to_f32_blend(__m128i in)
{
    const float magic_scaled16 = 549755813888.0f; // 2**(16 + 23)
    const float magic_unscaled = 8388608.0f; // 2**23
    // PBLENDW mask 0xaa replaces the high word of every dword with 0x4B00
    // (the top half of 2**23), so lo_flt's value is exactly 2**23 + lo.
    __m128 lo_flt = _mm_castsi128_ps(_mm_blend_epi16(in, _mm_set1_epi32((127 + 23) << 23), 0xaa));
    __m128i hi_int = _mm_srli_epi32(in, 16);
    // Fixed: original passed the scalar magic_scaled16 where a __m128 is
    // required; it must be broadcast with _mm_set1_ps.
    __m128 hi_flt = _mm_or_ps(_mm_castsi128_ps(hi_int), _mm_set1_ps(magic_scaled16));
    __m128 hi_scl = _mm_sub_ps(hi_flt, _mm_set1_ps(magic_scaled16 + magic_unscaled)); // exact
    return _mm_add_ps(hi_scl, lo_flt); // the only rounding step
}
// ---- You can also reduce the constants to two by using the same trick again for
// the high half, but this time it doesn't reduce the op count and is thus not as
// attractive (it should also be slightly slower in isolation).
__attribute__((target("ssse3,sse4.1")))
static inline __m128 u32_to_f32_blend2(__m128i in)
{
    const float magic_scaled16 = 549755813888.0f; // 2**(16 + 23)
    const float magic_unscaled = 8388608.0f; // 2**23
    // Low word = high half of 2**39 (0x5300, via "<< 7" == "<< 23 >> 16"),
    // high word = high half of 2**23 (0x4B00).
    // (Fixed: original had an unbalanced extra ')' here.)
    __m128i const blend_magic = _mm_set1_epi32(((127 + ( 0 + 23)) << 23) | ((127 + (16 + 23)) << 7));
    __m128i hi_tmp = _mm_blend_epi16(in, blend_magic, 0x55); // [0x5300, hi16] per dword
    __m128 lo_flt = _mm_castsi128_ps(_mm_blend_epi16(in, blend_magic, 0xaa)); // 2**23 + lo
    // Rotate the whole register by 16 bits: each hi16 drops into the low word
    // with the neighboring lane's 0x5300 above it -> 2**39 + hi*2**16.
    // (All lanes share the same magic word, so the cross-lane wrap is harmless.)
    __m128 hi_flt = _mm_castsi128_ps(_mm_alignr_epi8(hi_tmp, hi_tmp, 2));
    __m128 hi_scl = _mm_sub_ps(hi_flt, _mm_set1_ps(magic_scaled16 + magic_unscaled)); // exact
    return _mm_add_ps(hi_scl, lo_flt); // the only rounding step
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment