Skip to content

Instantly share code, notes, and snippets.

@kayru
Created October 18, 2011 23:51
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save kayru/1297108 to your computer and use it in GitHub Desktop.
Save kayru/1297108 to your computer and use it in GitHub Desktop.
SSE2 Half to Float
// Yuriy O'Donnell <yuriyo@gmail.com>
// Released under MIT License (do whatever you want with it)
#define NOMINMAX
#define WIN32_LEAN_AND_MEAN
#include <stdio.h>
#include <string.h>
#include <windows.h>
#include <xnamath.h>
#include <malloc.h>
#include <float.h>
#include <algorithm>
#include <xmmintrin.h>
#include <emmintrin.h>
#define ALIGN16 __declspec(align(16))
typedef unsigned short uint16;
typedef unsigned int uint32;
// XMConvertFloatToHalfStream -- 6.72 cycles per value
// D3DXFloat16To32Array -- 2.22 cycles per value
// both of the above functions handle 0 correctly
// ~4.63 cycles per value
// does not handle 0 correctly
//
// Converts one IEEE 754 half (binary16) to a 32-bit float by widening the
// sign/mantissa/exponent fields and re-biasing the exponent from 15 to 127:
// adding 0x1C000 (== 112 << 10) before the <<13 adds 112 << 23 afterwards.
// Zero, subnormals, infinities and NaNs are NOT remapped (e.g. +0.0h decodes
// as 2^-112 rather than 0.0f), as the note above says.
static inline float half_to_float(uint16 v)
{
	uint32 s = v & 0x8000; // sign (bit 15)
	uint32 m = v & 0x03FF; // 10-bit mantissa
	uint32 e = v & 0x7C00; // 5-bit exponent, still in place
	e += 0x0001C000;       // re-bias: (127 - 15) << 10
	uint32 r = (s << 16) | (m << 13) | (e << 13);
	// Type-pun via memcpy: the original *(float*)&r violates strict aliasing
	// (undefined behavior); memcpy is well-defined and compiles to one move.
	float f;
	memcpy(&f, &r, sizeof f);
	return f;
}
// ~1.68 cycles per value when __fastcall
// ~1.38 cycles per value when __forceinline
// does not handle 0 correctly
__forceinline void half_to_float_sse2_intrin_x8(const uint16* halfs, float* floats) // works on 8 at a time
{
	// Same field-widening scheme as half_to_float(): the sign/mantissa/exponent
	// masking runs on all 8 lanes at once; the widen-to-32-bit, re-bias and
	// shift steps run 4 lanes at a time (low half, then high half).
	// halfs/floats must be 16-byte aligned (movdqa loads/stores).
	const __m128i zero      = _mm_setzero_si128();
	const __m128i sign_mask = _mm_set1_epi16((short)0x8000);
	const __m128i man_mask  = _mm_set1_epi16((short)0x03FF);
	const __m128i exp_mask  = _mm_set1_epi16((short)0x7C00);
	const __m128i exp_bias  = _mm_set1_epi32(0x0001C000); // (127 - 15) << 10

	__m128i h    = _mm_load_si128((const __m128i*)halfs);
	__m128i sign = _mm_and_si128(h, sign_mask);
	__m128i man  = _mm_and_si128(h, man_mask);
	__m128i exp  = _mm_and_si128(h, exp_mask);

	// low four halfs -> floats[0..3]
	__m128i lo = _mm_slli_epi32(_mm_unpacklo_epi16(sign, zero), 16);
	lo = _mm_or_si128(lo, _mm_slli_epi32(_mm_unpacklo_epi16(man, zero), 13));
	lo = _mm_or_si128(lo, _mm_slli_epi32(_mm_add_epi32(exp_bias, _mm_unpacklo_epi16(exp, zero)), 13));
	_mm_store_si128((__m128i*)floats, lo);

	// high four halfs -> floats[4..7]
	__m128i hi = _mm_slli_epi32(_mm_unpackhi_epi16(sign, zero), 16);
	hi = _mm_or_si128(hi, _mm_slli_epi32(_mm_unpackhi_epi16(man, zero), 13));
	hi = _mm_or_si128(hi, _mm_slli_epi32(_mm_add_epi32(exp_bias, _mm_unpackhi_epi16(exp, zero)), 13));
	_mm_store_si128((__m128i*)(floats + 4), hi);
}
// ~1.89 cycles per value when __fastcall
// does not handle 0 correctly
// Same algorithm as half_to_float_sse2_intrin_x8, hand-written as 32-bit MSVC
// inline assembly. With __fastcall the first two pointer arguments arrive in
// registers: ecx = halfs (source), edx = floats (destination); both must be
// 16-byte aligned for the movdqa loads/stores.
void __fastcall half_to_float_sse2_asm_x8(const uint16* halfs, float* floats) // works on 8 at a time
{
#define SPLAT_4(x) {x,x,x,x}
#define SPLAT_8(x) {x,x,x,x,x,x,x,x}
// per-lane field masks and exponent re-bias constant, replicated across the
// register width; 16-byte aligned so pand/paddd can reference them directly
static __declspec(align(16)) uint16 mask_s8[8] = SPLAT_8(0x8000);
static __declspec(align(16)) uint16 mask_m8[8] = SPLAT_8(0x03FF);
static __declspec(align(16)) uint16 mask_e8[8] = SPLAT_8(0x7C00);
static __declspec(align(16)) uint32 bias_e4[4] = SPLAT_4(0x0001C000);
#undef SPLAT_4
#undef SPLAT_8
__asm
{
// register roles: xmm1 = sign bits, xmm2 = mantissa bits, xmm3 = exponent
// bits, xmm0 = exponent bias, xmm7 = zero (for widening via punpck)
// load halfs into sse register
movdqa xmm0, [ecx]
pxor xmm7, xmm7
// get sign, mantissa and exponent bits for all 8 halfs
movdqa xmm1, xmm0
movdqa xmm2, xmm0
movdqa xmm3, xmm0
movdqa xmm0, bias_e4
pand xmm1, mask_s8
pand xmm2, mask_m8
pand xmm3, mask_e8
// first 4 sign
movdqa xmm4, xmm1
punpcklwd xmm4, xmm7
pslld xmm4, 16
// first 4 mantissa
movdqa xmm5, xmm2
punpcklwd xmm5, xmm7
pslld xmm5, 13
// first 4 exponent
movdqa xmm6, xmm3
punpcklwd xmm6, xmm7
paddd xmm6, xmm0
pslld xmm6, 13
// first 4 pack
por xmm6, xmm5
por xmm6, xmm4
movdqa [edx], xmm6
// second 4 sign
movdqa xmm4, xmm1
punpckhwd xmm4, xmm7
pslld xmm4, 16
// second 4 mantissa
movdqa xmm5, xmm2
punpckhwd xmm5, xmm7
pslld xmm5, 13
// second 4 exponent
movdqa xmm6, xmm3
punpckhwd xmm6, xmm7
paddd xmm6, xmm0
pslld xmm6, 13
// second 4 pack
por xmm6, xmm5
por xmm6, xmm4
movdqa [edx+16], xmm6
}
}
// Benchmarks the asm and intrinsic half->float converters over a 32K-value
// stream, taking the minimum cycles-per-value over 1000 attempts (min filters
// out scheduler/cache noise). Also cross-checks that both converters produce
// identical output. Returns 0 on success, 1 on allocation failure.
int main()
{
	size_t count = 32768; // multiple of 8: both converters process 8 values per call
	// prepare data (16-byte aligned -- both converters use movdqa)
	float* input = (float*) _aligned_malloc(count * sizeof(float), 16);
	uint16* packed = (uint16*) _aligned_malloc(count * sizeof(uint16), 16);
	float* unpacked_a = (float*) _aligned_malloc(count * sizeof(float), 16);
	float* unpacked_b = (float*) _aligned_malloc(count * sizeof(float), 16);
	if (!input || !packed || !unpacked_a || !unpacked_b)
	{
		// original code wrote through these unchecked; fail cleanly instead
		fprintf(stderr, "allocation failed\n");
		_aligned_free(input); // _aligned_free(NULL) is a no-op
		_aligned_free(packed);
		_aligned_free(unpacked_a);
		_aligned_free(unpacked_b);
		return 1;
	}
	for( size_t i=0; i<count; ++i )
	{
		input[i] = float(i) / 100.0f; // simple numbers to deal with
	}
	XMConvertFloatToHalfStream(packed, sizeof(packed[0]), input, sizeof(input[0]), count);
	float dt0 = FLT_MAX;
	float dt1 = FLT_MAX;
	for( size_t attempt=0; attempt<1000; ++attempt )
	{
		DWORD64 t0 = __rdtsc();
		for( size_t i=0; i<count; i+=8 )
		{
			half_to_float_sse2_asm_x8(packed+i, unpacked_a+i);
		}
		DWORD64 t1 = __rdtsc();
		for( size_t i=0; i<count; i+=8 )
		{
			half_to_float_sse2_intrin_x8(packed+i, unpacked_b+i);
		}
		DWORD64 t2 = __rdtsc();
		dt0 = std::min(dt0, float(t1-t0)/count);
		dt1 = std::min(dt1, float(t2-t1)/count);
	}
	// sanity check: a benchmark is meaningless if the two paths disagree;
	// bitwise comparison is valid since both produce the same bit patterns
	if (memcmp(unpacked_a, unpacked_b, count * sizeof(float)) != 0)
	{
		fprintf(stderr, "warning: asm and intrinsic results differ\n");
	}
	printf("dt0: %f\n", dt0);
	printf("dt1: %f\n", dt1);
	_aligned_free(input);
	_aligned_free(packed);
	_aligned_free(unpacked_a);
	_aligned_free(unpacked_b);
	return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment