Skip to content

Instantly share code, notes, and snippets.

@kayru
Created October 18, 2011 23:51
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save kayru/1297108 to your computer and use it in GitHub Desktop.
Save kayru/1297108 to your computer and use it in GitHub Desktop.
SSE2 Half to Float
// Yuriy O'Donnell <yuriyo@gmail.com>
// Released under MIT License (do whatever you want with it)
#define NOMINMAX
#define WIN32_LEAN_AND_MEAN
#include <stdio.h>
#include <string.h>
#include <windows.h>
#include <xnamath.h>
#include <malloc.h>
#include <float.h>
#include <algorithm>
#include <xmmintrin.h>
#include <emmintrin.h>
#define ALIGN16 __declspec(align(16))
typedef unsigned short uint16;
typedef unsigned int uint32;
// XMConvertFloatToHalfStream -- 6.72 cycles per value
// D3DXFloat16To32Array -- 2.22 cycles per value
// both of the above functions handle 0 correctly
// ~4.63 cycles per value
// does not handle 0 correctly
//
// Converts one IEEE 754 half (binary16) to a 32-bit float by widening the
// sign/mantissa/exponent fields and re-biasing the exponent from 15 to 127:
// adding 0x1C000 (== 112 << 10) before the <<13 adds 112 << 23 afterwards.
// Zero, subnormals, infinities and NaNs are NOT remapped (e.g. +0.0h decodes
// as 2^-112 rather than 0.0f), as the note above says.
static inline float half_to_float(uint16 v)
{
	uint32 s = v & 0x8000; // sign (bit 15)
	uint32 m = v & 0x03FF; // 10-bit mantissa
	uint32 e = v & 0x7C00; // 5-bit exponent, still in place
	e += 0x0001C000;       // re-bias: (127 - 15) << 10
	uint32 r = (s << 16) | (m << 13) | (e << 13);
	// Type-pun via memcpy: the original *(float*)&r violates strict aliasing
	// (undefined behavior); memcpy is well-defined and compiles to one move.
	float f;
	memcpy(&f, &r, sizeof f);
	return f;
}
// ~1.68 cycles per value when __fastcall
// ~1.38 cycles per value when __forceinline
// does not handle 0 correctly
__forceinline void half_to_float_sse2_intrin_x8(const uint16* halfs, float* floats) // works on 8 at a time
{
	// Same field-widening scheme as half_to_float(): the sign/mantissa/exponent
	// masking runs on all 8 lanes at once; the widen-to-32-bit, re-bias and
	// shift steps run 4 lanes at a time (low half, then high half).
	// halfs/floats must be 16-byte aligned (movdqa loads/stores).
	const __m128i zero      = _mm_setzero_si128();
	const __m128i sign_mask = _mm_set1_epi16((short)0x8000);
	const __m128i man_mask  = _mm_set1_epi16((short)0x03FF);
	const __m128i exp_mask  = _mm_set1_epi16((short)0x7C00);
	const __m128i exp_bias  = _mm_set1_epi32(0x0001C000); // (127 - 15) << 10

	__m128i h    = _mm_load_si128((const __m128i*)halfs);
	__m128i sign = _mm_and_si128(h, sign_mask);
	__m128i man  = _mm_and_si128(h, man_mask);
	__m128i exp  = _mm_and_si128(h, exp_mask);

	// low four halfs -> floats[0..3]
	__m128i lo = _mm_slli_epi32(_mm_unpacklo_epi16(sign, zero), 16);
	lo = _mm_or_si128(lo, _mm_slli_epi32(_mm_unpacklo_epi16(man, zero), 13));
	lo = _mm_or_si128(lo, _mm_slli_epi32(_mm_add_epi32(exp_bias, _mm_unpacklo_epi16(exp, zero)), 13));
	_mm_store_si128((__m128i*)floats, lo);

	// high four halfs -> floats[4..7]
	__m128i hi = _mm_slli_epi32(_mm_unpackhi_epi16(sign, zero), 16);
	hi = _mm_or_si128(hi, _mm_slli_epi32(_mm_unpackhi_epi16(man, zero), 13));
	hi = _mm_or_si128(hi, _mm_slli_epi32(_mm_add_epi32(exp_bias, _mm_unpackhi_epi16(exp, zero)), 13));
	_mm_store_si128((__m128i*)(floats + 4), hi);
}
// ~1.89 cycles per value when __fastcall
// does not handle 0 correctly
// Same algorithm as half_to_float_sse2_intrin_x8, hand-written as 32-bit MSVC
// inline assembly. With __fastcall the first two pointer arguments arrive in
// registers: ecx = halfs (source), edx = floats (destination); both must be
// 16-byte aligned for the movdqa loads/stores.
void __fastcall half_to_float_sse2_asm_x8(const uint16* halfs, float* floats) // works on 8 at a time
{
#define SPLAT_4(x) {x,x,x,x}
#define SPLAT_8(x) {x,x,x,x,x,x,x,x}
// per-lane field masks and exponent re-bias constant, replicated across the
// register width; 16-byte aligned so pand/paddd can reference them directly
static __declspec(align(16)) uint16 mask_s8[8] = SPLAT_8(0x8000);
static __declspec(align(16)) uint16 mask_m8[8] = SPLAT_8(0x03FF);
static __declspec(align(16)) uint16 mask_e8[8] = SPLAT_8(0x7C00);
static __declspec(align(16)) uint32 bias_e4[4] = SPLAT_4(0x0001C000);
#undef SPLAT_4
#undef SPLAT_8
__asm
{
// register roles: xmm1 = sign bits, xmm2 = mantissa bits, xmm3 = exponent
// bits, xmm0 = exponent bias, xmm7 = zero (for widening via punpck)
// load halfs into sse register
movdqa xmm0, [ecx]
pxor xmm7, xmm7
// get sign, mantissa and exponent bits for all 8 halfs
movdqa xmm1, xmm0
movdqa xmm2, xmm0
movdqa xmm3, xmm0
movdqa xmm0, bias_e4
pand xmm1, mask_s8
pand xmm2, mask_m8
pand xmm3, mask_e8
// first 4 sign
movdqa xmm4, xmm1
punpcklwd xmm4, xmm7
pslld xmm4, 16
// first 4 mantissa
movdqa xmm5, xmm2
punpcklwd xmm5, xmm7
pslld xmm5, 13
// first 4 exponent
movdqa xmm6, xmm3
punpcklwd xmm6, xmm7
paddd xmm6, xmm0
pslld xmm6, 13
// first 4 pack
por xmm6, xmm5
por xmm6, xmm4
movdqa [edx], xmm6
// second 4 sign
movdqa xmm4, xmm1
punpckhwd xmm4, xmm7
pslld xmm4, 16
// second 4 mantissa
movdqa xmm5, xmm2
punpckhwd xmm5, xmm7
pslld xmm5, 13
// second 4 exponent
movdqa xmm6, xmm3
punpckhwd xmm6, xmm7
paddd xmm6, xmm0
pslld xmm6, 13
// second 4 pack
por xmm6, xmm5
por xmm6, xmm4
movdqa [edx+16], xmm6
}
}
// Benchmarks the asm and intrinsic half->float converters over a 32K-value
// stream, taking the minimum cycles-per-value over 1000 attempts (min filters
// out scheduler/cache noise). Also cross-checks that both converters produce
// identical output. Returns 0 on success, 1 on allocation failure.
int main()
{
	size_t count = 32768; // multiple of 8: both converters process 8 values per call
	// prepare data (16-byte aligned -- both converters use movdqa)
	float* input = (float*) _aligned_malloc(count * sizeof(float), 16);
	uint16* packed = (uint16*) _aligned_malloc(count * sizeof(uint16), 16);
	float* unpacked_a = (float*) _aligned_malloc(count * sizeof(float), 16);
	float* unpacked_b = (float*) _aligned_malloc(count * sizeof(float), 16);
	if (!input || !packed || !unpacked_a || !unpacked_b)
	{
		// original code wrote through these unchecked; fail cleanly instead
		fprintf(stderr, "allocation failed\n");
		_aligned_free(input); // _aligned_free(NULL) is a no-op
		_aligned_free(packed);
		_aligned_free(unpacked_a);
		_aligned_free(unpacked_b);
		return 1;
	}
	for( size_t i=0; i<count; ++i )
	{
		input[i] = float(i) / 100.0f; // simple numbers to deal with
	}
	XMConvertFloatToHalfStream(packed, sizeof(packed[0]), input, sizeof(input[0]), count);
	float dt0 = FLT_MAX;
	float dt1 = FLT_MAX;
	for( size_t attempt=0; attempt<1000; ++attempt )
	{
		DWORD64 t0 = __rdtsc();
		for( size_t i=0; i<count; i+=8 )
		{
			half_to_float_sse2_asm_x8(packed+i, unpacked_a+i);
		}
		DWORD64 t1 = __rdtsc();
		for( size_t i=0; i<count; i+=8 )
		{
			half_to_float_sse2_intrin_x8(packed+i, unpacked_b+i);
		}
		DWORD64 t2 = __rdtsc();
		dt0 = std::min(dt0, float(t1-t0)/count);
		dt1 = std::min(dt1, float(t2-t1)/count);
	}
	// sanity check: a benchmark is meaningless if the two paths disagree;
	// bitwise comparison is valid since both produce the same bit patterns
	if (memcmp(unpacked_a, unpacked_b, count * sizeof(float)) != 0)
	{
		fprintf(stderr, "warning: asm and intrinsic results differ\n");
	}
	printf("dt0: %f\n", dt0);
	printf("dt1: %f\n", dt1);
	_aligned_free(input);
	_aligned_free(packed);
	_aligned_free(unpacked_a);
	_aligned_free(unpacked_b);
	return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment