rossy/float16.c

## float16.c
// gcc -O2 -Wall -std=c99 -mf16c float16.c -o float16

#include <stdbool.h>
#include <inttypes.h>
#include <immintrin.h>
#include <stdio.h>
#include <math.h>

static char *format_float32_buf(float f, char buf[static 35])
{
	union {
		float f;
		uint32_t i;
	} u = { .f = f };

	char *p = buf;
	for (int i = 0; i < 32; i++) {
		*p++ = ((u.i >> (31 - i)) & 1) ? '1' : '0';
		if (i == 0 || i == 8)
			*p++ = ' ';
	}
	*p++ = '\0';
	return buf;
}
#define format_float32(f) format_float32_buf(f, (char[35]) { 0 })

static char *format_float16_buf(uint16_t f, char buf[static 19])
{
	char *p = buf;
	for (int i = 0; i < 16; i++) {
		*p++ = ((f >> (15 - i)) & 1) ? '1' : '0';
		if (i == 0 || i == 5)
			*p++ = ' ';
	}
	*p++ = '\0';
	return buf;
}
#define format_float16(f) format_float16_buf(f, (char[19]) { 0 })

static uint16_t float32_to_16_hw(float f)
{
	return _cvtss_sh(f, 0);
}

static uint16_t float32_to_16_sw(float f)
{
	uint16_t sign = signbit(f) ? 0x8000 : 0x0000;

	switch (fpclassify(f)) {
	case FP_INFINITE:
		// Handle +/-Inf as a special case
		return 0x7c00 | sign;
	case FP_NAN:
		// Return a NaN with the non-signalling bit set
		return 0x7e00 | sign;
	case FP_SUBNORMAL:
	case FP_ZERO:
		// Zeros need to be special-cased too. Handle subnormals as well, since
		// float32 subnormals are so small that they will always round to zero.
		return 0x0000 | sign;
	}

	// Extract the exponent and a normalized mantissa, with value [0.5; 1)
	int exp;
	float norm_mant = fabsf(frexpf(f, &exp));

	// The normalized mantissa will be multiplied by 2^mant_exp. In a normal
	// float, this will be 2^11, for 10 bits of mantissa and the implied
	// leading '1' bit. In a subnormal float, mant_exp will take a lower value.
	int mant_exp = 11;

	// Handle subnormals
	if (exp < -13) {
		mant_exp = exp + 24;
		exp = -13;
	}

	// If the exponent is this high, clamp to +/-Inf to prevent overflow
	if (exp > 16)
		return 0x7c00 | sign;

	// Get the final mantissa. This uses the current float rounding mode.
	uint16_t mant = lrintf(ldexpf(norm_mant, mant_exp));

	// Note: exp_biased is lower than the final exponent for a float16, since
	// we rely on the mantissa to overflow into the exponent to handle
	// rounding. For normal floats, the implied leading '1' bit will also
	// overflow into the exponent, so the real exponent bias is 14.
	uint16_t exp_biased = exp + 13;
	return (mant + (exp_biased << 10)) | sign;
}

static bool float16_isnan(uint16_t f)
{
	return (f & 0x7c00) == 0x7c00 && (f & 0x03ff) != 0x0000;
}

int main(int argc, char **argv)
{
	union {
		float f;
		uint32_t i;
	} u = { .i = 0 };

	int errors = 0;

	do {
		uint16_t hf_hw = float32_to_16_hw(u.f);
		uint16_t hf_sw = float32_to_16_sw(u.f);

		// Don't try to replicate the exact NaN bit patterns
		if (float16_isnan(hf_hw) && float16_isnan(hf_sw))
			continue;
		if (hf_hw == hf_sw)
			continue;

		printf("hw: %s -> %s\n", format_float32(u.f), format_float16(hf_hw));
		printf("sw: %s -> %s\n", format_float32(u.f), format_float16(hf_sw));
		if (++errors == 10)
			return 1;
	} while (++u.i);
}
	// gcc -O2 -Wall -std=c99 -mf16c float16.c -o float16

	#include <stdbool.h>
	#include <inttypes.h>
	#include <immintrin.h>
	#include <stdio.h>
	#include <math.h>

	static char *format_float32_buf(float f, char buf[static 35])
	{
	union {
	float f;
	uint32_t i;
	} u = { .f = f };

	char *p = buf;
	for (int i = 0; i < 32; i++) {
	*p++ = ((u.i >> (31 - i)) & 1) ? '1' : '0';
	if (i == 0 \|\| i == 8)
	*p++ = ' ';
	}
	*p++ = '\0';
	return buf;
	}
	#define format_float32(f) format_float32_buf(f, (char[35]) { 0 })

	static char *format_float16_buf(uint16_t f, char buf[static 19])
	{
	char *p = buf;
	for (int i = 0; i < 16; i++) {
	*p++ = ((f >> (15 - i)) & 1) ? '1' : '0';
	if (i == 0 \|\| i == 5)
	*p++ = ' ';
	}
	*p++ = '\0';
	return buf;
	}
	#define format_float16(f) format_float16_buf(f, (char[19]) { 0 })

	static uint16_t float32_to_16_hw(float f)
	{
	return _cvtss_sh(f, 0);
	}

	static uint16_t float32_to_16_sw(float f)
	{
	uint16_t sign = signbit(f) ? 0x8000 : 0x0000;

	switch (fpclassify(f)) {
	case FP_INFINITE:
	// Handle +/-Inf as a special case
	return 0x7c00 \| sign;
	case FP_NAN:
	// Return a NaN with the non-signalling bit set
	return 0x7e00 \| sign;
	case FP_SUBNORMAL:
	case FP_ZERO:
	// Zeros need to be special-cased too. Handle subnormals as well, since
	// float32 subnormals are so small that they will always round to zero.
	return 0x0000 \| sign;
	}

	// Extract the exponent and a normalized mantissa, with value [0.5; 1)
	int exp;
	float norm_mant = fabsf(frexpf(f, &exp));

	// The normalized mantissa will be multiplied by 2^mant_exp. In a normal
	// float, this will be 2^11, for 10 bits of mantissa and the implied
	// leading '1' bit. In a subnormal float, mant_exp will take a lower value.
	int mant_exp = 11;

	// Handle subnormals
	if (exp < -13) {
	mant_exp = exp + 24;
	exp = -13;
	}

	// If the exponent is this high, clamp to +/-Inf to prevent overflow
	if (exp > 16)
	return 0x7c00 \| sign;

	// Get the final mantissa. This uses the current float rounding mode.
	uint16_t mant = lrintf(ldexpf(norm_mant, mant_exp));

	// Note: exp_biased is lower than the final exponent for a float16, since
	// we rely on the mantissa to overflow into the exponent to handle
	// rounding. For normal floats, the implied leading '1' bit will also
	// overflow into the exponent, so the real exponent bias is 14.
	uint16_t exp_biased = exp + 13;
	return (mant + (exp_biased << 10)) \| sign;
	}

	static bool float16_isnan(uint16_t f)
	{
	return (f & 0x7c00) == 0x7c00 && (f & 0x03ff) != 0x0000;
	}

	int main(int argc, char **argv)
	{
	union {
	float f;
	uint32_t i;
	} u = { .i = 0 };

	int errors = 0;

	do {
	uint16_t hf_hw = float32_to_16_hw(u.f);
	uint16_t hf_sw = float32_to_16_sw(u.f);

	// Don't try to replicate the exact NaN bit patterns
	if (float16_isnan(hf_hw) && float16_isnan(hf_sw))
	continue;
	if (hf_hw == hf_sw)
	continue;

	printf("hw: %s -> %s\n", format_float32(u.f), format_float16(hf_hw));
	printf("sw: %s -> %s\n", format_float32(u.f), format_float16(hf_sw));
	if (++errors == 10)
	return 1;
	} while (++u.i);
	}