zhuker/half_float.c

## half_float.c
// based on https://gist.github.com/martin-kallman/5049614
// float32
// Martin Kallman
//
// Fast half-precision to single-precision floating point conversion
//  - Supports signed zero and denormals-as-zero (DAZ)
//  - Does not support infinities or NaN
//  - Few, partially pipelinable, non-branching instructions,
//  - Core operations ~6 clock cycles on modern x86-64 (original author's
//    estimate)
//
// Parameters:
//   out - receives the converted single-precision value
//   in  - half-precision bit pattern (1 sign, 5 exponent, 10 mantissa bits)
//
// A half whose exponent field is zero (zero or denormal) produces a zero that
// keeps the sign of the input. An all-ones exponent field is not
// special-cased: it is rebiased like any normal number.
void float32(float *__restrict out, const uint16_t in) {
    // Pun through a union: writing the result via a casted uint32_t pointer
    // would access a float object through an incompatible lvalue type.
    union {
        uint32_t u;
        float f;
    } bits;

    const uint32_t sign = (uint32_t)(in & 0x8000u) << 16;      // Sign bit, moved to bit 31
    const uint32_t exponent = (uint32_t)(in & 0x7c00u);        // Exponent field, used only for the zero test
    uint32_t magnitude = (uint32_t)(in & 0x7fffu) << 13;       // Exponent+mantissa aligned to float layout

    magnitude += 0x38000000u;                                  // Rebias exponent: (127 - 15) << 23
    magnitude = (exponent == 0u) ? 0u : magnitude;             // Denormals-as-zero

    bits.u = magnitude | sign;                                 // Re-insert sign bit
    *out = bits.f;
}

// float16
// Martin Kallman
//
// Fast single-precision to half-precision floating point conversion
//  - Supports signed zero, denormals-as-zero (DAZ), flush-to-zero (FTZ),
//    clamp-to-max
//  - Does not support infinities or NaN
//  - Few, partially pipelinable, non-branching instructions,
//  - Core operations ~10 clock cycles on modern x86-64 (original author's
//    estimate)
//
// Parameters:
//   out - receives the half-precision bit pattern
//   in  - single-precision value to convert
//
// The low 13 mantissa bits are discarded (truncation, no rounding). Inputs
// whose exponent is below the smallest normal half exponent become a zero
// with the input's sign; inputs whose exponent is above the largest normal
// half exponent (this includes the all-ones exponent field) become
// 0x7bff with the input's sign.
void float16(uint16_t *__restrict out, const float in) {
    // Pun through a union: reading `in` via a casted uint32_t pointer would
    // access a float object through an incompatible lvalue type.
    union {
        float f;
        uint32_t u;
    } bits;
    bits.f = in;

    const uint32_t sign = (bits.u & 0x80000000u) >> 16;        // Sign bit, moved to bit 15
    const uint32_t exponent = bits.u & 0x7f800000u;            // Exponent field, left in place for range tests
    uint32_t magnitude = (bits.u & 0x7fffffffu) >> 13;         // Exponent+mantissa aligned to half layout

    // Rebias exponent: (127 - 15) << 10. May wrap for tiny inputs; those are
    // overwritten by the flush-to-zero select below.
    magnitude -= 0x1c000u;

    // Flush-to-zero: biased float exponent < 113 has no normal half
    // representation. This also covers float zeros and denormals.
    magnitude = (exponent < 0x38800000u) ? 0u : magnitude;

    // Clamp-to-max: biased float exponent > 142 overflows the half range.
    // The threshold must live inside the 0x7f800000 mask (142 << 23),
    // otherwise the comparison can never be true.
    magnitude = (exponent > 0x47000000u) ? 0x7bffu : magnitude;

    *out = (uint16_t)(magnitude | sign);                       // Re-insert sign bit
}

// Absolute value of A. The argument is evaluated twice (once in the
// comparison, once in the selected branch), so avoid side effects in A.
#define ABS(A) ((A) >= 0 ? (A) : -(A))

// Round-trip smoke test: converts one value to half precision and back,
// prints both values with their absolute difference, and asserts that the
// difference stays below 0.1.
int main() {
    const float source_value = -42.42f;

    // float -> half
    uint16_t packed = 0;
    float16(&packed, source_value);

    // half -> float
    float restored = 0.0f;
    float32(&restored, packed);

    const float error = ABS(source_value - restored);
    printf("orig %f quantized %f absdiff %f\n", source_value, restored, error);
    assert(error < 0.1f);
}
	// based on https://gist.github.com/martin-kallman/5049614
	// float32
	// Martin Kallman
	//
	// Fast half-precision to single-precision floating point conversion
	// - Supports signed zero and denormals-as-zero (DAZ)
	// - Does not support infinities or NaN
	// - Few, partially pipelinable, non-branching instructions,
	// - Core operations ~6 clock cycles on modern x86-64 (original author's
	//   estimate)
	//
	// Parameters:
	//   out - receives the converted single-precision value
	//   in  - half-precision bit pattern (1 sign, 5 exponent, 10 mantissa bits)
	//
	// A half whose exponent field is zero (zero or denormal) produces a zero
	// that keeps the sign of the input. An all-ones exponent field is not
	// special-cased: it is rebiased like any normal number.
	void float32(float *__restrict out, const uint16_t in) {
		// Pun through a union: writing the result via a casted uint32_t
		// pointer would access a float object through an incompatible
		// lvalue type.
		union {
			uint32_t u;
			float f;
		} bits;

		const uint32_t sign = (uint32_t)(in & 0x8000u) << 16; // Sign bit, moved to bit 31
		const uint32_t exponent = (uint32_t)(in & 0x7c00u); // Exponent field, used only for the zero test
		uint32_t magnitude = (uint32_t)(in & 0x7fffu) << 13; // Exponent+mantissa aligned to float layout

		magnitude += 0x38000000u; // Rebias exponent: (127 - 15) << 23
		magnitude = (exponent == 0u) ? 0u : magnitude; // Denormals-as-zero

		bits.u = magnitude | sign; // Re-insert sign bit
		*out = bits.f;
	}

	// float16
	// Martin Kallman
	//
	// Fast single-precision to half-precision floating point conversion
	// - Supports signed zero, denormals-as-zero (DAZ), flush-to-zero (FTZ),
	//   clamp-to-max
	// - Does not support infinities or NaN
	// - Few, partially pipelinable, non-branching instructions,
	// - Core operations ~10 clock cycles on modern x86-64 (original author's
	//   estimate)
	//
	// Parameters:
	//   out - receives the half-precision bit pattern
	//   in  - single-precision value to convert
	//
	// The low 13 mantissa bits are discarded (truncation, no rounding).
	// Inputs whose exponent is below the smallest normal half exponent become
	// a zero with the input's sign; inputs whose exponent is above the
	// largest normal half exponent (this includes the all-ones exponent
	// field) become 0x7bff with the input's sign.
	void float16(uint16_t *__restrict out, const float in) {
		// Pun through a union: reading `in` via a casted uint32_t pointer
		// would access a float object through an incompatible lvalue type.
		union {
			float f;
			uint32_t u;
		} bits;
		bits.f = in;

		const uint32_t sign = (bits.u & 0x80000000u) >> 16; // Sign bit, moved to bit 15
		const uint32_t exponent = bits.u & 0x7f800000u; // Exponent field, left in place for range tests
		uint32_t magnitude = (bits.u & 0x7fffffffu) >> 13; // Exponent+mantissa aligned to half layout

		// Rebias exponent: (127 - 15) << 10. May wrap for tiny inputs; those
		// are overwritten by the flush-to-zero select below.
		magnitude -= 0x1c000u;

		// Flush-to-zero: biased float exponent < 113 has no normal half
		// representation. This also covers float zeros and denormals.
		magnitude = (exponent < 0x38800000u) ? 0u : magnitude;

		// Clamp-to-max: biased float exponent > 142 overflows the half range.
		// The threshold must live inside the 0x7f800000 mask (142 << 23),
		// otherwise the comparison can never be true.
		magnitude = (exponent > 0x47000000u) ? 0x7bffu : magnitude;

		*out = (uint16_t)(magnitude | sign); // Re-insert sign bit
	}

	// Absolute value of A. The argument is evaluated twice (once in the
	// comparison, once in the selected branch), so avoid side effects in A.
	#define ABS(A) ((A) >= 0 ? (A) : -(A))

	// Round-trip smoke test: converts one value to half precision and back,
	// prints both values with their absolute difference, and asserts that the
	// difference stays below 0.1.
	int main() {
		const float source_value = -42.42f;

		// float -> half
		uint16_t packed = 0;
		float16(&packed, source_value);

		// half -> float
		float restored = 0.0f;
		float32(&restored, packed);

		const float error = ABS(source_value - restored);
		printf("orig %f quantized %f absdiff %f\n", source_value, restored, error);
		assert(error < 0.1f);
	}