Skip to content

Instantly share code, notes, and snippets.

@zhuker
Created October 7, 2020 10:53
Show Gist options
  • Save zhuker/b4bd1fb306c7b04975b712c37c4c4075 to your computer and use it in GitHub Desktop.
Save zhuker/b4bd1fb306c7b04975b712c37c4c4075 to your computer and use it in GitHub Desktop.
float16 to float32
// based on https://gist.github.com/martin-kallman/5049614
// float32
// Martin Kallman
//
// Fast half-precision to single-precision floating point conversion
// - Supports signed zero and denormals-as-zero (DAZ)
// - Does not support infinities or NaN: exponent 0x1f is re-biased like any
//   other exponent, so e.g. 0x7c00 converts to 65536.0f
// - Few, partially pipelinable, non-branching instructions,
// - Core operations ~6 clock cycles on modern x86-64 (original author's estimate)
//
// out: receives the converted single-precision value.
// in:  half-precision (binary16) bit pattern to convert.
void float32(float *__restrict out, const uint16_t in) {
    // The result is assembled as raw bits and read back as a float through a
    // union. Storing through a casted uint32_t pointer would access a float
    // object via an incompatible lvalue type (strict-aliasing violation).
    union {
        uint32_t u;
        float f;
    } bits;

    uint32_t t1 = in & 0x7fffu;       // Non-sign bits
    uint32_t t2 = in & 0x8000u;       // Sign bit
    const uint32_t t3 = in & 0x7c00u; // Exponent

    t1 <<= 13u;                       // Align mantissa on MSB
    t2 <<= 16u;                       // Shift sign bit into position
    t1 += 0x38000000u;                // Adjust bias: (127 - 15) << 23
    t1 = (t3 == 0u) ? 0u : t1;        // Denormals-as-zero
    t1 |= t2;                         // Re-insert sign bit

    bits.u = t1;
    *out = bits.f;
}
// float16
// Martin Kallman
//
// Fast single-precision to half-precision floating point conversion
// - Supports signed zero, denormals-as-zero (DAZ), flush-to-zero (FTZ),
// clamp-to-max
// - Does not support infinities or NaN (their exponent exceeds the half range,
//   so they are clamped to the largest finite half, keeping the sign bit)
// - The mantissa is truncated, not rounded
// - Few, partially pipelinable, non-branching instructions,
// - Core operations ~10 clock cycles on modern x86-64 (original author's estimate)
//
// out: receives the half-precision (binary16) bit pattern.
// in:  single-precision value to convert.
void float16(uint16_t *__restrict out, const float in) {
    // Read the float's bit pattern through a union; dereferencing a casted
    // uint32_t pointer to a float object violates strict aliasing.
    union {
        float f;
        uint32_t u;
    } bits;
    bits.f = in;
    const uint32_t inu = bits.u;

    uint32_t t1 = inu & 0x7fffffffu;       // Non-sign bits
    uint32_t t2 = inu & 0x80000000u;       // Sign bit
    const uint32_t t3 = inu & 0x7f800000u; // Exponent (still in float position)

    t1 >>= 13u;                            // Align mantissa on MSB
    t2 >>= 16u;                            // Shift sign bit into position
    t1 -= 0x1c000u;                        // Adjust bias: (127 - 15) << 10

    // Flush-to-zero: anything below the smallest normal half (2^-14, float
    // exponent field 0x38800000). This also covers t3 == 0 (zero and float
    // denormals), so no separate denormals-as-zero select is needed.
    t1 = (t3 < 0x38800000u) ? 0u : t1;

    // Clamp-to-max: float exponent field above 2^15 (0x8e << 23 == 0x47000000)
    // cannot be represented. t3 is masked to at most 0x7f800000, so the
    // threshold must be expressed in that shifted position.
    t1 = (t3 > 0x47000000u) ? 0x7bffu : t1;

    t1 |= t2;                              // Re-insert sign bit
    *out = (uint16_t) t1;
}
// Absolute value of an arithmetic expression.
// NOTE: the argument is evaluated twice, so do not pass expressions with side effects.
#define ABS(value) (0 <= (value) ? (value) : -(value))
// Round-trip smoke test: converts a sample value float -> half -> float,
// prints both values with their absolute difference, and asserts that the
// difference is below 0.1.
int main() {
    const float input = -42.42f;

    // float -> half
    uint16_t half_bits = 0;
    float16(&half_bits, input);

    // half -> float
    float round_trip = 0.0f;
    float32(&round_trip, half_bits);

    float diff = ABS(input - round_trip);
    printf("orig %f quantized %f absdiff %f\n", input, round_trip, diff);
    assert(diff < 0.1f);
    return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment