Created
October 7, 2020 10:53
-
-
Save zhuker/b4bd1fb306c7b04975b712c37c4c4075 to your computer and use it in GitHub Desktop.
float16 to float32
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// based on https://gist.github.com/martin-kallman/5049614 | |
// float32
// Martin Kallman
//
// Fast half-precision to single-precision floating point conversion
//  - Supports signed zero and denormals-as-zero (DAZ)
//  - Does not support infinities or NaN (an all-ones half exponent is
//    converted like an ordinary exponent, yielding a finite float)
//  - Few, partially pipelinable, non-branching instructions,
//  - Core operations ~6 clock cycles on modern x86-64 (original author's estimate)
//
// out: receives the converted single-precision value
// in:  16-bit IEEE 754 half-precision bit pattern
void float32(float *__restrict out, const uint16_t in) {
    // The result is assembled as a 32-bit pattern and read back as a float
    // through a union; storing through a casted uint32_t pointer would
    // violate the strict-aliasing rule.
    union {
        uint32_t bits;
        float value;
    } pun;

    uint32_t t1 = in & 0x7fffu;       // Non-sign bits
    uint32_t t2 = in & 0x8000u;       // Sign bit
    const uint32_t t3 = in & 0x7c00u; // Exponent

    t1 <<= 13u;                       // Align mantissa on MSB
    t2 <<= 16u;                       // Shift sign bit into position

    t1 += 0x38000000u;                // Adjust bias: (127 - 15) << 23

    t1 = (t3 == 0u) ? 0u : t1;        // Denormals-as-zero

    t1 |= t2;                         // Re-insert sign bit

    pun.bits = t1;
    *out = pun.value;
}
// float16
// Martin Kallman
//
// Fast single-precision to half-precision floating point conversion
//  - Supports signed zero, denormals-as-zero (DAZ), flush-to-zero (FTZ),
//    clamp-to-max
//  - Does not support infinities or NaN: both carry an all-ones exponent and
//    are clamped to the largest finite half (0x7bff) with the sign preserved
//  - The mantissa is truncated, not rounded
//  - Few, partially pipelinable, non-branching instructions,
//  - Core operations ~10 clock cycles on modern x86-64 (original author's estimate)
//
// out: receives the 16-bit IEEE 754 half-precision bit pattern
// in:  single-precision value to convert
void float16(uint16_t *__restrict out, const float in) {
    // Read the float's bit pattern through a union; dereferencing a casted
    // uint32_t pointer would violate the strict-aliasing rule.
    union {
        float value;
        uint32_t bits;
    } pun;
    pun.value = in;
    const uint32_t inu = pun.bits;

    uint32_t t1 = inu & 0x7fffffffu;       // Non-sign bits
    uint32_t t2 = inu & 0x80000000u;       // Sign bit
    const uint32_t t3 = inu & 0x7f800000u; // Exponent

    t1 >>= 13u;                            // Align mantissa on MSB
    t2 >>= 16u;                            // Shift sign bit into position

    // Adjust bias: (127 - 15) << 10. This wraps around for tiny inputs, but
    // those are replaced by the flush-to-zero select below.
    t1 -= 0x1c000u;

    // Flush-to-zero: float exponent below 113 (0x38800000) is smaller than
    // the smallest normal half. This also covers zero and float denormals,
    // whose exponent field is 0.
    t1 = (t3 < 0x38800000u) ? 0u : t1;

    // Clamp-to-max: the largest finite half exponent corresponds to float
    // biased exponent 142, i.e. 142 << 23 == 0x47000000. Anything above it
    // does not fit and saturates to 0x7bff (65504).
    t1 = (t3 > 0x47000000u) ? 0x7bffu : t1;

    t1 |= t2;                              // Re-insert sign bit

    *out = (uint16_t) t1;
}
// Absolute value of x for any arithmetic type.
// NOTE: x is expanded more than once, so do not pass an expression with
// side effects (e.g. ABS(i++)).
#define ABS(x) ((x) >= 0 ? (x) : -(x))
// Round-trip smoke test: converts one float to half precision and back,
// prints both values with their absolute difference, and asserts that the
// difference stays below 0.1.
int main() {
    const float original = -42.42f;
    uint16_t half_bits = 0;
    float quantized = 0.0f;

    float16(&half_bits, original);  // float -> half
    float32(&quantized, half_bits); // half -> float

    const float diff = ABS(original - quantized);
    printf("orig %f quantized %f absdiff %f\n", original, quantized, diff);
    assert(diff < 0.1f);
    return 0;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment