Skip to content

Instantly share code, notes, and snippets.

@rossy
Created September 21, 2017 12:46
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save rossy/67be6a23bfd3b2375286e05dadefb42c to your computer and use it in GitHub Desktop.
Save rossy/67be6a23bfd3b2375286e05dadefb42c to your computer and use it in GitHub Desktop.
// gcc -O2 -Wall -std=c99 -mf16c float16.c -o float16
#include <stdbool.h>
#include <inttypes.h>
#include <immintrin.h>
#include <stdio.h>
#include <math.h>
static char *format_float32_buf(float f, char buf[static 35])
{
union {
float f;
uint32_t i;
} u = { .f = f };
char *p = buf;
for (int i = 0; i < 32; i++) {
*p++ = ((u.i >> (31 - i)) & 1) ? '1' : '0';
if (i == 0 || i == 8)
*p++ = ' ';
}
*p++ = '\0';
return buf;
}
#define format_float32(f) format_float32_buf(f, (char[35]) { 0 })
static char *format_float16_buf(uint16_t f, char buf[static 19])
{
char *p = buf;
for (int i = 0; i < 16; i++) {
*p++ = ((f >> (15 - i)) & 1) ? '1' : '0';
if (i == 0 || i == 5)
*p++ = ' ';
}
*p++ = '\0';
return buf;
}
#define format_float16(f) format_float16_buf(f, (char[19]) { 0 })
static uint16_t float32_to_16_hw(float f)
{
return _cvtss_sh(f, 0);
}
static uint16_t float32_to_16_sw(float f)
{
uint16_t sign = signbit(f) ? 0x8000 : 0x0000;
switch (fpclassify(f)) {
case FP_INFINITE:
// Handle +/-Inf as a special case
return 0x7c00 | sign;
case FP_NAN:
// Return a NaN with the non-signalling bit set
return 0x7e00 | sign;
case FP_SUBNORMAL:
case FP_ZERO:
// Zeros need to be special-cased too. Handle subnormals as well, since
// float32 subnormals are so small that they will always round to zero.
return 0x0000 | sign;
}
// Extract the exponent and a normalized mantissa, with value [0.5; 1)
int exp;
float norm_mant = fabsf(frexpf(f, &exp));
// The normalized mantissa will be multiplied by 2^mant_exp. In a normal
// float, this will be 2^11, for 10 bits of mantissa and the implied
// leading '1' bit. In a subnormal float, mant_exp will take a lower value.
int mant_exp = 11;
// Handle subnormals
if (exp < -13) {
mant_exp = exp + 24;
exp = -13;
}
// If the exponent is this high, clamp to +/-Inf to prevent overflow
if (exp > 16)
return 0x7c00 | sign;
// Get the final mantissa. This uses the current float rounding mode.
uint16_t mant = lrintf(ldexpf(norm_mant, mant_exp));
// Note: exp_biased is lower than the final exponent for a float16, since
// we rely on the mantissa to overflow into the exponent to handle
// rounding. For normal floats, the implied leading '1' bit will also
// overflow into the exponent, so the real exponent bias is 14.
uint16_t exp_biased = exp + 13;
return (mant + (exp_biased << 10)) | sign;
}
static bool float16_isnan(uint16_t f)
{
return (f & 0x7c00) == 0x7c00 && (f & 0x03ff) != 0x0000;
}
int main(int argc, char **argv)
{
union {
float f;
uint32_t i;
} u = { .i = 0 };
int errors = 0;
do {
uint16_t hf_hw = float32_to_16_hw(u.f);
uint16_t hf_sw = float32_to_16_sw(u.f);
// Don't try to replicate the exact NaN bit patterns
if (float16_isnan(hf_hw) && float16_isnan(hf_sw))
continue;
if (hf_hw == hf_sw)
continue;
printf("hw: %s -> %s\n", format_float32(u.f), format_float16(hf_hw));
printf("sw: %s -> %s\n", format_float32(u.f), format_float16(hf_sw));
if (++errors == 10)
return 1;
} while (++u.i);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment