Skip to content

Instantly share code, notes, and snippets.

@liangfu
Last active January 4, 2022 23:00
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save liangfu/19b848da2757d9b8a596bdb5182b6095 to your computer and use it in GitHub Desktop.
Save liangfu/19b848da2757d9b8a596bdb5182b6095 to your computer and use it in GitHub Desktop.
An implementation of IEEE 16-bit floating point data type
#include <inttypes.h>
#include <stdio.h>
#include <string.h>
// Field masks for a 32-bit single-precision word:
// bit 31 = sign, bits 30..23 = exponent, bits 22..0 = mantissa.
static const uint32_t kSingleSignMask = UINT32_C(1) << 31;
static const uint32_t kSingleExpMask = UINT32_C(0xff) << 23;
static const uint32_t kSingleMantMask = (UINT32_C(1) << 23) - 1;
// Field masks for a 16-bit half-precision word:
// bit 15 = sign, bits 14..10 = exponent, bits 9..0 = mantissa.
static const uint32_t kHalfSignMask = UINT32_C(1) << 15;
static const uint32_t kHalfExpMask = UINT32_C(0x1f) << 10;
static const uint32_t kHalfMantMask = (UINT32_C(1) << 10) - 1;
/*!
 * \brief IEEE 754 16-bit (half-precision) floating point value, held as its
 *        raw bit pattern.
 *
 * Bit layout of `data`: bit 15 is the sign, bits 14..10 the exponent
 * (bias 15) and bits 9..0 the mantissa.
 */
struct half {
  /*! \brief Raw 16-bit pattern of the value. */
  uint16_t data;

  /*! \brief Creates a value whose bit pattern is all zeros (+0.0). */
  half() {
    data = 0;
  }

  /*! \brief Wraps the raw bit pattern `a`; no numeric conversion is done. */
  half(uint16_t a) {
    data = a;
  }

  /*! \brief Widens to double by way of to_float(). */
  operator double() const {
    return to_float();
  }

  /*! \brief Widens to float by way of to_float(). */
  operator float() const {
    return to_float();
  }

  // Referring to https://github.com/ramenhut/half/blob/master/half.h
  /*!
   * \brief Converts the stored half-precision pattern to a 32-bit float.
   *
   * Handles every encoding class: signed zero, subnormal, normal, infinity
   * and NaN (the NaN payload is carried over in the upper mantissa bits).
   * For every input except the two zeros a trace line with the packed result
   * and its fields is written to stdout.
   *
   * \return The float with the same numeric value as this half.
   */
  float to_float() const {
    // Signed zeros are returned directly and produce no trace line.
    if (0 == data) { return 0.0f; }
    if (0x8000 == data) { return -0.0f; }

    const uint32_t sig_bits = static_cast<uint32_t>(data) >> 15;
    uint32_t exp_bits = (static_cast<uint32_t>(data) >> 10) & 0x1f;
    uint32_t man_bits = static_cast<uint32_t>(data) & 0x03ff;

    if (0x1f == exp_bits) {
      // Infinity / NaN: an all-ones exponent must stay all-ones in the wider
      // format; re-biasing it would yield a large finite number instead.
      exp_bits = 0xff;
    } else if (0 == exp_bits) {
      // Subnormal: value is 0.mantissa * 2^-14. man_bits is non-zero here
      // (zeros returned above), so shift until the implicit leading one
      // reaches bit 10, lowering the exponent once per shift.
      exp_bits = 127 - 15 + 1;
      do {
        man_bits <<= 1;
        --exp_bits;
      } while (0 == (man_bits & 0x0400));
      man_bits &= 0x03ff;  // drop the now-implicit leading one
    } else {
      // Normal number: only the exponent bias changes (15 -> 127).
      exp_bits = exp_bits - 15 + 127;
    }

    // 10 mantissa bits become the top 10 of the 23-bit float mantissa.
    man_bits <<= 13;
    const uint32_t a = (sig_bits << 31) | (exp_bits << 23) | man_bits;
    printf("0x%x (0x%x, 0x%x, 0x%x)\n", a, sig_bits, exp_bits, man_bits);

    // Copy the bytes rather than reading `a` through a float*; the pointer
    // cast breaks the strict-aliasing rule.
    float result;
    memcpy(&result, &a, sizeof(result));
    return result;
  }
};
/*!
 * \brief Google's 16-bit "brain" floating point type, held as its raw bit
 *        pattern.
 *
 * `data` is interpreted as the upper 16 bits of a 32-bit float word; the
 * lower 16 bits are taken to be zero when converting.
 */
struct bfloat16 {
  /*! \brief Raw 16-bit pattern (upper half of the equivalent float word). */
  uint16_t data;

  /*! \brief Widens to double by way of to_float(). */
  operator double() const {
    return to_float();
  }

  /*! \brief Widens to float by way of to_float(). */
  operator float() const {
    return to_float();
  }

  /*!
   * \brief Expands the stored pattern to a 32-bit float.
   * \return The float whose upper 16 bits equal `data` and whose lower
   *         16 bits are zero.
   */
  float to_float() const {
    // Widen before shifting: `data << 16` would promote to signed int and
    // push a set top bit into the sign position.
    const uint32_t a = static_cast<uint32_t>(data) << 16;
    // Copy the bytes rather than reading `a` through a float*; the pointer
    // cast breaks the strict-aliasing rule.
    float result;
    memcpy(&result, &a, sizeof(result));
    return result;
  }
};
/*!
 * \brief Assembles a half from separate sign, exponent and mantissa fields.
 *
 * Each field is moved to its position and masked, so bits that do not fit
 * the field are dropped. A trace line with the packed word and the
 * three inputs is written to stdout.
 *
 * \param sig_bits Sign field (lowest bit is used).
 * \param exp_bits Biased exponent field (lowest 5 bits are used).
 * \param man_bits Mantissa field (lowest 10 bits are used).
 * \return A half whose `data` is the packed bit pattern.
 */
half GetHalf(uint32_t sig_bits, uint32_t exp_bits, uint32_t man_bits) {
  const uint32_t sign_field = (sig_bits << 15) & kHalfSignMask;
  const uint32_t exp_field = (exp_bits << 10) & kHalfExpMask;
  const uint32_t man_field = man_bits & kHalfMantMask;
  const uint16_t packed =
      static_cast<uint16_t>(sign_field | exp_field | man_field);

  printf("0x%x (0x%x, 0x%x, 0x%x)\n", packed, sig_bits, exp_bits, man_bits);

  half result;
  result.data = packed;
  return result;
}
/*!
 * \brief Assembles a bfloat16 from separate sign, exponent and mantissa
 *        fields.
 *
 * The fields are packed into a 32-bit single-precision word (mantissa
 * shifted up by 16 so it lands in the bits that survive truncation), and the
 * upper 16 bits of that word become the result. A trace line with the
 * 32-bit word, the sign and exponent inputs and the shifted mantissa is
 * written to stdout.
 *
 * \param sig_bits Sign field (lowest bit is used).
 * \param exp_bits Biased exponent field (lowest 8 bits are used).
 * \param man_bits Mantissa field (lowest 7 bits are used).
 * \return A bfloat16 whose `data` is the upper half of the packed word.
 */
bfloat16 GetBFloat16(uint32_t sig_bits, uint32_t exp_bits, uint32_t man_bits) {
  man_bits <<= 16;
  const uint32_t word = ((sig_bits << 31) & kSingleSignMask) |
                        ((exp_bits << 23) & kSingleExpMask) |
                        (man_bits & kSingleMantMask);

  printf("0x%x (0x%x, 0x%x, 0x%x)\n", word, sig_bits, exp_bits, man_bits);

  bfloat16 result;
  result.data = word >> 16;
  return result;
}
/*!
 * \brief Demo driver: builds two half values and one bfloat16 value, converts
 *        them to float/double and prints the results and raw float bits.
 * \return Always 0.
 */
int main() {
  // Returns the raw bits of a float. The bytes are copied because reading a
  // float through a uint32_t* breaks the strict-aliasing rule.
  auto float_bits = [](float value) {
    uint32_t bits = 0;
    memcpy(&bits, &value, sizeof(bits));
    return bits;
  };

  // Positive half: exponent field 0b10000, mantissa all ones.
  half a = GetHalf(0, 0b10000, 0x03ff);
  double d = a;
  float f = a;
  uint32_t u = float_bits(f);
  printf("%g\n", d);
  printf("%g\n", f);
  printf("0x%x\n", u);
  printf("0x%x\n", u >> 23);  // sign + exponent bits of the float
  printf("--\n");
  fflush(stdout);

  // Same fields with the sign bit set.
  a = GetHalf(1, 0b10000, 0x03ff);
  d = a;
  f = a;
  u = float_bits(f);
  printf("%g\n", d);
  printf("%g\n", f);
  printf("0x%x\n", u);
  printf("0x%x\n", u >> 23);
  printf("--\n");
  fflush(stdout);

  // bfloat16 round trip: keep only the upper 16 bits of a float, then widen
  // back and compare against the source value.
  float g = -0.165894;
  bfloat16 b;
  b.data = static_cast<uint16_t>(float_bits(g) >> 16);
  f = b;
  u = float_bits(f);
  printf("%g\n", g);
  printf("%g\n", f);
  printf("0x%x\n", u);
  printf("--\n");
  fflush(stdout);
  return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment