Skip to content

Instantly share code, notes, and snippets.

@ptomato
Last active October 20, 2021 22:57
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ptomato/68c1fea85a07fe8b95d157a77db91b61 to your computer and use it in GitHub Desktop.
Save ptomato/68c1fea85a07fe8b95d157a77db91b61 to your computer and use it in GitHub Desktop.
Converting float -> int128 with code adapted from compiler_rt
#include <inttypes.h>
#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <limits>
// Returns the raw 64-bit object representation of the double x.
//
// The bytes are copied with memcpy rather than read back through a union:
// reading a union member other than the one last written is undefined in C++,
// whereas memcpy between two objects of the same size is always well defined.
static inline uint64_t toRep(double x) {
    static_assert(sizeof(double) == sizeof(uint64_t),
                  "toRep requires a 64-bit double");
    uint64_t rep;
    memcpy(&rep, &x, sizeof rep);
    return rep;
}
// Bit-layout constants for double, derived from std::numeric_limits.
// The masks are built from uint64_t{1} instead of 1UL so that the shifts by
// 52 and 63 are performed in a 64-bit type even where unsigned long is only
// 32 bits wide.

// Number of explicitly stored significand bits (digits also counts the
// implicit leading bit).
static constexpr int significandBits = std::numeric_limits<double>::digits - 1;
// Bits left for the exponent once the significand and the sign bit are removed
// from the 64-bit representation.
static constexpr int exponentBits = std::numeric_limits<uint64_t>::digits - std::numeric_limits<double>::digits;
// Offset subtracted from the stored exponent field to get the real exponent.
static constexpr int exponentBias = std::numeric_limits<double>::max_exponent - 1;
// The leading 1 bit of a normal number, which is not stored in the encoding.
static constexpr uint64_t implicitBit = uint64_t{1} << significandBits;
// Selects the stored significand bits.
static constexpr uint64_t significandMask = implicitBit - 1;
// Selects the sign bit (the top bit of the representation).
static constexpr uint64_t signBit = uint64_t{1} << (significandBits + exponentBits);
// Selects everything except the sign bit.
static constexpr uint64_t absMask = signBit - 1;
static inline __int128_t fixdfti(double a) {
static constexpr __int128_t INT128_MAX = (~__int128_t{0}) / 2;
static constexpr __int128_t INT128_MIN = -INT128_MAX - 1;
// Break a into sign, exponent, significand parts.
const uint64_t aRep = toRep(a);
const uint64_t aAbs = aRep & absMask;
const __int128 sign = aRep & signBit ? -1 : 1;
const int exponent = (aAbs >> significandBits) - exponentBias;
const uint64_t significand = (aAbs & significandMask) | implicitBit;
// If exponent is negative, the result is zero.
if (exponent < 0)
return 0;
// If the value is too large for the integer type, saturate.
if (static_cast<unsigned>(exponent) >= 128)
return sign == 1 ? INT128_MAX : INT128_MIN;
// If 0 <= exponent < significandBits, right shift to get the result.
// Otherwise, shift left.
if (exponent < significandBits)
return sign * (significand >> (significandBits - exponent));
else
return sign * (__int128_t{significand} << (exponent - significandBits));
}
static inline __uint128_t fixunsdfti(double a) {
static constexpr __uint128_t UINT128_MAX = ~__uint128_t{0};
// Break a into sign, exponent, significand parts.
const uint64_t aRep = toRep(a);
const uint64_t aAbs = aRep & absMask;
const int sign = aRep & signBit ? -1 : 1;
const int exponent = (aAbs >> significandBits) - exponentBias;
const uint64_t significand = (aAbs & significandMask) | implicitBit;
// If either the value or the exponent is negative, the result is zero.
if (sign == -1 || exponent < 0)
return 0;
// If the value is too large for the integer type, saturate.
if (static_cast<unsigned>(exponent) >= 2 * std::numeric_limits<uint64_t>::digits)
return UINT128_MAX;
// If 0 <= exponent < significandBits, right shift to get the result.
// Otherwise, shift left.
if (exponent < significandBits)
return significand >> (significandBits - exponent);
else
return __uint128_t{significand} << (exponent - significandBits);
}
// Writes one labelled line to stdout showing a 128-bit quantity as two
// zero-padded 16-digit hexadecimal halves, high half first, separated by '_'.
// The label is right-aligned in a 10-character field.
void print_bits(const char* description, uint64_t high, uint64_t low) {
    fprintf(stdout,
            " %10s: 0x%016" PRIx64 "_%016" PRIx64 "\n",
            description,
            high,
            low);
}
// Prints the two's-complement bit pattern of a signed 128-bit integer by
// splitting it into high and low 64-bit halves and forwarding them to the
// (description, high, low) overload.
//
// The halves are obtained through unsigned conversions, which are defined
// modulo 2^N, instead of writing one union member and reading another.
void print_bits(const char* description, __int128_t n) {
    const __uint128_t bits = static_cast<__uint128_t>(n);
    const uint64_t high = static_cast<uint64_t>(bits >> 64);
    const uint64_t low = static_cast<uint64_t>(bits);
    print_bits(description, high, low);
}
// Prints an unsigned 128-bit integer by splitting it into high and low 64-bit
// halves and forwarding them to the (description, high, low) overload.
void print_bits(const char* description, __uint128_t n) {
    const uint64_t upperHalf = static_cast<uint64_t>(n >> 64);
    const uint64_t lowerHalf = static_cast<uint64_t>(n & 0xffff'ffff'ffff'ffff);
    print_bits(description, upperHalf, lowerHalf);
}
// Runs one comparison case: prints the description, the double and its raw
// bits, then the result of converting it to 128-bit integers four ways --
// the compiler's built-in signed conversion, fixdfti, the built-in unsigned
// conversion, and fixunsdfti -- followed by a blank line.
//
// NOTE: the language does not define the built-in conversions when the
// truncated value is out of range for the target type, so the "as int128" /
// "as uint128" lines may differ between compilers for such inputs.
void do_test(const char* description, double f) {
    printf("%s\n", description);
    printf(" as double: %g (bits: 0x%016" PRIx64 ")\n", f, toRep(f));

    // Signed target: built-in conversion first, then the software routine.
    print_bits("as int128", static_cast<__int128_t>(f));
    print_bits("fixdfti", fixdfti(f));

    // Unsigned target: same order.
    print_bits("as uint128", static_cast<__uint128_t>(f));
    print_bits("fixunsdfti", fixunsdfti(f));

    printf("\n");
}
// Feeds do_test a fixed list of doubles covering zero, fractions, values
// below 2^64, values between 2^64 and 2^128, and values of 2^128 and beyond,
// each in a positive and a negative variant.
int main(void) {
    const double exp64 = pow(2, 64);
    const double exp96 = pow(2, 96);
    const double exp128 = pow(2, 128);

    const struct {
        const char* label;
        double value;
    } cases[] = {
        {"zero", 0.0},
        {"0 < number < 1", 1.0 / M_PI},
        {"-1 < number < 0", -1.0 / M_E},
        {"0 < number < 2^64", 1.0 / M_PI * exp64},
        {"0 < -number < 2^64", -1.0 / M_E * exp64},
        {"2^64", exp64},
        {"-2^64", -exp64},
        {"2^64 < fp number < 2^128", M_PI * exp96},
        {"2^64 < -fp number < 2^128", -M_E * exp96},
        {"2^128", exp128},
        {"-2^128", -exp128},
        {"number > uint128_max", M_PI * exp128},
        {"-number > uint128_max", -M_E * exp128},
    };

    // Cases run in table order, so the output order matches the listing.
    for (const auto& testCase : cases)
        do_test(testCase.label, testCase.value);

    return 0;
}
// zero
// as double: 0 (bits: 0x0000000000000000)
// as int128: 0x0000000000000000_0000000000000000
// fixdfti: 0x0000000000000000_0000000000000000
// as uint128: 0x0000000000000000_0000000000000000
// fixunsdfti: 0x0000000000000000_0000000000000000
// 0 < number < 1
// as double: 0.31831 (bits: 0x3fd45f306dc9c883)
// as int128: 0x0000000000000000_0000000000000000
// fixdfti: 0x0000000000000000_0000000000000000
// as uint128: 0x0000000000000000_0000000000000000
// fixunsdfti: 0x0000000000000000_0000000000000000
// -1 < number < 0
// as double: -0.367879 (bits: 0xbfd78b56362cef38)
// as int128: 0x0000000000000000_0000000000000000
// fixdfti: 0x0000000000000000_0000000000000000
// as uint128: 0x0000000000000000_0000000000000000
// fixunsdfti: 0x0000000000000000_0000000000000000
// 0 < number < 2^64
// as double: 5.87178e+18 (bits: 0x43d45f306dc9c883)
// as int128: 0x0000000000000000_517cc1b727220c00
// fixdfti: 0x0000000000000000_517cc1b727220c00
// as uint128: 0x0000000000000000_517cc1b727220c00
// fixunsdfti: 0x0000000000000000_517cc1b727220c00
// 0 < -number < 2^64
// as double: -6.78618e+18 (bits: 0xc3d78b56362cef38)
// as int128: 0xffffffffffffffff_a1d2a7274c432000
// fixdfti: 0xffffffffffffffff_a1d2a7274c432000
// as uint128: 0x0000000000000000_a1d2a7274c432000
// fixunsdfti: 0x0000000000000000_0000000000000000
// 2^64
// as double: 1.84467e+19 (bits: 0x43f0000000000000)
// as int128: 0x0000000000000001_0000000000000000
// fixdfti: 0x0000000000000001_0000000000000000
// as uint128: 0x0000000000000001_0000000000000000
// fixunsdfti: 0x0000000000000001_0000000000000000
// -2^64
// as double: -1.84467e+19 (bits: 0xc3f0000000000000)
// as int128: 0xffffffffffffffff_0000000000000000
// fixdfti: 0xffffffffffffffff_0000000000000000
// as uint128: 0xffffffffffffffff_8000000000000000
// fixunsdfti: 0x0000000000000000_0000000000000000
// 2^64 < fp number < 2^128
// as double: 2.48903e+29 (bits: 0x460921fb54442d18)
// as int128: 0x00000003243f6a88_85a3000000000000
// fixdfti: 0x00000003243f6a88_85a3000000000000
// as uint128: 0x00000003243f6a88_85a3000000000000
// fixunsdfti: 0x00000003243f6a88_85a3000000000000
// 2^64 < -fp number < 2^128
// as double: -2.15364e+29 (bits: 0xc605bf0a8b145769)
// as int128: 0xfffffffd481eae9d_7512e00000000000
// fixdfti: 0xfffffffd481eae9d_7512e00000000000
// as uint128: 0xfffffffd481eae9e_8000000000000000
// fixunsdfti: 0x0000000000000000_0000000000000000
// 2^128
// as double: 3.40282e+38 (bits: 0x47f0000000000000)
// as int128: 0x0000000000000000_0000000000000000
// fixdfti: 0x0000000000000000_0000000000000000
// as uint128: 0x0000000000000000_0000000000000000
// fixunsdfti: 0xffffffffffffffff_ffffffffffffffff
// -2^128
// as double: -3.40282e+38 (bits: 0xc7f0000000000000)
// as int128: 0x0000000000000000_0000000000000000
// fixdfti: 0xffffffffffffffff_ffffffffffffffff
// as uint128: 0x8000000000000000_8000000000000000
// fixunsdfti: 0x0000000000000000_0000000000000000
// number > uint128_max
// as double: 1.06903e+39 (bits: 0x480921fb54442d18)
// as int128: 0x0000000000000000_0000000000000000
// fixdfti: 0x0000000000000000_0000000000000000
// as uint128: 0x0000000000000000_0000000000000000
// fixunsdfti: 0xffffffffffffffff_ffffffffffffffff
// -number > uint128_max
// as double: -9.24983e+38 (bits: 0xc805bf0a8b145769)
// as int128: 0x0000000000000000_0000000000000000
// fixdfti: 0xffffffffffffffff_ffffffffffffffff
// as uint128: 0x8000000000000000_8000000000000000
// fixunsdfti: 0x0000000000000000_0000000000000000
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment