Last active
October 20, 2021 22:57
-
-
Save ptomato/68c1fea85a07fe8b95d157a77db91b61 to your computer and use it in GitHub Desktop.
Converting double -> int128 / uint128 with code adapted from LLVM's compiler-rt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <inttypes.h>
#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#include <limits>
// Returns the raw 64-bit object representation of the double `x`.
//
// The bytes are copied with memcpy rather than read back through a union:
// reading the inactive member of a union is undefined behaviour in C++, and
// designated initializers are only an extension before C++20.
static inline uint64_t toRep(double x) {
    static_assert(sizeof(uint64_t) == sizeof(double),
                  "double must be exactly 64 bits wide");
    uint64_t bits;
    memcpy(&bits, &x, sizeof bits);
    return bits;
}
// Bit-layout constants for `double`, derived from std::numeric_limits.
// The masks are built from uint64_t{1} rather than 1UL so the shifts are
// performed in 64 bits even where `unsigned long` is only 32 bits wide.
static_assert(std::numeric_limits<double>::is_iec559,
              "the bit manipulation below assumes an IEEE-754 double");
// Number of explicitly stored significand bits (the leading 1 is implicit).
static constexpr int significandBits = std::numeric_limits<double>::digits - 1;
// Width of the exponent field: 64 bits minus sign bit and stored significand.
static constexpr int exponentBits = std::numeric_limits<uint64_t>::digits - std::numeric_limits<double>::digits;
// Value subtracted from the stored exponent field to get the real exponent.
static constexpr int exponentBias = std::numeric_limits<double>::max_exponent - 1;
// The implicit leading 1 of a normal number, placed just above the significand.
static constexpr uint64_t implicitBit = uint64_t{1} << significandBits;
// Mask selecting the stored significand bits.
static constexpr uint64_t significandMask = implicitBit - uint64_t{1};
// Mask selecting the sign bit (the most significant bit).
static constexpr uint64_t signBit = uint64_t{1} << (significandBits + exponentBits);
// Mask selecting everything except the sign bit.
static constexpr uint64_t absMask = signBit - uint64_t{1};
static inline __int128_t fixdfti(double a) { | |
static constexpr __int128_t INT128_MAX = (~__int128_t{0}) / 2; | |
static constexpr __int128_t INT128_MIN = -INT128_MAX - 1; | |
// Break a into sign, exponent, significand parts. | |
const uint64_t aRep = toRep(a); | |
const uint64_t aAbs = aRep & absMask; | |
const __int128 sign = aRep & signBit ? -1 : 1; | |
const int exponent = (aAbs >> significandBits) - exponentBias; | |
const uint64_t significand = (aAbs & significandMask) | implicitBit; | |
// If exponent is negative, the result is zero. | |
if (exponent < 0) | |
return 0; | |
// If the value is too large for the integer type, saturate. | |
if (static_cast<unsigned>(exponent) >= 128) | |
return sign == 1 ? INT128_MAX : INT128_MIN; | |
// If 0 <= exponent < significandBits, right shift to get the result. | |
// Otherwise, shift left. | |
if (exponent < significandBits) | |
return sign * (significand >> (significandBits - exponent)); | |
else | |
return sign * (__int128_t{significand} << (exponent - significandBits)); | |
} | |
static inline __uint128_t fixunsdfti(double a) { | |
static constexpr __uint128_t UINT128_MAX = ~__uint128_t{0}; | |
// Break a into sign, exponent, significand parts. | |
const uint64_t aRep = toRep(a); | |
const uint64_t aAbs = aRep & absMask; | |
const int sign = aRep & signBit ? -1 : 1; | |
const int exponent = (aAbs >> significandBits) - exponentBias; | |
const uint64_t significand = (aAbs & significandMask) | implicitBit; | |
// If either the value or the exponent is negative, the result is zero. | |
if (sign == -1 || exponent < 0) | |
return 0; | |
// If the value is too large for the integer type, saturate. | |
if (static_cast<unsigned>(exponent) >= 2 * std::numeric_limits<uint64_t>::digits) | |
return UINT128_MAX; | |
// If 0 <= exponent < significandBits, right shift to get the result. | |
// Otherwise, shift left. | |
if (exponent < significandBits) | |
return significand >> (significandBits - exponent); | |
else | |
return __uint128_t{significand} << (exponent - significandBits); | |
} | |
// Prints a 128-bit quantity, given as two 64-bit halves, as a labelled line:
// the description right-aligned in a 10-character column, followed by both
// halves as zero-padded 16-digit hex separated by an underscore.
void print_bits(const char* description, uint64_t high, uint64_t low) {
    static constexpr char kLineFormat[] =
        " %10s: 0x%016" PRIx64 "_%016" PRIx64 "\n";
    printf(kLineFormat, description, high, low);
}
void print_bits(const char* description, __int128_t n) { | |
union U { | |
int64_t i; | |
uint64_t u; | |
}; | |
U high {.i = static_cast<int64_t>(n >> 64)}; | |
U low {.i = static_cast<int64_t>(n & 0xffff'ffff'ffff'ffff)}; | |
print_bits(description, high.u, low.u); | |
} | |
void print_bits(const char* description, __uint128_t n) { | |
print_bits(description, | |
static_cast<uint64_t>(n >> 64), | |
static_cast<uint64_t>(n & 0xffff'ffff'ffff'ffff)); | |
} | |
void do_test(const char* description, double f) { | |
printf("%s\n", description); | |
printf(" as double: %g (bits: 0x%016" PRIx64 ")\n", f, toRep(f)); | |
__int128_t i1 = f; | |
print_bits("as int128", i1); | |
__int128_t i2 = fixdfti(f); | |
print_bits("fixdfti", i2); | |
__uint128_t u1 = f; | |
print_bits("as uint128", u1); | |
__uint128_t u2 = fixunsdfti(f); | |
print_bits("fixunsdfti", u2); | |
printf("\n"); | |
} | |
// Drives do_test over a fixed list of values: zero, fractions, magnitudes
// below, at, and above 2^64, and magnitudes at and beyond 2^128, each in
// both signs.
int main(void) {
    // Exact powers of two used to scale the test values.
    const double two64 = ldexp(1.0, 64);
    const double two96 = ldexp(1.0, 96);
    const double two128 = ldexp(1.0, 128);

    struct TestCase {
        const char* description;
        double value;
    };
    const TestCase cases[] = {
        {"zero", 0.0},
        {"0 < number < 1", 1.0 / M_PI},
        {"-1 < number < 0", -1.0 / M_E},
        {"0 < number < 2^64", 1.0 / M_PI * two64},
        {"0 < -number < 2^64", -1.0 / M_E * two64},
        {"2^64", two64},
        {"-2^64", -two64},
        {"2^64 < fp number < 2^128", M_PI * two96},
        {"2^64 < -fp number < 2^128", -M_E * two96},
        {"2^128", two128},
        {"-2^128", -two128},
        {"number > uint128_max", M_PI * two128},
        {"-number > uint128_max", -M_E * two128},
    };

    for (const TestCase& testCase : cases) {
        do_test(testCase.description, testCase.value);
    }
    return 0;
}
// zero | |
// as double: 0 (bits: 0x0000000000000000) | |
// as int128: 0x0000000000000000_0000000000000000 | |
// fixdfti: 0x0000000000000000_0000000000000000 | |
// as uint128: 0x0000000000000000_0000000000000000 | |
// fixunsdfti: 0x0000000000000000_0000000000000000 | |
// 0 < number < 1 | |
// as double: 0.31831 (bits: 0x3fd45f306dc9c883) | |
// as int128: 0x0000000000000000_0000000000000000 | |
// fixdfti: 0x0000000000000000_0000000000000000 | |
// as uint128: 0x0000000000000000_0000000000000000 | |
// fixunsdfti: 0x0000000000000000_0000000000000000 | |
// -1 < number < 0 | |
// as double: -0.367879 (bits: 0xbfd78b56362cef38) | |
// as int128: 0x0000000000000000_0000000000000000 | |
// fixdfti: 0x0000000000000000_0000000000000000 | |
// as uint128: 0x0000000000000000_0000000000000000 | |
// fixunsdfti: 0x0000000000000000_0000000000000000 | |
// 0 < number < 2^64 | |
// as double: 5.87178e+18 (bits: 0x43d45f306dc9c883) | |
// as int128: 0x0000000000000000_517cc1b727220c00 | |
// fixdfti: 0x0000000000000000_517cc1b727220c00 | |
// as uint128: 0x0000000000000000_517cc1b727220c00 | |
// fixunsdfti: 0x0000000000000000_517cc1b727220c00 | |
// 0 < -number < 2^64 | |
// as double: -6.78618e+18 (bits: 0xc3d78b56362cef38) | |
// as int128: 0xffffffffffffffff_a1d2a7274c432000 | |
// fixdfti: 0xffffffffffffffff_a1d2a7274c432000 | |
// as uint128: 0x0000000000000000_a1d2a7274c432000 | |
// fixunsdfti: 0x0000000000000000_0000000000000000 | |
// 2^64 | |
// as double: 1.84467e+19 (bits: 0x43f0000000000000) | |
// as int128: 0x0000000000000001_0000000000000000 | |
// fixdfti: 0x0000000000000001_0000000000000000 | |
// as uint128: 0x0000000000000001_0000000000000000 | |
// fixunsdfti: 0x0000000000000001_0000000000000000 | |
// -2^64 | |
// as double: -1.84467e+19 (bits: 0xc3f0000000000000) | |
// as int128: 0xffffffffffffffff_0000000000000000 | |
// fixdfti: 0xffffffffffffffff_0000000000000000 | |
// as uint128: 0xffffffffffffffff_8000000000000000 | |
// fixunsdfti: 0x0000000000000000_0000000000000000 | |
// 2^64 < fp number < 2^128 | |
// as double: 2.48903e+29 (bits: 0x460921fb54442d18) | |
// as int128: 0x00000003243f6a88_85a3000000000000 | |
// fixdfti: 0x00000003243f6a88_85a3000000000000 | |
// as uint128: 0x00000003243f6a88_85a3000000000000 | |
// fixunsdfti: 0x00000003243f6a88_85a3000000000000 | |
// 2^64 < -fp number < 2^128 | |
// as double: -2.15364e+29 (bits: 0xc605bf0a8b145769) | |
// as int128: 0xfffffffd481eae9d_7512e00000000000 | |
// fixdfti: 0xfffffffd481eae9d_7512e00000000000 | |
// as uint128: 0xfffffffd481eae9e_8000000000000000 | |
// fixunsdfti: 0x0000000000000000_0000000000000000 | |
// 2^128 | |
// as double: 3.40282e+38 (bits: 0x47f0000000000000) | |
// as int128: 0x0000000000000000_0000000000000000 | |
// fixdfti: 0x0000000000000000_0000000000000000 | |
// as uint128: 0x0000000000000000_0000000000000000 | |
// fixunsdfti: 0xffffffffffffffff_ffffffffffffffff | |
// -2^128 | |
// as double: -3.40282e+38 (bits: 0xc7f0000000000000) | |
// as int128: 0x0000000000000000_0000000000000000 | |
// fixdfti: 0xffffffffffffffff_ffffffffffffffff | |
// as uint128: 0x8000000000000000_8000000000000000 | |
// fixunsdfti: 0x0000000000000000_0000000000000000 | |
// number > uint128_max | |
// as double: 1.06903e+39 (bits: 0x480921fb54442d18) | |
// as int128: 0x0000000000000000_0000000000000000 | |
// fixdfti: 0x0000000000000000_0000000000000000 | |
// as uint128: 0x0000000000000000_0000000000000000 | |
// fixunsdfti: 0xffffffffffffffff_ffffffffffffffff | |
// -number > uint128_max | |
// as double: -9.24983e+38 (bits: 0xc805bf0a8b145769) | |
// as int128: 0x0000000000000000_0000000000000000 | |
// fixdfti: 0xffffffffffffffff_ffffffffffffffff | |
// as uint128: 0x8000000000000000_8000000000000000 | |
// fixunsdfti: 0x0000000000000000_0000000000000000 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment