Created
October 11, 2025 15:27
-
-
Save bojle/60f9f9c0a7b0678a2f6b51553217ab6a to your computer and use it in GitHub Desktop.
Integer division using Newton-Raphson Iterations with Fixed Point results.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| // Find the blog on fp32.org/newton_raphson_division.html | |
| #include <iostream> | |
| #include <cmath> | |
| #include <cstdint> | |
| #include <type_traits> | |
| #include <limits> | |
| #include <cassert> | |
| #include <numeric> | |
| template <int FRAC_BITS, int TOTAL_BITS> | |
| class FixedPoint { | |
| using BaseType = decltype([] { | |
| if constexpr (TOTAL_BITS <= 8) { | |
| return int8_t{}; | |
| } else if constexpr (TOTAL_BITS <= 16) { | |
| return int16_t{}; | |
| } else if constexpr (TOTAL_BITS <= 32) { | |
| return int32_t{}; | |
| } else { | |
| return int64_t{}; | |
| } | |
| }()); | |
| using WideType = typename std::conditional<(sizeof(BaseType) <= 4), int64_t, __int128_t>::type; | |
| BaseType value; | |
| public: | |
| constexpr static BaseType SCALE = static_cast<BaseType>(1ULL << FRAC_BITS); | |
| FixedPoint() : value(0) {} | |
| FixedPoint(float f) : value(static_cast<BaseType>(std::round(f * SCALE))) {} | |
| BaseType raw() const { return value; } | |
| static FixedPoint fromRaw(BaseType rawVal) { | |
| FixedPoint fp; | |
| fp.value = rawVal; | |
| return fp; | |
| } | |
| float toFloat() const { return static_cast<float>(value) / SCALE; } | |
| FixedPoint operator+(const FixedPoint &other) const { | |
| return fromRaw(value + other.value); | |
| } | |
| FixedPoint operator-(const FixedPoint &other) const { | |
| return fromRaw(value - other.value); | |
| } | |
| FixedPoint operator*(const FixedPoint &other) const { | |
| WideType prod = static_cast<WideType>(value) * other.value; | |
| return fromRaw(static_cast<BaseType>(prod >> FRAC_BITS)); | |
| } | |
| FixedPoint operator*(float factor) const { | |
| return FixedPoint(toFloat() * factor); | |
| } | |
| friend std::ostream &operator<<(std::ostream &os, const FixedPoint &fp) { | |
| return os << fp.toFloat(); | |
| } | |
| }; | |
| using Fx8_16 = FixedPoint<8, 16>; | |
| using Fx16_32 = FixedPoint<16, 32>; | |
| Fx8_16 fxdiv_corrected(int n, int d) { | |
| assert(d != 0 && "Divide by zero undefined"); | |
| if (n == 0) return Fx8_16(0.0f); | |
| bool result_is_negative = ((n < 0) != (d < 0)); | |
| uint32_t nv = static_cast<uint32_t>(std::abs(n)); | |
| uint32_t dv = static_cast<uint32_t>(std::abs(d)); | |
| // Normalization/scaling 'd' to fit between 0.5 and 1.0 | |
| int shift = std::countl_zero<uint32_t>(dv); | |
| uint32_t scaled_val_raw = dv << shift; | |
| uint32_t scaled_val_n_raw = nv << shift; | |
| constexpr int INTERPRETATION_SHIFT = 31 - 16; | |
| Fx16_32 d_scaled = Fx16_32::fromRaw(static_cast<int32_t>(scaled_val_raw >> INTERPRETATION_SHIFT)); | |
| Fx16_32 n_scaled = Fx16_32::fromRaw(static_cast<int32_t>(scaled_val_n_raw >> INTERPRETATION_SHIFT)); | |
| Fx16_32 a {2.8235294f}; | |
| Fx16_32 b {1.8823529f}; | |
| // Initial approximate calculation | |
| Fx16_32 initial_approx = a - (b * d_scaled); | |
| Fx16_32 val = initial_approx; | |
| // Newton-Raphson iterations | |
| val = val * (Fx16_32{2.f} - (d_scaled * val)); // E1 | |
| val = val * (Fx16_32{2.f} - (d_scaled * val)); // E2 | |
| val = val * (Fx16_32{2.f} - (d_scaled * val)); // E3 | |
| val = val * (Fx16_32{2.f} - (d_scaled * val)); // E4 | |
| Fx16_32 res_16_32 = n_scaled * val; | |
| if (result_is_negative) { | |
| // Simple negation (assumes FixedPoint has a working negation operator or multiply by -1) | |
| res_16_32 = res_16_32 * -1.0f; | |
| } | |
| // Truncate from Fx16_32 (FRAC_BITS=16) to Fx8_16 (FRAC_BITS=8) | |
| constexpr int TRUNCATION_SHIFT = 16 - 8; | |
| // Shift the raw 32-bit value right by 8 bits to change the binary point position | |
| int32_t raw_final_32 = res_16_32.raw(); | |
| int16_t raw_final_16 = static_cast<int16_t>(raw_final_32 >> TRUNCATION_SHIFT); | |
| Fx8_16 final_res = Fx8_16::fromRaw(raw_final_16); | |
| return final_res; | |
| } | |
| int main() { | |
| std::cout.precision(10); | |
| int n = 3; | |
| int d = 4; | |
| Fx8_16 result = fxdiv_corrected(n, d); | |
| std::cout << "Approximate division of " << n << "/" << d << ": " << result << std::endl; | |
| std::cout << "Real division of " << n << "/" << d << ": " << static_cast<float>(3)/static_cast<float>(4) << std::endl; | |
| return 0; | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment