Skip to content

Instantly share code, notes, and snippets.

@bojle
Created October 11, 2025 15:27
Show Gist options
  • Select an option

  • Save bojle/60f9f9c0a7b0678a2f6b51553217ab6a to your computer and use it in GitHub Desktop.

Select an option

Save bojle/60f9f9c0a7b0678a2f6b51553217ab6a to your computer and use it in GitHub Desktop.
Integer division using Newton-Raphson Iterations with Fixed Point results.
// Find the blog on fp32.org/newton_raphson_division.html
#include <iostream>
#include <cmath>
#include <cstdint>
#include <type_traits>
#include <limits>
#include <cassert>
#include <numeric>
template <int FRAC_BITS, int TOTAL_BITS>
class FixedPoint {
using BaseType = decltype([] {
if constexpr (TOTAL_BITS <= 8) {
return int8_t{};
} else if constexpr (TOTAL_BITS <= 16) {
return int16_t{};
} else if constexpr (TOTAL_BITS <= 32) {
return int32_t{};
} else {
return int64_t{};
}
}());
using WideType = typename std::conditional<(sizeof(BaseType) <= 4), int64_t, __int128_t>::type;
BaseType value;
public:
constexpr static BaseType SCALE = static_cast<BaseType>(1ULL << FRAC_BITS);
FixedPoint() : value(0) {}
FixedPoint(float f) : value(static_cast<BaseType>(std::round(f * SCALE))) {}
BaseType raw() const { return value; }
static FixedPoint fromRaw(BaseType rawVal) {
FixedPoint fp;
fp.value = rawVal;
return fp;
}
float toFloat() const { return static_cast<float>(value) / SCALE; }
FixedPoint operator+(const FixedPoint &other) const {
return fromRaw(value + other.value);
}
FixedPoint operator-(const FixedPoint &other) const {
return fromRaw(value - other.value);
}
FixedPoint operator*(const FixedPoint &other) const {
WideType prod = static_cast<WideType>(value) * other.value;
return fromRaw(static_cast<BaseType>(prod >> FRAC_BITS));
}
FixedPoint operator*(float factor) const {
return FixedPoint(toFloat() * factor);
}
friend std::ostream &operator<<(std::ostream &os, const FixedPoint &fp) {
return os << fp.toFloat();
}
};
using Fx8_16 = FixedPoint<8, 16>;
using Fx16_32 = FixedPoint<16, 32>;
Fx8_16 fxdiv_corrected(int n, int d) {
assert(d != 0 && "Divide by zero undefined");
if (n == 0) return Fx8_16(0.0f);
bool result_is_negative = ((n < 0) != (d < 0));
uint32_t nv = static_cast<uint32_t>(std::abs(n));
uint32_t dv = static_cast<uint32_t>(std::abs(d));
// Normalization/scaling 'd' to fit between 0.5 and 1.0
int shift = std::countl_zero<uint32_t>(dv);
uint32_t scaled_val_raw = dv << shift;
uint32_t scaled_val_n_raw = nv << shift;
constexpr int INTERPRETATION_SHIFT = 31 - 16;
Fx16_32 d_scaled = Fx16_32::fromRaw(static_cast<int32_t>(scaled_val_raw >> INTERPRETATION_SHIFT));
Fx16_32 n_scaled = Fx16_32::fromRaw(static_cast<int32_t>(scaled_val_n_raw >> INTERPRETATION_SHIFT));
Fx16_32 a {2.8235294f};
Fx16_32 b {1.8823529f};
// Initial approximate calculation
Fx16_32 initial_approx = a - (b * d_scaled);
Fx16_32 val = initial_approx;
// Newton-Raphson iterations
val = val * (Fx16_32{2.f} - (d_scaled * val)); // E1
val = val * (Fx16_32{2.f} - (d_scaled * val)); // E2
val = val * (Fx16_32{2.f} - (d_scaled * val)); // E3
val = val * (Fx16_32{2.f} - (d_scaled * val)); // E4
Fx16_32 res_16_32 = n_scaled * val;
if (result_is_negative) {
// Simple negation (assumes FixedPoint has a working negation operator or multiply by -1)
res_16_32 = res_16_32 * -1.0f;
}
// Truncate from Fx16_32 (FRAC_BITS=16) to Fx8_16 (FRAC_BITS=8)
constexpr int TRUNCATION_SHIFT = 16 - 8;
// Shift the raw 32-bit value right by 8 bits to change the binary point position
int32_t raw_final_32 = res_16_32.raw();
int16_t raw_final_16 = static_cast<int16_t>(raw_final_32 >> TRUNCATION_SHIFT);
Fx8_16 final_res = Fx8_16::fromRaw(raw_final_16);
return final_res;
}
int main() {
std::cout.precision(10);
int n = 3;
int d = 4;
Fx8_16 result = fxdiv_corrected(n, d);
std::cout << "Approximate division of " << n << "/" << d << ": " << result << std::endl;
std::cout << "Real division of " << n << "/" << d << ": " << static_cast<float>(3)/static_cast<float>(4) << std::endl;
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment