Skip to content

Instantly share code, notes, and snippets.

@equalent
Last active January 10, 2024 05:35
Show Gist options
  • Save equalent/35d9575747b9cfa05230230a0d8469d4 to your computer and use it in GitHub Desktop.
Save equalent/35d9575747b9cfa05230230a0d8469d4 to your computer and use it in GitHub Desktop.
Inverse square root implementation in plain C (q_rsqrt) and using SSE / NEON intrinsics
#pragma once
#if defined(__amd64__) || defined(__amd64) || defined(__x86_64__) \
|| defined(__x86_64) || defined(_M_AMD64)
#define RSQRT_ARCH_AMD64
#define RSQRT_ARCH_X86
#endif
#if defined(i386) || defined(__i386) || defined(__i386__) \
|| defined(__i386__) || defined(__IA32__) || defined(_M_IX86)
#define RSQRT_ARCH_I386
#define RSQRT_ARCH_X86
#endif
#if defined(__aarch64__) || defined(_M_ARM64)
#define RSQRT_ARCH_ARM64
#endif
#ifdef _MSC_VER
#define RSQRT_FASTCALL __fastcall
#define RSQRT_FORCEINLINE __forceinline
#else
#define RSQRT_FASTCALL
#define RSQRT_FORCEINLINE __attribute__((__always_inline__))
#endif
#if defined(RSQRT_ARCH_X86)
#include <xmmintrin.h>
#elif defined(RSQRT_ARCH_ARM64)
#ifdef _MSC_VER
#include <arm64_neon.h>
#else
#include <arm_neon.h>
#endif
#endif // defined(RSQRT_ARCH_ARM64)
RSQRT_FORCEINLINE float RSQRT_FASTCALL rsqrt(float number)
{
#if defined(RSQRT_ARCH_X86)
__m128 n = _mm_set_ss(number);
n = _mm_rsqrt_ss(n);
return _mm_cvtss_f32(n);
#elif defined(RSQRT_ARCH_ARM64)
// initial estimate:
float32x2_t n = vrsqrte_f32(vdup_n_f32(number));
// increase accuracy using Newton-Raphson method:
n = vmul_f32(n, vrsqrts_f32(n, vmul_f32(n, vdup_n_f32(number))));
return vget_lane_f32(n, 0);
#else
// https://en.wikipedia.org/wiki/Fast_inverse_square_root#Worked_example
union {
float f;
uint32_t i;
} conv;
conv.f = number;
conv.i = 0x5f3759df - (conv.i >> 1);
conv.f *= 1.5F - (number * 0.5F * conv.f * conv.f);
return conv.f;
#endif
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment