Skip to content

Instantly share code, notes, and snippets.

@arnabanimesh
Last active April 28, 2022 20:08
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save arnabanimesh/09d2a0a20d5be7ea07326543bb87c0b5 to your computer and use it in GitHub Desktop.
Save arnabanimesh/09d2a0a20d5be7ea07326543bb87c0b5 to your computer and use it in GitHub Desktop.
FISR speed test mentioned in the video: https://www.youtube.com/watch?v=Fm0vygzVXeE

Benchmark sqrt.rs (Command: rustc -O sqrt.rs)

Sanity checks:
============================================
Sqrt = 1000
isqrt = 1000
fisqrt = 998
intrsqrt = 1000
============================================
Integer Square Root      | Passes 177
Floating point unit      | Passes 2592
Fast Inverse Square Root | Passes 2583
Intrinsic SqrRoot        | Passes 3628
20.504412

Benchmark sqrt.cpp (Command: clang++ -Ofast sqrt.cpp)

Sanity checks:
============================================
Sqrt = 1000
isqrt = 1000
fisqrt = 998
intrsqrt = 1000
============================================
Integer Square Root      | Passes 150
Floating point unit      | Passes 1645
Fast Inverse Square Root | Passes 1911
Intrinsic SqrRoot        | Passes 3108
30.9367
#include <iostream>
#include <cmath>
#include <chrono>
#include <xmmintrin.h>
using namespace std;
using namespace std::chrono;
auto outputval = 0.0;
// sqrtintr - Use the intrinsics for SSE/SIMD rsqrt
__forceinline double sqrtintr(const double f)
{
__m128 temp = _mm_set_ss(f);
temp = _mm_rsqrt_ss(temp);
return _mm_cvtss_f32(temp);
}
// isqrt - 64 bit Unsigned Integer Square Root
uint64_t isqrt(uint64_t x)
{
uint64_t root = 0;
for (uint64_t Bit = 0x80000000L; Bit > 0; Bit >>= 1)
{
uint64_t trial = root + Bit;
if ((uint64_t)trial * (uint64_t)trial <= x)
root += Bit;
}
return root;
}
// fisqrt - Fast Inverse Square Root. Accepts and returns 32-bit precision float
__forceinline float fisqrt(float n)
{
const float threehalfs = 1.5F;
float y = n;
long i = *(long *)&y;
i = 0x5f3759df - (i >> 1);
y = *(float *)&i;
y = y * (threehalfs - ((n * 0.5F) * y * y));
// y = y * (threehalfs - ((n * 0.5F) * y * y));
return y;
}
// CountIDivisions - Counts integer square roots
uint64_t CountIDivisions(long long secondsToRun)
{
const auto tStart = steady_clock::now();
auto value = 0.0;
auto sum = 0.0;
auto passes = 0UL;
while ((duration_cast<seconds>(steady_clock::now() - tStart)).count() < secondsToRun)
{
for (int i = 0; i < 1000; i++)
{
value += ++passes;
sum += 1 / isqrt((uint64_t)value);
}
}
outputval = sum;
return passes;
}
// CountDivisions - Uses the normal compiler sqrt() function
uint64_t CountDivisions(long long secondsToRun)
{
const auto tStart = steady_clock::now();
auto value = 0.0;
auto sum = 0.0;
auto passes = 0UL;
while ((duration_cast<seconds>(steady_clock::now() - tStart)).count() < secondsToRun)
{
for (int i = 0; i < 1000; i++)
{
value += ++passes;
sum += 1 / sqrt(value);
}
}
outputval = sum;
return passes;
}
// CountIntrDivisions - Use the intrinsics for SSE/SIMD rsqrt
uint64_t CountIntrDivisions(long long secondsToRun)
{
const auto tStart = steady_clock::now();
auto value = 0.0;
auto sum = 0.0;
auto passes = 0UL;
while ((duration_cast<seconds>(steady_clock::now() - tStart)).count() < secondsToRun)
{
for (int i = 0; i < 1000; i++)
{
value += ++passes;
sum += sqrtintr(value);
}
}
outputval = sum;
return passes;
}
// CountFIDivisions - Test the FISR
uint64_t CountFIDivisions(long long secondsToRun)
{
const auto tStart = steady_clock::now();
auto value = 0.0;
auto sum = 0.0;
auto passes = 0UL;
while ((duration_cast<seconds>(steady_clock::now() - tStart)).count() < secondsToRun)
{
for (int i = 0; i < 1000; i++)
{
value += ++passes;
sum += fisqrt((float)value);
}
}
outputval = sum;
return passes;
}
int main(int argc, char *argv[])
{
cout << "Sanity checks:" << endl;
cout << "============================================" << endl;
cout << "Sqrt = " << round(sqrt(1000000)) << endl;
cout << "isqrt = " << round(isqrt(1000000)) << endl;
cout << "fisqrt = " << round(1000000 * fisqrt(1000000)) << endl;
cout << "intrsqrt = " << round(1000000 * sqrtintr(1000000)) << endl;
cout << "============================================" << endl;
auto seconds = 5;
auto cDiv = CountDivisions(seconds) / 1000000;
auto ciDiv = CountIDivisions(seconds) / 1000000;
auto fiDiv = CountFIDivisions(seconds) / 1000000;
auto intDiv = CountIntrDivisions(seconds) / 1000000;
cout << "Integer Square Root | Passes " << ciDiv << endl;
cout << "Floating point unit | Passes " << cDiv << endl;
cout << "Fast Inverse Square Root | Passes " << fiDiv << endl;
cout << "Intrinsic SqrRoot | Passes " << intDiv << endl;
cout << outputval << endl; // so that the compiler doesn't ignore the variable
}
#[cfg(target_arch = "x86")]
use std::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;
use std::time::Instant;
static mut OUT: f32 = 0.;
// sqrtintr - Use the intrinsics for SSE/SIMD rsqrt
#[inline(always)]
fn sqrtintr(f: f32) -> f32 {
if is_x86_feature_detected!("sse") {
unsafe { _mm_cvtss_f32(_mm_rsqrt_ss(_mm_set_ss(f))) }
} else {
panic!("SSE feature not detected.")
}
}
// isqrt - 64 bit Unsigned Integer Square Root
fn isqrt(x: u64) -> u64 {
let mut root: u64 = 0;
let mut bit: u64 = 0x80000000;
while bit > 0 {
let trial = root + bit;
if trial * trial <= x {
root += bit;
}
bit >>= 1;
}
root
}
// fisqrt - Fast Inverse Square Root. Accepts and returns 32-bit precision float
#[inline(always)]
fn fisqrt(n: f32) -> f32 {
let y = f32::from_bits(0x5f3759df - (n.to_bits() >> 1));
// let y = y * (1.5 - ((n * 0.5) * y * y)); // Uncomment for higher accuracy but lower speed
y * (1.5 - ((n * 0.5) * y * y))
}
// CountIDivisions - Counts integer square roots
fn count_i_divisions(seconds_to_run: u64) -> u64 {
let t_start = Instant::now();
let mut value = 0;
let mut sum = 0.;
let mut passes = 0;
while t_start.elapsed().as_secs() < seconds_to_run {
for _ in 0..1000 {
passes += 1;
value += passes;
sum += (1 / isqrt(value)) as f32;
}
}
unsafe {
OUT = sum;
}
passes
}
// CountDivisions - Uses the normal compiler sqrt() function
fn count_divisions(seconds_to_run: u64) -> u64 {
let t_start = Instant::now();
let mut value = 0.;
let mut sum = 0.;
let mut passes = 0;
while t_start.elapsed().as_secs() < seconds_to_run {
for _ in 0..1000 {
passes += 1;
value += passes as f32;
sum += 1. / f32::sqrt(value);
}
}
unsafe {
OUT = sum;
}
passes
}
// CountIntrDivisions - Use the intrinsics for SSE/SIMD rsqrt
fn count_intr_divisions(seconds_to_run: u64) -> u64 {
let t_start = Instant::now();
let mut value = 0.;
let mut sum = 0.;
let mut passes = 0;
while t_start.elapsed().as_secs() < seconds_to_run {
for _ in 0..1000 {
passes += 1;
value += passes as f32;
sum += sqrtintr(value);
}
}
unsafe {
OUT = sum;
}
passes
}
// CountFIDivisions - Test the FISR
fn count_fi_divisions(seconds_to_run: u64) -> u64 {
let t_start = Instant::now();
let mut value = 0.;
let mut sum = 0.;
let mut passes = 0;
while t_start.elapsed().as_secs() < seconds_to_run {
for _ in 0..1000 {
passes += 1;
value += passes as f32;
sum += fisqrt(value);
}
}
unsafe {
OUT = sum;
}
passes
}
fn main() {
println!("Sanity checks:");
println!("============================================");
println!("Sqrt = {}", f32::sqrt(1000000.).round());
println!("isqrt = {}", isqrt(1000000));
println!("fisqrt = {}", (fisqrt(1000000.) * 1000000.).round());
println!("intrsqrt = {}", (sqrtintr(1000000.) * 1000000.).round());
println!("============================================");
let seconds = 5;
let c_div = count_divisions(seconds) / 1000000;
let ci_div = count_i_divisions(seconds) / 1000000;
let fi_div = count_fi_divisions(seconds) / 1000000;
let int_div = count_intr_divisions(seconds) / 1000000;
println!("Integer Square Root | Passes {}", ci_div);
println!("Floating point unit | Passes {}", c_div);
println!("Fast Inverse Square Root | Passes {}", fi_div);
println!("Intrinsic SqrRoot | Passes {}", int_div);
unsafe {
println!("{}", OUT); // so that the compiler doesn't ignore the variable
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment