Skip to content

Instantly share code, notes, and snippets.

@Marc-B-Reynolds
Last active August 28, 2023 10:07
Show Gist options
  • Star 9 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Marc-B-Reynolds/9fb24a7a4ee915e6e973bf9f4d08c404 to your computer and use it in GitHub Desktop.
Save Marc-B-Reynolds/9fb24a7a4ee915e6e973bf9f4d08c404 to your computer and use it in GitHub Desktop.
brute force testing of 1/sqrt functions
click for range breakdown

checking on [3f800000,40000000] [1.000000e+00,2.000000e+00]

func e max ULP CR FR 2 ULP > 2 ULP CR% FR% 2 ULP% > 2 ULP%
vrsqrte_f32 -- 4947968 103 225 216 8388065 0.001228 0.002682 0.002575 99.993515
FRSR_Mon0 -- 564177 3 8 6 8388592 0.000036 0.000095 0.000072 99.999797
FRSR_Deg0 -- 403258 0 0 0 8388609 0.000000 0.000000 0.000000 100.000000
FRSR_Mon1 -- 14751 230 464 466 8387449 0.002742 0.005531 0.005555 99.986172
FRSR_Deg1 -- 10215 321 602 609 8387077 0.003827 0.007176 0.007260 99.981737
FRSR_Deg1Alt -- 10219 305 602 589 8387113 0.003636 0.007176 0.007021 99.982166
rsqrt_sse -- 4980 1916 3792 3835 8379066 0.022840 0.045204 0.045717 99.886239
rsqrt_msh -- 1169 3067 6192 6203 8373147 0.036561 0.073814 0.073946 99.815679
rsqrt_day -- 317 9560 19141 19109 8340799 0.113964 0.228178 0.227797 99.430060
rsqrt_wmc -- 11 459691 998354 1126095 5804469 5.479943 11.901306 13.424097 69.194654
rsqrt_sse_r2 2.7195733655 3 6342724 1974273 71500 112 75.611153 23.535165 0.852346 0.001335
rsqrt_sse_r1 2.7000876479 3 6502216 1829513 56816 64 77.512446 21.809492 0.677299 0.000763
rsqrt_day_r1 0.8535512891 1 7109620 1278989 0 0 84.753265 15.246735 0.000000 0.000000
f32_rsqrt 0.8499174770 1 7109638 1278971 0 0 84.753479 15.246521 0.000000 0.000000
rsqrt_day_r2 0.5094854385 1 8353865 34744 0 0 99.585819 0.414181 0.000000 0.000000
rsqrt_wmc_r1 0.8518467862 1 7110734 1277875 0 0 84.766545 15.233455 0.000000 0.000000
rsqrt_sse_hm 0.5011967756 1 8387934 675 0 0 99.991953 0.008047 0.000000 0.000000
rsqrt_day_hm 0.5000192411 1 8388567 42 0 0 99.999499 0.000501 0.000000 0.000000
rsqrt_wmc_r2 0.5000088997 1 8388575 34 0 0 99.999595 0.000405 0.000000 0.000000
rsqrt_wmc_hm 0.5000000857 1 8388608 1 0 0 99.999988 0.000012 0.000000 0.000000
rsqrt_cb 0.4999999218 0 8388609 0 0 0 100.000000 0.000000 0.000000 0.000000
rsqrt_day_cr 0.4999999218 0 8388609 0 0 0 100.000000 0.000000 0.000000 0.000000
rsqrt_sse_cr 0.4999999218 0 8388609 0 0 0 100.000000 0.000000 0.000000 0.000000

checking on [40000000,40800000] [2.000000e+00,4.000000e+00]

func e max ULP CR FR 2 ULP > 2 ULP CR% FR% 2 ULP% > 2 ULP%
vrsqrte_f32 -- 4947968 115 251 252 8387991 0.001371 0.002992 0.003004 99.992633
FRSR_Mon0 -- 564177 6 12 11 8388580 0.000072 0.000143 0.000131 99.999654
FRSR_Deg0 -- 286518 11 26 24 8388548 0.000131 0.000310 0.000286 99.999273
FRSR_Mon1 -- 9631 85 129 167 8388228 0.001013 0.001538 0.001991 99.995458
FRSR_Deg1 -- 7339 409 644 745 8386811 0.004876 0.007677 0.008881 99.978566
FRSR_Deg1Alt -- 7338 362 716 723 8386808 0.004315 0.008535 0.008619 99.978530
rsqrt_sse -- 4096 2692 5396 5376 8375145 0.032091 0.064325 0.064087 99.839497
rsqrt_msh -- 827 4366 8757 8701 8366785 0.052047 0.104392 0.103724 99.739838
rsqrt_day -- 213 86538 175997 184719 7941355 1.031613 2.098047 2.202022 94.668317
rsqrt_wmc 8.0949323997 8 399307 816037 899005 6274260 4.760110 9.727918 10.716973 74.794999
rsqrt_sse_r2 2.3655200340 2 6681325 1680687 26597 0 79.647591 20.035348 0.317061 0.000000
rsqrt_sse_r1 2.3163667079 2 6717604 1652745 18260 0 80.080070 19.702253 0.217676 0.000000
rsqrt_day_r1 0.7492158674 1 7484765 903844 0 0 89.225341 10.774659 0.000000 0.000000
f32_rsqrt 0.7497656401 1 7485042 903567 0 0 89.228643 10.771357 0.000000 0.000000
rsqrt_day_r2 0.5062385015 1 8374395 14214 0 0 99.830556 0.169444 0.000000 0.000000
rsqrt_wmc_r1 0.7494520154 1 7485268 903341 0 0 89.231337 10.768663 0.000000 0.000000
rsqrt_sse_hm 0.5008518882 1 8388060 549 0 0 99.993455 0.006545 0.000000 0.000000
rsqrt_day_hm 0.5000102632 1 8388582 27 0 0 99.999678 0.000322 0.000000 0.000000
rsqrt_wmc_r2 0.5000051744 1 8388583 26 0 0 99.999690 0.000310 0.000000 0.000000
rsqrt_wmc_hm 0.5000000447 1 8388607 2 0 0 99.999976 0.000024 0.000000 0.000000
rsqrt_cb 0.4999999981 0 8388609 0 0 0 100.000000 0.000000 0.000000 0.000000
rsqrt_day_cr 0.4999999981 0 8388609 0 0 0 100.000000 0.000000 0.000000 0.000000
rsqrt_sse_cr 0.4999999981 0 8388609 0 0 0 100.000000 0.000000 0.000000 0.000000

TOTAL:

func e max ULP CR FR 2 ULP > 2 ULP CR% FR% 2 ULP% > 2 ULP%
vrsqrte_f32 -- 4947968 218 476 468 16776056 0.001299 0.002837 0.002789 99.993074
FRSR_Mon0 -- 564177 9 20 17 16777172 0.000054 0.000119 0.000101 99.999726
FRSR_Deg0 -- 403258 11 26 24 16777157 0.000066 0.000155 0.000143 99.999636
FRSR_Mon1 -- 14751 315 593 633 16775677 0.001878 0.003535 0.003773 99.990815
FRSR_Deg1 -- 10215 730 1246 1354 16773888 0.004351 0.007427 0.008070 99.980152
FRSR_Deg1Alt -- 10219 667 1318 1312 16773921 0.003976 0.007856 0.007820 99.980348
rsqrt_sse -- 4980 4608 9188 9211 16754211 0.027466 0.054765 0.054902 99.862868
rsqrt_msh -- 1169 7433 14949 14904 16739932 0.044304 0.089103 0.088835 99.777758
rsqrt_day -- 317 96098 195138 203828 16282154 0.572789 1.163113 1.214909 97.049189
rsqrt_wmc -- 11 858998 1814391 2025100 12078729 5.120026 10.814612 12.070535 71.994827
rsqrt_sse_r2 2.7195733655 3 13024049 3654960 98097 112 77.629372 21.785257 0.584704 0.000668
rsqrt_sse_r1 2.7000876479 3 13219820 3482258 75076 64 78.796258 20.755873 0.447488 0.000381
rsqrt_day_r1 0.8535512891 1 14594385 2182833 0 0 86.989303 13.010697 0.000000 0.000000
f32_rsqrt 0.8499174770 1 14594680 2182538 0 0 86.991061 13.008939 0.000000 0.000000
rsqrt_day_r2 0.5094854385 1 16728260 48958 0 0 99.708188 0.291812 0.000000 0.000000
rsqrt_wmc_r1 0.8518467862 1 14596002 2181216 0 0 86.998941 13.001059 0.000000 0.000000
rsqrt_sse_hm 0.5011967756 1 16775994 1224 0 0 99.992704 0.007296 0.000000 0.000000
rsqrt_day_hm 0.5000192411 1 16777149 69 0 0 99.999589 0.000411 0.000000 0.000000
rsqrt_wmc_r2 0.5000088997 1 16777158 60 0 0 99.999642 0.000358 0.000000 0.000000
rsqrt_wmc_hm 0.5000000857 1 16777215 3 0 0 99.999982 0.000018 0.000000 0.000000
rsqrt_cb 0.4999999981 0 16777218 0 0 0 100.000000 0.000000 0.000000 0.000000
rsqrt_day_cr 0.4999999981 0 16777218 0 0 0 100.000000 0.000000 0.000000 0.000000
rsqrt_sse_cr 0.4999999981 0 16777218 0 0 0 100.000000 0.000000 0.000000 0.000000
// Public Domain under http://unlicense.org, see link for details.
// compile with (or equiv):
// clang -O3 -march=native -Wall -Wextra -Wconversion -Wpedantic -Wno-unused-function -fno-math-errno rsqrt.c -o rsqrt -lm
// VALIDATE_CR options additionally requires MPFR: -lmpfr
// This is intended to be a hackable testbed. Only a subset of
// included routines are ran as is (those that seem potentially
// interesting to me). Also note that a number of implementations
// differ from those in the original papers.
//********************************************************
// CONFIG STUFF HERE
// not really exhaustive (add notes). actually I removed this
// because I'm too lazy to check the domains of the inputs.
// (general slightly smaller than all normal inputs). yeah
// this is useful information.
//#define EXHAUSTIVE_TEST
// following options requires: MPFR installed (https://www.mpfr.org/)
// if defined.
// validate correct rounding of double promotion method (for normal
// and denormal input)
//#define VALIDATE_CR
// if defined (with VALIDATE_CR) then tests all (about 10 minutes)
//#define VALIDATE_CR_ALL
// END OF CONFIG
//********************************************************
#if defined(VALIDATE_CR)
#define NEEDS_MPFR
#endif
#include <stdint.h>
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <string.h>
#include <assert.h>
//********************************************************
const float f32_min_normal = 0x1.0p-126f;
const float f32_max_normal = 0x1.fffffep+127f;
static inline uint32_t f32_to_bits(float x)
{
uint32_t u; memcpy(&u,&x,4); return u;
}
inline float f32_from_bits(uint32_t x)
{
float u; memcpy(&u,&x,4); return u;
}
static inline uint64_t f64_to_bits(double x)
{
uint64_t u; memcpy(&u, &x, 8); return u;
}
static inline double f64_from_bits(uint64_t x)
{
double f; memcpy(&f, &x, 8); return f;
}
// JUST SAY NO TO MATH ERRNO
// to cut some of the pain of math errno not being disabled
// (-fno-math-errno). But you really should do that. Unless
// your using visual-c: Then I feel your pain and I can't
// think of anything better to do than this. (and nag
// MS to make a work around)
inline float f32_sqrt(float a)
{
#if defined(__NO_MATH_ERRNO__) || defined(_M_FP_FAST)
return sqrtf(a);
#elif defined(F32_INTEL)
return F32_SSE_WRAP(_mm_sqrt_ss, a);
#elif defined(__ARM_ARCH)
return __sqrtf(a);
#else
// do nothing for warning ATM
#endif
}
//********************************************************
// faithfully rounded.
static inline float f32_rsqrt(float x) { return f32_sqrt(1.f/x); }
// promote/demote double is correctly rounded
static inline float f32_rsqrt_cr(float x)
{
return (float)sqrt(1.0/(double)x);
}
// DEAD
// dep-chain len: 2
// correctly: 74.013561% (12417415)
// faithfully: 25.986439% (4359801)
// uiCA predicted throughput: 23.00
float rsqrt_n1(float x) { return f32_sqrt(x)/x; }
// DEAD
// dep-chain len: 2
// correctly: 73.995733% (12414424)
// faithfully: 26.004267% (4362792)
// uiCA predicted throughput: 23.00
float rsqrt_n2(float x) { return 1.f/f32_sqrt(x); }
// MOSTLY DEAD
// NOTE: if sqrt(x) and 1/x "can" be computed in parallel
// then this becomes potentially interesting. This is
// not the case in current mainstream CPUs.
// dep-chain len: 2
// correctly: 64.843583% (10878948)
// faithfully: 34.837395% (5844745)
// 2 ulp: 0.319022% (53523)
float rsqrt_n3(float x) { return f32_sqrt(x)*(1.f/x); }
//********************************************************
// Newton-Raphson step for 1/sqrt(x) using FMA when
// we haven't computed (or approximated) 1/x
// y = estimated in (y_n)
// h = -x/2
// { don't use see: XX}
inline float f32_rsqrt_nr_step_common(float y, float h)
{
return y*fmaf(h,y*y, 1.5f);
}
// rename
static inline float f32_rsqrt_nr_fma_step(float y, float h)
{
float r = y*y;
float v = fmaf(h,r, 0.5f);
return fmaf(y,v,y);
}
// Newton-Raphson step for 1/sqrt(x) using FMA when
// y = estimate in (y_n)
// h = -x/2
// r = ~1/x
// RENAME AND NO
inline float f32_rsqrt_nr_step_cb(float y, float h, float r)
{
float s = fmaf(h,r,0.5f);
float t = fmaf(y,y, -r);
float v = fmaf(h,t, s);
return fmaf(y,v,y);
}
// Halley method step for 1/sqrt(x)
// y = estimate in (y_n)
// h = -x/2
// r = ~1/x
inline float f32_rsqrt_hm_step(float y, float h, float r)
{
float s = fmaf(r,h,0.5f);
float t = fmaf(y,y,-r);
float v = fmaf(h,t, s);
float w = fmaf(1.5f*v,v,v);
return fmaf(y,w,y);
}
//********************************************************
// expansion helpers
static inline float rsqrt_r1_apply(float (*f)(float), float x)
{
float y = f(x);
return f32_rsqrt_nr_fma_step(y, -0.5f*x);
}
static inline float rsqrt_r2_apply(float (*f)(float), float x)
{
float y = f(x);
return f32_rsqrt_nr_step_cb(y, -0.5f*x, y*y);
}
static inline float rsqrt_hm_apply(float (*f)(float), float x)
{
float y = f(x);
return f32_rsqrt_hm_step(y, -0.5f*x, y*y);
}
//********************************************************
// Portable versions by Carlos F. Borges
// "Fast Compensated Algorithms for the Reciprocal Square Root, the Reciprocal Hypotenuse, and Givens Rotations", Carlos F. Borges, 2021
// https://arxiv.org/abs/2103.08694
// --
// additional analysis:
// "High-level algorithms for correctly-rounded reciprocal square roots", Borges, Jeannerod & Muller, 2022
// https://hal.inria.fr/hal-03728088
// dep-chain len: 5
// correctly: 99.999994% (16777215)
// faithfully: 0.000006% (1)
// Borges Algorithm 2 (rsqrtNewton in Borges/Jeannerod/Muller)
// uiCA: https://bit.ly/3OkNAk5
// predicted throughput: 35.00
float rsqrt_hq(float x)
{
float r = 1.f/x;
float h = -0.5f*x;
x = f32_sqrt(r);
x = f32_rsqrt_nr_step_cb(x,h,r);
return x;
}
// correctly rounded (in round to even/ties to nearest)
// on [0x1p-125, 0x1.fffffcp+127] (almost all normal numbers)
// Borges Algorithm 3 (rsqrtHalley in Borges/Jeannerod/Muller)
// uiCA: https://bit.ly/3AzVQXJ
// predicted throughput: 40.00
float rsqrt_cb(float x)
{
float r = 1.f/x;
float h = -0.5f*x;
x = f32_sqrt(r);
x = f32_rsqrt_hm_step(x,h,r);
return x;
}
//********************************************************
// so-call "bithack" initial steps
static inline float f32_rsqrt_initial(float x, uint32_t K)
{
return f32_from_bits(K - (f32_to_bits(x) >> 1));
}
// Day observes there's a second choice which gives some
// extra wiggle room in optimizing the "magic" K
static inline float f32_rsqrt_initial_md(float x, uint32_t K)
{
return f32_from_bits((K-f32_to_bits(x)) >> 1);
}
//********************************************************
// Mike Day core routines as in paper. Without star is paper
// and contractions are disabled. Starred versions are explict FMAs
// where version choosen uses the greedy measure to minimize
// the counts of >2 ulp results.
// | func |max ULP| CR| FR| 2 ULP| > 2 ULP|
// | ---: | ---:| ---:| ---:| ---:| ---:|
// | FRSR_Mon0 | 564177| 12| 28| 23| 25165761|
// | FRSR_Deg0 | 403258| 11| 26| 24| 25165763|
// | FRSR_Mon1 | 14751| 557| 1070| 1063| 25163134|
// | FRSR_Mon1*| 14751| 545| 1057| 1099| 25163123|
// | FRSR_Deg1 | 10215| 941| 1802| 1939| 25161142|
// | FRSR_Deg1*| 10215| 1051| 1848| 1963| 25160962|
// | FRSR_Deg1Alt | 10219| 874| 1858| 2033| 25161059|
// | FRSR_Deg1Alt*| 10219| 972| 1920| 1901| 25161031|
// | FRSR_Mon2 | 317|105658| 214279| 222937| 24622950|
// | FRSR_Mon2*| 317|105658| 214279| 222937| 24622950|
// listing 2 (FRSR_Mon0)
// max-ulp: 564177
float FRSR_Mon0(float x)
{
return f32_rsqrt_initial(x, 0x5F37642F);
}
// listing 3 (FRSR_Deg0)
// max-ulp: 403258
float FRSR_Deg0(float x)
{
return 0.79247999f * f32_rsqrt_initial_md(x, 0xBEBFFDAA);
}
// listing 4 (FRSR_Mon1)
// max-ulp: 14751
// | CR| FR | 2 ULP| > 2 ULP |
// | 557| 1070| 1063| 25163134| 1
// | 497| 1094| 1097| 25163136| 2
// | 545| 1057| 1099| 25163123| 3
// 3 fma/mul
float FRSR_Mon1(float x)
{
float y = f32_rsqrt_initial_md(x, 0xBE167122);
//return y * (1.8909901f - x*y*y); // 1
//return y * fmaf(-x, y*y, 1.8909901f); // 2
return y * fmaf(-y, x*y, 1.8909901f); // 3
}
// listing 5 (FRSR_Deg1)
// max-ulp: 10215
// | CR| FR | 2 ULP| > 2 ULP |
// | 941| 1802| 1939| 25161142| 1
// | 992| 1907| 1882| 25161043| 2
// | 1051| 1848| 1963| 25160962| 3
// | 976| 1907| 1894| 25161047| 4
// 4 fma/mul
float FRSR_Deg1(float x)
{
static const float c0 = 1.1893165f;
static const float c1 = -0.24889956f;
float y = f32_rsqrt_initial(x, 0x5F5FFF00);
//return y * (c0 + x*y*y*c1); // 1
//return y * fmaf(x, y*y*c1, c0); // 2
return y * fmaf(x*y, y*c1, c0); // 3
//return y * fmaf(x*c1,y*y, c0); // 4
}
// listing 6 (FRSR_Deg1Alt)
// max-ulp: 10219
// | CR| FR | 2 ULP| > 2 ULP |
// | 874| 1858| 2033| 25161059| 1
// | 972| 1920| 1901| 25161031| 2
// | 950| 1991| 1908| 25160975| 3
// | 972| 1920| 1901| 25161031| 4
// 4 fma/mul
float FRSR_Deg1Alt(float x)
{
static const float c0 = 1.1891762f;
static const float c1 = -0.24881148f;
float y = f32_rsqrt_initial(x, 0x5F6004CC);
//return y * (c0 + y*y*x*c1); // 1
//return y * fmaf(y*y, x*c1, c0); // 2
//return y * fmaf(x*y, y*c1, c0); // 3
return y * fmaf(x*c1,y*y, c0); // 4
}
// listing 7 (FRSR_Mon2)
// max-ulp: 317
// | CR| FR | 2 ULP| > 2 ULP |
// | 105658| 214279| 222937| 24622950| 1
// | 105658| 214279| 222937| 24622950| 2
// | 106447| 213978| 221946| 24623453| 3
// 4 fma/mul, 1 add
float rsqrt_day(float x)
{
static const float c0 = 2.2825186f;
static const float c1 = -2.253305f;
float y = f32_rsqrt_initial(x, 0x5F11107D);
float z = x*y*y;
//return y * (c0 + z *(z+c1)); // 1
return y * fmaf(z, z+c1, c0); // 2
//return y * fmaf(z,z,fmaf(z,c1,c0)); // 3
}
// max-ulp: 2
// correctly: 70.604507% (17768206)
// faithfully: 29.345020% (7384916)
// 2 ulp: 0.050473% (12702)
// 8 fma/mul, 1 add
float rsqrt_day_r1(float x) { return rsqrt_r1_apply(&rsqrt_day, x); }
// max-ulp: 1
// correctly: 99.667398% (25082122)
// faithfully: 0.332602% (83702)
// 10 fma/mul, 1 add
float rsqrt_day_r2(float x) { return rsqrt_r2_apply(&rsqrt_day, x); }
// max-ulp: 1
// correctly: 99.999559% (25165713)
// faithfully: 0.000441% (111)
// 12 fma/mul, 1 add
float rsqrt_day_hm(float x) { return rsqrt_hm_apply(&rsqrt_day, x); }
float rsqrt_day_cr(float x)
{
float y = rsqrt_day(x);
float h = -0.5f*x;
y = f32_rsqrt_hm_step(y, h, y*y);
//y = f32_rsqrt_nr_fma_step(y, -0.5f*x); // no good with this step
y = f32_rsqrt_nr_step_cb(y, h, y*y); // need this
return y;
}
//*********************************************************************
// Walczyk, Moroz, Cieslinski, Cezary:
// "InvSqrt2" - modified to use FMA
// no constant refinement
// max-ulp: 11
// correctly: 5.120015% (858996)
// faithfully: 11.176844% (1814391)
// 2 ulp: 12.521724% (3151195)
// >2 ulp: 71.061440% (17883197)
// 7 fma/mul
float rsqrt_wmc(float x)
{
float h = -0.5f*x;
x = f32_rsqrt_initial(x, 0x5F376908);
x = x*fmaf(h,x*x, 1.5008789f);
x = x*fmaf(h,x*x, 1.5000006f);
return x;
}
float rsqrt_wmc_r1(float x) { return rsqrt_r1_apply(&rsqrt_wmc, x); }
float rsqrt_wmc_r2(float x) { return rsqrt_r2_apply(&rsqrt_wmc, x); }
float rsqrt_wmc_hm(float x) { return rsqrt_hm_apply(&rsqrt_wmc, x); }
//*********************************************************************
// "Modified Fast Inverse Square Root and Square Root Approximation
// Algorithms: The Method of Switching Magic Constants"
// Moroz, Volodymyr, Samotyy and Horyachyy, 2021
float rsqrt_msh(float x)
{
static const uint32_t R[2] = {0x5f19e8b7, 0x5ed9e8b7};
static const float A[2] = {2.1499474f, 1.0749737f};
static const float M[2] = {0.824218631f, 2.33124256f};
uint32_t id = (f32_to_bits(x) & 0x00800000) == 0 ? 0 : 1;
float y;
y = f32_rsqrt_initial(x, R[id]);
y = M[id]*y*fmaf(-x, y*y, A[id]);
return y;
}
float rsqrt_msh_r1(float x) { return rsqrt_r1_apply(&rsqrt_msh, x); }
float rsqrt_msh_r2(float x) { return rsqrt_r2_apply(&rsqrt_msh, x); }
float rsqrt_msh_hm(float x) { return rsqrt_hm_apply(&rsqrt_msh, x); }
//*********************************************************************
// original SSE approximation
#include <x86intrin.h>
// return hardware 1/sqrt(x) approx
static inline float rsqrt_sse(float x)
{
return _mm_cvtss_f32(_mm_rsqrt_ss(_mm_set_ss(x)));
}
// 4 fma/mul
float rsqrt_sse_r1(float x)
{
float y = rsqrt_sse(x);
return f32_rsqrt_nr_fma_step(y, -0.5f*x);
}
// 8 fma/mul
float rsqrt_sse_r2(float x)
{
float y = rsqrt_sse(x);
return f32_rsqrt_nr_step_cb(y, -0.5f*x, y*y);
}
// 8 fma/mul
float rsqrt_sse_hm(float x)
{
float y = rsqrt_sse(x);
return f32_rsqrt_hm_step(y, -0.5f*x, y*y);
}
// 13 fma/mul
float rsqrt_sse_cr(float x)
{
float y = rsqrt_sse(x);
float h = -0.5f*x;
y = f32_rsqrt_hm_step(y, h, y*y);
y = f32_rsqrt_nr_step_cb(y, h, y*y);
return y;
}
//*********************************************************************
// Use hardware approximation: neon
// emulate neon approximation ops (std & not increased precision ones)
// ignore all special cases (including 0): don't
// need them for local testing. written to be eyeball-able to spec
float vrecpe_f32(float x)
{
// extract the parts
uint32_t ix = f32_to_bits(x);
uint32_t sx = ((ix >> 31) & 0x01);
uint32_t ex = ((ix >> 23) & 0xff);
uint32_t mx = ((ix ) & 0x7fffff);
uint32_t t;
mx = 0x100|(mx >> 15); // add implied bit and top 8 explict
ex = 253-ex; // adjust the exp
// SEE: ARM Spec - FPRecipEstimate
t = (mx<<1)+1;
t = (((1<<19)/t)+1)>>1;
mx = (t & 0xff) << 15;
// concat the parts back together
ix = (sx & 0x01) << 31;
ix |= (ex & 0xff) << 23;
ix |= (mx & 0x7fffff);
return f32_from_bits(ix);
}
// max-ulp: 4947968
// correctly: 0.001293% (217)
// faithfully: 0.002837% (476)
// written to be eyeball-able to spec
float vrsqrte_f32(float x)
{
// yeah: lots of reductions are possible & don't care
uint32_t ix = f32_to_bits(x);
uint32_t ex = ((ix >> 23) & 0xff);
uint32_t mx = ((ix ) & 0x7fffff);
uint32_t s = (mx & 1);
mx >>= (15+s);
mx |= 1 << (8-s);
if (mx < 256)
mx = mx * 2 + 1;
else
mx = (mx|1) << 1;
uint32_t t = 512;
while (mx * (t+1)*(t+1) < (1<<28)) { t++; }
mx = (t+1)/2;
ex = (380-ex)/2;
ix = (ex & 0xff) << 23;
ix |= (mx & 0xff) << 15;
return f32_from_bits(ix);
}
float vrsqrts_f32(float a, float b)
{
return -0.5f*fmaf(a,b,-3.f);
}
float vrecps_f32(float a, float b)
{
return -fmaf(a,b,-2.f);
}
//********************************************************
// start of testing related stuff
//********************************************************
static inline uint32_t u32_abs(uint32_t x)
{
return (int32_t)x >= 0 ? x : -x;
}
// ulp distance provided a & b are finite
// and have the same sign
static inline uint32_t f32_ulp_dist_ss(float a, float b)
{
uint32_t ua = f32_to_bits(a);
uint32_t ub = f32_to_bits(b);
return u32_abs(ua-ub);
}
static inline uint64_t u64_abs(uint64_t x)
{
return (int64_t)x >= 0 ? x : -x;
}
// ulp distance provided a & b are finite
// and have the same sign
static inline uint64_t f64_ulp_dist_ss(double a, double b)
{
uint64_t ua = f64_to_bits(a);
uint64_t ub = f64_to_bits(b);
return u64_abs(ua-ub);
}
typedef struct {
float (*f)(float);
char* name;
char* notes;
uint32_t flags;
uint32_t type;
} func_entry_t;
#define LENGTHOF(X) (sizeof(X)/sizeof(X[0]))
#define STRINGIFY(S) STRINGIFY_(S)
#define STRINGIFY_(S) #S
#define ENTRY(X) { .f=&X, .name=STRINGIFY(X) }
func_entry_t func_table[] =
{
#if 1
ENTRY(vrsqrte_f32),
ENTRY(FRSR_Mon0),
ENTRY(FRSR_Deg0),
ENTRY(FRSR_Mon1),
ENTRY(FRSR_Deg1),
ENTRY(FRSR_Deg1Alt),
ENTRY(rsqrt_sse),
ENTRY(rsqrt_msh),
ENTRY(rsqrt_day),
ENTRY(rsqrt_wmc),
ENTRY(rsqrt_sse_r2),
ENTRY(rsqrt_sse_r1),
ENTRY(rsqrt_day_r1),
ENTRY(f32_rsqrt),
ENTRY(rsqrt_day_r2),
ENTRY(rsqrt_wmc_r1),
ENTRY(rsqrt_sse_hm),
ENTRY(rsqrt_day_hm),
ENTRY(rsqrt_wmc_r2),
ENTRY(rsqrt_wmc_hm),
//ENTRY(f32_rsqrt_bar),
// find ranges of these
ENTRY(rsqrt_cb),
ENTRY(rsqrt_day_cr),
ENTRY(rsqrt_sse_cr),
#else
// hack for testing one
#endif
};
typedef struct {
double abs;
uint32_t max;
uint32_t ulp[4];
} func_error_t;
// tracks total error data across multiple intervals
func_error_t func_error[LENGTHOF(func_table)] = {{0}};
// add error data from current interval to the totals
void error_to_totals(func_error_t* e)
{
for(uint32_t fi=0; fi < LENGTHOF(func_table); fi++, e++) {
func_error[fi].ulp[0] += e->ulp[0];
func_error[fi].ulp[1] += e->ulp[1];
func_error[fi].ulp[2] += e->ulp[2];
func_error[fi].ulp[3] += e->ulp[3];
if (e->max > func_error[fi].max) func_error[fi].max = e->max;
if (e->abs > func_error[fi].abs) func_error[fi].abs = e->abs;
}
}
void error_dump_i(func_error_t* e)
{
printf("|%15s|%12s|%10s|%10s|%10s|%10s|%10s|%10s|%10s|%10s|%10s|\n",
"func", "e", "max ULP", "CR", "FR", "2 ULP", "> 2 ULP",
"CR%", "FR%", "2 ULP%","> 2 ULP%");
printf("|%15s|%12s|%10s|%10s|%10s|%10s|%10s|%10s|%10s|%10s|%10s|\n",
"---:", ":---:", "---:", "---:", "---:", "---:",
"---:", "---:", "---:","---:", "---:");
for(uint32_t fi=0; fi < LENGTHOF(func_table); fi++, e++) {
uint32_t u0 = e->ulp[0];
uint32_t u1 = e->ulp[1];
uint32_t u2 = e->ulp[2];
uint32_t u3 = e->ulp[3];
uint32_t t = (u0+u1+u2+u3);
double s = 100.0/(double)t;
if (e->abs < 9.0)
printf("|%15s|%12.10f|%10u|%10u|%10u|%10u|%10u|%10f|%10f|%10f|%10f|\n",
func_table[fi].name, e->abs, e->max,
u0, u1, u2, u3,
s*u0, s*u1, s*u2, s*u3
);
else
printf("|%15s| -- |%10u|%10u|%10u|%10u|%10u|%10f|%10f|%10f|%10f|\n",
func_table[fi].name, e->max,
u0, u1, u2, u3,
s*u0, s*u1, s*u2, s*u3
);
}
}
void error_dump(void)
{
printf("\nTOTAL:\n");
error_dump_i(func_error);
}
void test_range(uint32_t x0, uint32_t x1)
{
float f0 = f32_from_bits(x0);
float f1 = f32_from_bits(x1);
printf("\nchecking on [%08x,%08x] [%e,%e]\n", x0,x1,f0,f1);
func_error_t error[LENGTHOF(func_table)] = {{0}};
for(uint32_t ix=x0; ix<=x1; ix++) {
float x = f32_from_bits(ix);
double d = sqrt(1.0/(double)x);
float cr = (float)d;
for(uint32_t fi=0; fi < LENGTHOF(func_table); fi++) {
float r = func_table[fi].f(x);
double e = fabs(d-(double)r) * 0x1.0p24;
//if (e == 0.f) { error[fi].ulp[0]++; continue; };
uint32_t ulp = f32_ulp_dist_ss(cr,r);
if (ulp > error[fi].max) { error[fi].max = ulp; }
if (e > error[fi].abs) { error[fi].abs = e; }
if (ulp > 3) ulp = 3;
error[fi].ulp[ulp]++;
}
}
error_to_totals(error);
error_dump_i(error);
}
//********************************************************
// validating double promotion method is correctly rounded
#if defined(NEEDS_MPFR)
#include <mpfr.h>
#if defined(VALIDATE_CR)
// worker for vaidate_double_promote
void validate_cr_i(float s, float e)
{
mpfr_t mx,mr;
uint32_t xs = f32_to_bits(s);
uint32_t xe = f32_to_bits(e);
mpfr_init2(mx, 24);
mpfr_init2(mr, 24);
for(uint32_t xi=xs; xi<=xe; xi++) {
float x = f32_from_bits(xi);
// compute correctly rounded
mpfr_set_flt(mx, x, MPFR_RNDN);
mpfr_rec_sqrt(mr, mx, MPFR_RNDN);
float cr = mpfr_get_flt(mr, MPFR_RNDN);
float r = f32_rsqrt_cr(x);
if (r == cr) continue;
printf(" ERROR: not correctly rounded at: x=%a %a got %a\n", x, cr, r);
xe = xi-1;
break;
}
mpfr_clear(mr);
mpfr_clear(mx);
}
// double promotion is correctly rounded for all normal
// inputs.
void vaidate_double_promote(void)
{
#if defined(VALIDATE_CR_ALL)
printf("starting: validate f32_sqrt_cr: should take around 10 mins and I'm not going to give you any visual feedback of progress.\n");
validate_cr_i(0, f32_max_normal);
#else
printf("starting: validate f32_sqrt_cr: (fast/non-exhastive)\n");
printf(" denormals\n");
validate_cr_i(0, f32_min_normal);
printf(" bottom of normal range\n");
validate_cr_i(f32_min_normal, f32_min_normal*4.f);
printf(" top of normal range\n");
validate_cr_i(f32_max_normal/4.f, f32_max_normal);
printf(" around 1.0\n");
validate_cr_i(1.f/4.f, 4.f);
#endif
}
#endif
//********************************************************
#endif
int main(void)
{
#if defined(VALIDATE_CR)
vaidate_double_promote();
#endif
printf("<details markdown=\"1\"><summary>click for range breakdown</summary>\n\n");
#if defined(EXHAUSTIVE_TEST)
printf("fill-in again\n");
#else
test_range(f32_to_bits(1.f),f32_to_bits(2.f));
test_range(f32_to_bits(2.f),f32_to_bits(4.f));
#endif
printf("\n</details>\n\n");
error_dump();
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment