Marc-B-Reynolds/output.md

## output.md

      
    Raw
  

              output.md
            
          
    click for range breakdown
checking on [3f800000,40000000] [1.000000e+00,2.000000e+00]


func
e
max ULP
CR
FR
2 ULP
> 2 ULP
CR%
FR%
2 ULP%
> 2 ULP%


vrsqrte_f32
--
4947968
103
225
216
8388065
0.001228
0.002682
0.002575
99.993515


FRSR_Mon0
--
564177
3
8
6
8388592
0.000036
0.000095
0.000072
99.999797


FRSR_Deg0
--
403258
0
0
0
8388609
0.000000
0.000000
0.000000
100.000000


FRSR_Mon1
--
14751
230
464
466
8387449
0.002742
0.005531
0.005555
99.986172


FRSR_Deg1
--
10215
321
602
609
8387077
0.003827
0.007176
0.007260
99.981737


FRSR_Deg1Alt
--
10219
305
602
589
8387113
0.003636
0.007176
0.007021
99.982166


rsqrt_sse
--
4980
1916
3792
3835
8379066
0.022840
0.045204
0.045717
99.886239


rsqrt_msh
--
1169
3067
6192
6203
8373147
0.036561
0.073814
0.073946
99.815679


rsqrt_day
--
317
9560
19141
19109
8340799
0.113964
0.228178
0.227797
99.430060


rsqrt_wmc
--
11
459691
998354
1126095
5804469
5.479943
11.901306
13.424097
69.194654


rsqrt_sse_r2
2.7195733655
3
6342724
1974273
71500
112
75.611153
23.535165
0.852346
0.001335


rsqrt_sse_r1
2.7000876479
3
6502216
1829513
56816
64
77.512446
21.809492
0.677299
0.000763


rsqrt_day_r1
0.8535512891
1
7109620
1278989
0
0
84.753265
15.246735
0.000000
0.000000


f32_rsqrt
0.8499174770
1
7109638
1278971
0
0
84.753479
15.246521
0.000000
0.000000


rsqrt_day_r2
0.5094854385
1
8353865
34744
0
0
99.585819
0.414181
0.000000
0.000000


rsqrt_wmc_r1
0.8518467862
1
7110734
1277875
0
0
84.766545
15.233455
0.000000
0.000000


rsqrt_sse_hm
0.5011967756
1
8387934
675
0
0
99.991953
0.008047
0.000000
0.000000


rsqrt_day_hm
0.5000192411
1
8388567
42
0
0
99.999499
0.000501
0.000000
0.000000


rsqrt_wmc_r2
0.5000088997
1
8388575
34
0
0
99.999595
0.000405
0.000000
0.000000


rsqrt_wmc_hm
0.5000000857
1
8388608
1
0
0
99.999988
0.000012
0.000000
0.000000


rsqrt_cb
0.4999999218
0
8388609
0
0
0
100.000000
0.000000
0.000000
0.000000


rsqrt_day_cr
0.4999999218
0
8388609
0
0
0
100.000000
0.000000
0.000000
0.000000


rsqrt_sse_cr
0.4999999218
0
8388609
0
0
0
100.000000
0.000000
0.000000
0.000000


checking on [40000000,40800000] [2.000000e+00,4.000000e+00]


func
e
max ULP
CR
FR
2 ULP
> 2 ULP
CR%
FR%
2 ULP%
> 2 ULP%


vrsqrte_f32
--
4947968
115
251
252
8387991
0.001371
0.002992
0.003004
99.992633


FRSR_Mon0
--
564177
6
12
11
8388580
0.000072
0.000143
0.000131
99.999654


FRSR_Deg0
--
286518
11
26
24
8388548
0.000131
0.000310
0.000286
99.999273


FRSR_Mon1
--
9631
85
129
167
8388228
0.001013
0.001538
0.001991
99.995458


FRSR_Deg1
--
7339
409
644
745
8386811
0.004876
0.007677
0.008881
99.978566


FRSR_Deg1Alt
--
7338
362
716
723
8386808
0.004315
0.008535
0.008619
99.978530


rsqrt_sse
--
4096
2692
5396
5376
8375145
0.032091
0.064325
0.064087
99.839497


rsqrt_msh
--
827
4366
8757
8701
8366785
0.052047
0.104392
0.103724
99.739838


rsqrt_day
--
213
86538
175997
184719
7941355
1.031613
2.098047
2.202022
94.668317


rsqrt_wmc
8.0949323997
8
399307
816037
899005
6274260
4.760110
9.727918
10.716973
74.794999


rsqrt_sse_r2
2.3655200340
2
6681325
1680687
26597
0
79.647591
20.035348
0.317061
0.000000


rsqrt_sse_r1
2.3163667079
2
6717604
1652745
18260
0
80.080070
19.702253
0.217676
0.000000


rsqrt_day_r1
0.7492158674
1
7484765
903844
0
0
89.225341
10.774659
0.000000
0.000000


f32_rsqrt
0.7497656401
1
7485042
903567
0
0
89.228643
10.771357
0.000000
0.000000


rsqrt_day_r2
0.5062385015
1
8374395
14214
0
0
99.830556
0.169444
0.000000
0.000000


rsqrt_wmc_r1
0.7494520154
1
7485268
903341
0
0
89.231337
10.768663
0.000000
0.000000


rsqrt_sse_hm
0.5008518882
1
8388060
549
0
0
99.993455
0.006545
0.000000
0.000000


rsqrt_day_hm
0.5000102632
1
8388582
27
0
0
99.999678
0.000322
0.000000
0.000000


rsqrt_wmc_r2
0.5000051744
1
8388583
26
0
0
99.999690
0.000310
0.000000
0.000000


rsqrt_wmc_hm
0.5000000447
1
8388607
2
0
0
99.999976
0.000024
0.000000
0.000000


rsqrt_cb
0.4999999981
0
8388609
0
0
0
100.000000
0.000000
0.000000
0.000000


rsqrt_day_cr
0.4999999981
0
8388609
0
0
0
100.000000
0.000000
0.000000
0.000000


rsqrt_sse_cr
0.4999999981
0
8388609
0
0
0
100.000000
0.000000
0.000000
0.000000


TOTAL:


func
e
max ULP
CR
FR
2 ULP
> 2 ULP
CR%
FR%
2 ULP%
> 2 ULP%


vrsqrte_f32
--
4947968
218
476
468
16776056
0.001299
0.002837
0.002789
99.993074


FRSR_Mon0
--
564177
9
20
17
16777172
0.000054
0.000119
0.000101
99.999726


FRSR_Deg0
--
403258
11
26
24
16777157
0.000066
0.000155
0.000143
99.999636


FRSR_Mon1
--
14751
315
593
633
16775677
0.001878
0.003535
0.003773
99.990815


FRSR_Deg1
--
10215
730
1246
1354
16773888
0.004351
0.007427
0.008070
99.980152


FRSR_Deg1Alt
--
10219
667
1318
1312
16773921
0.003976
0.007856
0.007820
99.980348


rsqrt_sse
--
4980
4608
9188
9211
16754211
0.027466
0.054765
0.054902
99.862868


rsqrt_msh
--
1169
7433
14949
14904
16739932
0.044304
0.089103
0.088835
99.777758


rsqrt_day
--
317
96098
195138
203828
16282154
0.572789
1.163113
1.214909
97.049189


rsqrt_wmc
--
11
858998
1814391
2025100
12078729
5.120026
10.814612
12.070535
71.994827


rsqrt_sse_r2
2.7195733655
3
13024049
3654960
98097
112
77.629372
21.785257
0.584704
0.000668


rsqrt_sse_r1
2.7000876479
3
13219820
3482258
75076
64
78.796258
20.755873
0.447488
0.000381


rsqrt_day_r1
0.8535512891
1
14594385
2182833
0
0
86.989303
13.010697
0.000000
0.000000


f32_rsqrt
0.8499174770
1
14594680
2182538
0
0
86.991061
13.008939
0.000000
0.000000


rsqrt_day_r2
0.5094854385
1
16728260
48958
0
0
99.708188
0.291812
0.000000
0.000000


rsqrt_wmc_r1
0.8518467862
1
14596002
2181216
0
0
86.998941
13.001059
0.000000
0.000000


rsqrt_sse_hm
0.5011967756
1
16775994
1224
0
0
99.992704
0.007296
0.000000
0.000000


rsqrt_day_hm
0.5000192411
1
16777149
69
0
0
99.999589
0.000411
0.000000
0.000000


rsqrt_wmc_r2
0.5000088997
1
16777158
60
0
0
99.999642
0.000358
0.000000
0.000000


rsqrt_wmc_hm
0.5000000857
1
16777215
3
0
0
99.999982
0.000018
0.000000
0.000000


rsqrt_cb
0.4999999981
0
16777218
0
0
0
100.000000
0.000000
0.000000
0.000000


rsqrt_day_cr
0.4999999981
0
16777218
0
0
0
100.000000
0.000000
0.000000
0.000000


rsqrt_sse_cr
0.4999999981
0
16777218
0
0
0
100.000000
0.000000
0.000000
0.000000


## rsqrt.c
// Public Domain under http://unlicense.org, see link for details.

// compile with (or equiv):
// clang -O3 -march=native -Wall -Wextra -Wconversion -Wpedantic -Wno-unused-function -fno-math-errno rsqrt.c -o rsqrt -lm

// VALIDATE_CR options additionally requires MPFR: -lmpfr

// This is intended to be a hackable testbed. Only a subset of
// included routines are ran as is (those that seem potentially
// interesting to me). Also note that a number of implementations
// differ from those in the original papers.


//********************************************************
// CONFIG STUFF HERE

// not really exhaustive (add notes). actually I removed this
// because I'm too lazy to check the domains of the inputs.
// (general slightly smaller than all normal inputs). yeah
// this is useful information.
//#define EXHAUSTIVE_TEST

// following options requires: MPFR installed (https://www.mpfr.org/)
// if defined.

// validate correct rounding of double promotion method (for normal
// and denormal input)
//#define VALIDATE_CR

// if defined (with VALIDATE_CR) then tests all (about 10 minutes)
//#define VALIDATE_CR_ALL

// END OF CONFIG
//********************************************************


#if defined(VALIDATE_CR)
#define NEEDS_MPFR
#endif

#include <stdint.h>
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <string.h>
#include <assert.h>


//********************************************************

const float f32_min_normal   = 0x1.0p-126f;
const float f32_max_normal   = 0x1.fffffep+127f;

static inline uint32_t f32_to_bits(float x)
{
  uint32_t u; memcpy(&u,&x,4); return u;
}

inline float f32_from_bits(uint32_t x)
{
  float u; memcpy(&u,&x,4); return u;
}

static inline uint64_t f64_to_bits(double x)
{
  uint64_t u; memcpy(&u, &x, 8); return u;
}

static inline double f64_from_bits(uint64_t x)
{
  double f; memcpy(&f, &x, 8); return f;
}

// JUST SAY NO TO MATH ERRNO

// to cut some of the pain of math errno not being disabled
// (-fno-math-errno). But you really should do that. Unless
// your using visual-c: Then I feel your pain and I can't
// think of anything better to do than this. (and nag
// MS to make a work around)
inline float f32_sqrt(float a)
{
#if defined(__NO_MATH_ERRNO__) || defined(_M_FP_FAST)
  return sqrtf(a);
#elif defined(F32_INTEL)
  return F32_SSE_WRAP(_mm_sqrt_ss, a);
#elif defined(__ARM_ARCH)
  return __sqrtf(a);
#else
  // do nothing for warning ATM
#endif
}

//********************************************************

// faithfully rounded.
static inline float f32_rsqrt(float x) { return f32_sqrt(1.f/x); }

// promote/demote double is correctly rounded
static inline float f32_rsqrt_cr(float x)
{
  return (float)sqrt(1.0/(double)x);
}

// DEAD
// dep-chain len: 2
// correctly:   74.013561% (12417415)
// faithfully:  25.986439% (4359801)
// uiCA predicted throughput: 23.00
float rsqrt_n1(float x) { return f32_sqrt(x)/x; }

// DEAD
// dep-chain len: 2
// correctly:   73.995733% (12414424)
// faithfully:  26.004267% (4362792)
// uiCA predicted throughput: 23.00
float rsqrt_n2(float x) { return 1.f/f32_sqrt(x); }

// MOSTLY DEAD
// NOTE: if sqrt(x) and 1/x "can" be computed in parallel
// then this becomes potentially interesting. This is
// not the case in current mainstream CPUs.

// dep-chain len: 2
// correctly:   64.843583% (10878948)
// faithfully:  34.837395% (5844745)
// 2 ulp:        0.319022% (53523)
float rsqrt_n3(float x) { return f32_sqrt(x)*(1.f/x); }

//********************************************************

// Newton-Raphson step for 1/sqrt(x) using FMA when
// we haven't computed (or approximated) 1/x
//   y = estimated in (y_n)
//   h = -x/2
// { don't use see: XX}
inline float f32_rsqrt_nr_step_common(float y, float h)
{
  return y*fmaf(h,y*y, 1.5f);
}

// rename
static inline float f32_rsqrt_nr_fma_step(float y, float h)
{
  float r = y*y;
  float v = fmaf(h,r, 0.5f);
  return fmaf(y,v,y);
}

// Newton-Raphson step for 1/sqrt(x) using FMA when
//   y = estimate in (y_n)
//   h = -x/2
//   r = ~1/x
// RENAME AND NO
inline float f32_rsqrt_nr_step_cb(float y, float h, float r)
{
  float s = fmaf(h,r,0.5f);
  float t = fmaf(y,y, -r);
  float v = fmaf(h,t,  s);
  return fmaf(y,v,y);
}

// Halley method step for 1/sqrt(x)
//   y = estimate in (y_n)
//   h = -x/2
//   r = ~1/x
inline float f32_rsqrt_hm_step(float y, float h, float r)
{
  float s = fmaf(r,h,0.5f);
  float t = fmaf(y,y,-r);
  float v = fmaf(h,t, s);
  float w = fmaf(1.5f*v,v,v);
  return fmaf(y,w,y);
}

//********************************************************
// expansion helpers

static inline float rsqrt_r1_apply(float (*f)(float), float x)
{
  float y = f(x);
  return f32_rsqrt_nr_fma_step(y, -0.5f*x);
}

static inline float rsqrt_r2_apply(float (*f)(float), float x)
{
  float y = f(x);
  return f32_rsqrt_nr_step_cb(y, -0.5f*x, y*y);
}

static inline float rsqrt_hm_apply(float (*f)(float), float x)
{
  float y = f(x);
  return f32_rsqrt_hm_step(y, -0.5f*x, y*y);
}

//********************************************************

// Portable versions by Carlos F. Borges

// "Fast Compensated Algorithms for the Reciprocal Square Root, the Reciprocal Hypotenuse, and Givens Rotations", Carlos F. Borges, 2021
// https://arxiv.org/abs/2103.08694
// --
// additional analysis:
// "High-level algorithms for correctly-rounded reciprocal square roots", Borges, Jeannerod & Muller, 2022
// https://hal.inria.fr/hal-03728088

// dep-chain len: 5
// correctly:   99.999994% (16777215)
// faithfully:   0.000006% (1)
// Borges Algorithm 2 (rsqrtNewton in Borges/Jeannerod/Muller)
// uiCA: https://bit.ly/3OkNAk5
//   predicted throughput: 35.00
float rsqrt_hq(float x)
{
  float r = 1.f/x;
  float h = -0.5f*x;

  x = f32_sqrt(r);
  x = f32_rsqrt_nr_step_cb(x,h,r);

  return x;
}

// correctly rounded (in round to even/ties to nearest)
// on [0x1p-125, 0x1.fffffcp+127] (almost all normal numbers)
// Borges Algorithm 3 (rsqrtHalley in Borges/Jeannerod/Muller)
// uiCA: https://bit.ly/3AzVQXJ
//   predicted throughput: 40.00
float rsqrt_cb(float x)
{
  float r = 1.f/x;
  float h = -0.5f*x;

  x = f32_sqrt(r);
  x = f32_rsqrt_hm_step(x,h,r);

  return x;
}


//********************************************************
// so-call "bithack" initial steps

static inline float f32_rsqrt_initial(float x, uint32_t K)
{
  return f32_from_bits(K - (f32_to_bits(x) >> 1));
}

// Day observes there's a second choice which gives some
// extra wiggle room in optimizing the "magic" K
static inline float f32_rsqrt_initial_md(float x, uint32_t K)
{
  return f32_from_bits((K-f32_to_bits(x)) >> 1);
}

//********************************************************

// Mike Day core routines as in paper. Without star is paper
// and contractions are disabled. Starred versions are explict FMAs
// where version choosen uses the greedy measure to minimize
// the counts of >2 ulp results.
// |         func |max ULP|    CR|     FR|  2 ULP|  > 2 ULP|
// |         ---: |   ---:|  ---:|   ---:|   ---:|     ---:|
// |    FRSR_Mon0 | 564177|    12|     28|     23| 25165761|
// |    FRSR_Deg0 | 403258|    11|     26|     24| 25165763|
// |    FRSR_Mon1 |  14751|   557|   1070|   1063| 25163134|
// |    FRSR_Mon1*|  14751|   545|   1057|   1099| 25163123|
// |    FRSR_Deg1 |  10215|   941|   1802|   1939| 25161142|
// |    FRSR_Deg1*|  10215|  1051|   1848|   1963| 25160962|
// | FRSR_Deg1Alt |  10219|   874|   1858|   2033| 25161059|
// | FRSR_Deg1Alt*|  10219|   972|   1920|   1901| 25161031|
// |    FRSR_Mon2 |    317|105658| 214279| 222937| 24622950|
// |    FRSR_Mon2*|    317|105658| 214279| 222937| 24622950|

// listing 2 (FRSR_Mon0)
// max-ulp: 564177
float FRSR_Mon0(float x)
{
  return f32_rsqrt_initial(x, 0x5F37642F);
}

// listing 3 (FRSR_Deg0)
// max-ulp: 403258
float FRSR_Deg0(float x)
{
  return 0.79247999f * f32_rsqrt_initial_md(x, 0xBEBFFDAA);
}

// listing 4 (FRSR_Mon1)
// max-ulp: 14751
// |     CR|    FR |  2 ULP| > 2 ULP |
// |    557|   1070|   1063| 25163134| 1
// |    497|   1094|   1097| 25163136| 2
// |    545|   1057|   1099| 25163123| 3
// 3 fma/mul
float FRSR_Mon1(float x)
{
  float y = f32_rsqrt_initial_md(x, 0xBE167122);

//return y * (1.8909901f - x*y*y);       // 1
//return y * fmaf(-x, y*y, 1.8909901f);  // 2
  return y * fmaf(-y, x*y, 1.8909901f);  // 3
}

// listing 5 (FRSR_Deg1)
// max-ulp: 10215
// |     CR|    FR |  2 ULP| > 2 ULP |
// |    941|   1802|   1939| 25161142| 1
// |    992|   1907|   1882| 25161043| 2
// |   1051|   1848|   1963| 25160962| 3
// |    976|   1907|   1894| 25161047| 4
// 4 fma/mul
float FRSR_Deg1(float x)
{
  static const float c0 =  1.1893165f;
  static const float c1 = -0.24889956f;

  float y = f32_rsqrt_initial(x, 0x5F5FFF00);

//return y * (c0 + x*y*y*c1);            // 1
//return y * fmaf(x, y*y*c1, c0);        // 2
  return y * fmaf(x*y, y*c1, c0);        // 3
//return y * fmaf(x*c1,y*y,  c0);        // 4
}

// listing 6 (FRSR_Deg1Alt)
// max-ulp: 10219
// |     CR|    FR |  2 ULP| > 2 ULP |
// |    874|   1858|   2033| 25161059| 1
// |    972|   1920|   1901| 25161031| 2
// |    950|   1991|   1908| 25160975| 3
// |    972|   1920|   1901| 25161031| 4
// 4 fma/mul
float FRSR_Deg1Alt(float x)
{
  static const float c0 =  1.1891762f;
  static const float c1 = -0.24881148f;

  float y = f32_rsqrt_initial(x, 0x5F6004CC);

//return y * (c0 + y*y*x*c1);            // 1
//return y * fmaf(y*y, x*c1, c0);        // 2
//return y * fmaf(x*y, y*c1, c0);        // 3
  return y * fmaf(x*c1,y*y,  c0);        // 4
}

// listing 7 (FRSR_Mon2)
// max-ulp: 317
// |     CR|    FR |  2 ULP| > 2 ULP |
// | 105658| 214279| 222937| 24622950| 1
// | 105658| 214279| 222937| 24622950| 2
// | 106447| 213978| 221946| 24623453| 3
// 4 fma/mul, 1 add
float rsqrt_day(float x)
{
  static const float c0 =  2.2825186f;
  static const float c1 = -2.253305f;

  float y = f32_rsqrt_initial(x, 0x5F11107D);
  float z = x*y*y;

//return y * (c0 + z *(z+c1));           // 1
  return y * fmaf(z, z+c1, c0);          // 2
//return y * fmaf(z,z,fmaf(z,c1,c0));    // 3
}

// max-ulp:       2
// correctly:    70.604507% (17768206)
// faithfully:   29.345020% (7384916)
// 2 ulp:         0.050473% (12702)
// 8 fma/mul, 1 add
float rsqrt_day_r1(float x) { return rsqrt_r1_apply(&rsqrt_day, x); }

// max-ulp:      1
// correctly:   99.667398% (25082122)
// faithfully:   0.332602% (83702)
// 10 fma/mul, 1 add
float rsqrt_day_r2(float x) { return rsqrt_r2_apply(&rsqrt_day, x); }

// max-ulp:      1
// correctly:   99.999559% (25165713)
// faithfully:   0.000441% (111)
// 12 fma/mul, 1 add
float rsqrt_day_hm(float x) { return rsqrt_hm_apply(&rsqrt_day, x); }

float rsqrt_day_cr(float x)
{
  float y = rsqrt_day(x);
  float h = -0.5f*x;
  y = f32_rsqrt_hm_step(y, h, y*y);
//y = f32_rsqrt_nr_fma_step(y, -0.5f*x);  // no good with this step
  y = f32_rsqrt_nr_step_cb(y, h, y*y);    // need this
  return y;
}

//*********************************************************************

// Walczyk, Moroz, Cieslinski, Cezary:
// "InvSqrt2" - modified to use FMA
//    no constant refinement
//  max-ulp:      11
//  correctly:    5.120015% (858996)
//  faithfully:  11.176844% (1814391)
//   2 ulp:      12.521724% (3151195)
//   >2 ulp:     71.061440% (17883197)
// 7 fma/mul
float rsqrt_wmc(float x)
{
  float h = -0.5f*x;

  x = f32_rsqrt_initial(x, 0x5F376908);
  x = x*fmaf(h,x*x, 1.5008789f);
  x = x*fmaf(h,x*x, 1.5000006f);
  return x;
}

float rsqrt_wmc_r1(float x) { return rsqrt_r1_apply(&rsqrt_wmc, x); }
float rsqrt_wmc_r2(float x) { return rsqrt_r2_apply(&rsqrt_wmc, x); }
float rsqrt_wmc_hm(float x) { return rsqrt_hm_apply(&rsqrt_wmc, x); }

//*********************************************************************
// "Modified Fast Inverse Square Root and Square Root Approximation
//  Algorithms: The Method of Switching Magic Constants"
//  Moroz, Volodymyr, Samotyy and Horyachyy, 2021

float rsqrt_msh(float x)
{
  static const uint32_t R[2] = {0x5f19e8b7, 0x5ed9e8b7};
  static const float    A[2] = {2.1499474f, 1.0749737f};
  static const float    M[2] = {0.824218631f, 2.33124256f};

  uint32_t id = (f32_to_bits(x) & 0x00800000) == 0 ? 0 : 1;

  float y;

  y = f32_rsqrt_initial(x, R[id]);
  y = M[id]*y*fmaf(-x, y*y, A[id]);

  return y;
}

float rsqrt_msh_r1(float x) { return rsqrt_r1_apply(&rsqrt_msh, x); }
float rsqrt_msh_r2(float x) { return rsqrt_r2_apply(&rsqrt_msh, x); }
float rsqrt_msh_hm(float x) { return rsqrt_hm_apply(&rsqrt_msh, x); }

//*********************************************************************
// original SSE approximation

#include <x86intrin.h>

// return hardware 1/sqrt(x) approx
static inline float rsqrt_sse(float x)
{
  return _mm_cvtss_f32(_mm_rsqrt_ss(_mm_set_ss(x)));
}

// 4 fma/mul
float rsqrt_sse_r1(float x)
{
  float y = rsqrt_sse(x);
  return f32_rsqrt_nr_fma_step(y, -0.5f*x);
}

// 8 fma/mul
float rsqrt_sse_r2(float x)
{
  float y = rsqrt_sse(x);
  return f32_rsqrt_nr_step_cb(y, -0.5f*x, y*y);
}

// 8 fma/mul
float rsqrt_sse_hm(float x)
{
  float y = rsqrt_sse(x);
  return f32_rsqrt_hm_step(y, -0.5f*x, y*y);
}

// 13 fma/mul
float rsqrt_sse_cr(float x)
{
  float y = rsqrt_sse(x);
  float h = -0.5f*x;
  y = f32_rsqrt_hm_step(y, h, y*y);
  y = f32_rsqrt_nr_step_cb(y, h, y*y);
  return y;
}

//*********************************************************************
// Use hardware approximation: neon

// emulate neon approximation ops (std & not increased precision ones)

// ignore all special cases (including 0): don't
// need them for local testing. written to be eyeball-able to spec
float vrecpe_f32(float x)
{
  // extract the parts
  uint32_t ix = f32_to_bits(x);
  uint32_t sx = ((ix >> 31) & 0x01);
  uint32_t ex = ((ix >> 23) & 0xff);
  uint32_t mx = ((ix      ) & 0x7fffff);
  uint32_t t;

  mx = 0x100|(mx >> 15);    // add implied bit and top 8 explict
  ex = 253-ex;              // adjust the exp

  // SEE: ARM Spec - FPRecipEstimate
  t  = (mx<<1)+1;
  t  = (((1<<19)/t)+1)>>1;
  mx = (t & 0xff) << 15;

  // concat the parts back together
  ix  = (sx & 0x01) << 31;
  ix |= (ex & 0xff) << 23;
  ix |= (mx & 0x7fffff);

  return f32_from_bits(ix);
}

// max-ulp:      4947968
// correctly:    0.001293% (217)
// faithfully:   0.002837% (476)
// written to be eyeball-able to spec
float  vrsqrte_f32(float x)
{
  // yeah: lots of reductions are possible & don't care
  uint32_t ix = f32_to_bits(x);
  uint32_t ex = ((ix >> 23) & 0xff);
  uint32_t mx = ((ix      ) & 0x7fffff);
  uint32_t s = (mx & 1);

  mx >>= (15+s);
  mx  |= 1 << (8-s);

  if (mx < 256)
    mx = mx * 2 + 1;
  else
    mx = (mx|1) << 1;

  uint32_t t = 512;

  while (mx * (t+1)*(t+1) < (1<<28)) { t++; }

  mx = (t+1)/2;

  ex  = (380-ex)/2;
  ix  = (ex & 0xff) << 23;
  ix |= (mx & 0xff) << 15;

  return f32_from_bits(ix);
}

float vrsqrts_f32(float a, float b)
{
  return -0.5f*fmaf(a,b,-3.f);
}

float vrecps_f32(float a, float b)
{
  return -fmaf(a,b,-2.f);
}


//********************************************************
// start of testing related stuff
//********************************************************

static inline uint32_t u32_abs(uint32_t x)
{
  return (int32_t)x >= 0 ? x : -x;
}

// ulp distance provided a & b are finite
// and have the same sign
static inline uint32_t f32_ulp_dist_ss(float a, float b)
{
  uint32_t ua = f32_to_bits(a);
  uint32_t ub = f32_to_bits(b);
  return u32_abs(ua-ub);
}

static inline uint64_t u64_abs(uint64_t x)
{
  return (int64_t)x >= 0 ? x : -x;
}

// ulp distance provided a & b are finite
// and have the same sign
static inline uint64_t f64_ulp_dist_ss(double a, double b)
{
  uint64_t ua = f64_to_bits(a);
  uint64_t ub = f64_to_bits(b);
  return u64_abs(ua-ub);
}


typedef struct {
  float (*f)(float);
  char*    name;
  char*    notes;
  uint32_t flags;
  uint32_t type;
} func_entry_t;

#define LENGTHOF(X) (sizeof(X)/sizeof(X[0]))
#define STRINGIFY(S)  STRINGIFY_(S)
#define STRINGIFY_(S) #S
#define ENTRY(X) { .f=&X, .name=STRINGIFY(X) }

func_entry_t func_table[] =
{
#if 1
  ENTRY(vrsqrte_f32),
  ENTRY(FRSR_Mon0),
  ENTRY(FRSR_Deg0),
  ENTRY(FRSR_Mon1),
  ENTRY(FRSR_Deg1),
  ENTRY(FRSR_Deg1Alt),
  ENTRY(rsqrt_sse),
  ENTRY(rsqrt_msh),
  ENTRY(rsqrt_day),
  ENTRY(rsqrt_wmc),
  ENTRY(rsqrt_sse_r2),
  ENTRY(rsqrt_sse_r1),
  ENTRY(rsqrt_day_r1),
  ENTRY(f32_rsqrt),
  ENTRY(rsqrt_day_r2),
  ENTRY(rsqrt_wmc_r1),
  ENTRY(rsqrt_sse_hm),
  ENTRY(rsqrt_day_hm),
  ENTRY(rsqrt_wmc_r2),
  ENTRY(rsqrt_wmc_hm),
  //ENTRY(f32_rsqrt_bar),
  // find ranges of these
  ENTRY(rsqrt_cb),
  ENTRY(rsqrt_day_cr),
  ENTRY(rsqrt_sse_cr),
#else
  // hack for testing one
#endif
};


typedef struct {
  double   abs;
  uint32_t max;
  uint32_t ulp[4];
} func_error_t;

// tracks total error data across multiple intervals
func_error_t func_error[LENGTHOF(func_table)] = {{0}};


// add error data from current interval to the totals
void error_to_totals(func_error_t* e)
{
  for(uint32_t fi=0; fi < LENGTHOF(func_table); fi++, e++) {
    func_error[fi].ulp[0] += e->ulp[0];
    func_error[fi].ulp[1] += e->ulp[1];
    func_error[fi].ulp[2] += e->ulp[2];
    func_error[fi].ulp[3] += e->ulp[3];

    if (e->max > func_error[fi].max) func_error[fi].max = e->max;
    if (e->abs > func_error[fi].abs) func_error[fi].abs = e->abs;
  }
}

void error_dump_i(func_error_t* e)
{
  printf("|%15s|%12s|%10s|%10s|%10s|%10s|%10s|%10s|%10s|%10s|%10s|\n",
	 "func", "e", "max ULP", "CR", "FR", "2 ULP", "> 2 ULP",
	 "CR%", "FR%", "2 ULP%","> 2 ULP%");

  printf("|%15s|%12s|%10s|%10s|%10s|%10s|%10s|%10s|%10s|%10s|%10s|\n",
	 "---:", ":---:", "---:", "---:", "---:", "---:",
	 "---:", "---:", "---:","---:", "---:");

  for(uint32_t fi=0; fi < LENGTHOF(func_table); fi++, e++) {
    uint32_t u0 = e->ulp[0];
    uint32_t u1 = e->ulp[1];
    uint32_t u2 = e->ulp[2];
    uint32_t u3 = e->ulp[3];
    uint32_t t  = (u0+u1+u2+u3);
    double   s  = 100.0/(double)t;

    if (e->abs < 9.0)
      printf("|%15s|%12.10f|%10u|%10u|%10u|%10u|%10u|%10f|%10f|%10f|%10f|\n",
	     func_table[fi].name, e->abs, e->max,
	     u0, u1, u2, u3,
	     s*u0, s*u1, s*u2, s*u3
	     );
    else
      printf("|%15s|     --     |%10u|%10u|%10u|%10u|%10u|%10f|%10f|%10f|%10f|\n",
	     func_table[fi].name, e->max,
	     u0, u1, u2, u3,
	     s*u0, s*u1, s*u2, s*u3
	     );

  }
}

void error_dump(void)
{
  printf("\nTOTAL:\n");
  error_dump_i(func_error);
}

void test_range(uint32_t x0, uint32_t x1)
{
  float f0 = f32_from_bits(x0);
  float f1 = f32_from_bits(x1);

  printf("\nchecking on [%08x,%08x] [%e,%e]\n", x0,x1,f0,f1);

  func_error_t error[LENGTHOF(func_table)] = {{0}};

  for(uint32_t ix=x0; ix<=x1; ix++) {
    float  x  = f32_from_bits(ix);
    double d  = sqrt(1.0/(double)x);
    float  cr = (float)d;

    for(uint32_t fi=0; fi < LENGTHOF(func_table); fi++) {
      float  r = func_table[fi].f(x);
      double e = fabs(d-(double)r) * 0x1.0p24;

      //if (e == 0.f) { error[fi].ulp[0]++; continue; };

      uint32_t ulp = f32_ulp_dist_ss(cr,r);

      if (ulp > error[fi].max) { error[fi].max = ulp; }
      if (e   > error[fi].abs) { error[fi].abs = e;   }

      if (ulp > 3) ulp = 3;

      error[fi].ulp[ulp]++;
    }
  }

  error_to_totals(error);
  error_dump_i(error);
}

//********************************************************
// validating double promotion method is correctly rounded

#if defined(NEEDS_MPFR)
#include <mpfr.h>

#if defined(VALIDATE_CR)
// worker for vaidate_double_promote
void validate_cr_i(float s, float e)
{
  mpfr_t mx,mr;

  uint32_t xs = f32_to_bits(s);
  uint32_t xe = f32_to_bits(e);

  mpfr_init2(mx, 24);
  mpfr_init2(mr, 24);

  for(uint32_t xi=xs; xi<=xe; xi++) {
    float x  = f32_from_bits(xi);

    // compute correctly rounded
    mpfr_set_flt(mx, x,   MPFR_RNDN);
    mpfr_rec_sqrt(mr, mx, MPFR_RNDN);

    float cr = mpfr_get_flt(mr, MPFR_RNDN);
    float r  = f32_rsqrt_cr(x);

    if (r == cr) continue;

    printf("  ERROR: not correctly rounded at: x=%a %a got %a\n", x, cr, r);

    xe = xi-1;
    break;
  }

  mpfr_clear(mr);
  mpfr_clear(mx);
}

// double promotion is correctly rounded for all normal
// inputs.
void vaidate_double_promote(void)
{

#if defined(VALIDATE_CR_ALL)
  printf("starting: validate f32_sqrt_cr: should take around 10 mins and I'm not going to give you any visual feedback of progress.\n");
  validate_cr_i(0, f32_max_normal);
#else
  printf("starting: validate f32_sqrt_cr: (fast/non-exhastive)\n");
  printf("  denormals\n");
  validate_cr_i(0, f32_min_normal);
  printf("  bottom of normal range\n");
  validate_cr_i(f32_min_normal, f32_min_normal*4.f);
  printf("  top of normal range\n");
  validate_cr_i(f32_max_normal/4.f, f32_max_normal);
  printf("  around 1.0\n");
  validate_cr_i(1.f/4.f, 4.f);
#endif
}
#endif

//********************************************************


#endif


int main(void)
{
#if defined(VALIDATE_CR)
  vaidate_double_promote();
#endif

  printf("<details markdown=\"1\"><summary>click for range breakdown</summary>\n\n");

#if defined(EXHAUSTIVE_TEST)
  printf("fill-in again\n");
#else
  test_range(f32_to_bits(1.f),f32_to_bits(2.f));
  test_range(f32_to_bits(2.f),f32_to_bits(4.f));
#endif

  printf("\n</details>\n\n");
  error_dump();

  return 0;
}
func	e	max ULP	CR	FR	2 ULP	> 2 ULP	CR%	FR%	2 ULP%	> 2 ULP%
vrsqrte_f32	--	4947968	103	225	216	8388065	0.001228	0.002682	0.002575	99.993515
FRSR_Mon0	--	564177	3	8	6	8388592	0.000036	0.000095	0.000072	99.999797
FRSR_Deg0	--	403258	0	0	0	8388609	0.000000	0.000000	0.000000	100.000000
FRSR_Mon1	--	14751	230	464	466	8387449	0.002742	0.005531	0.005555	99.986172
FRSR_Deg1	--	10215	321	602	609	8387077	0.003827	0.007176	0.007260	99.981737
FRSR_Deg1Alt	--	10219	305	602	589	8387113	0.003636	0.007176	0.007021	99.982166
rsqrt_sse	--	4980	1916	3792	3835	8379066	0.022840	0.045204	0.045717	99.886239
rsqrt_msh	--	1169	3067	6192	6203	8373147	0.036561	0.073814	0.073946	99.815679
rsqrt_day	--	317	9560	19141	19109	8340799	0.113964	0.228178	0.227797	99.430060
rsqrt_wmc	--	11	459691	998354	1126095	5804469	5.479943	11.901306	13.424097	69.194654
rsqrt_sse_r2	2.7195733655	3	6342724	1974273	71500	112	75.611153	23.535165	0.852346	0.001335
rsqrt_sse_r1	2.7000876479	3	6502216	1829513	56816	64	77.512446	21.809492	0.677299	0.000763
rsqrt_day_r1	0.8535512891	1	7109620	1278989	0	0	84.753265	15.246735	0.000000	0.000000
f32_rsqrt	0.8499174770	1	7109638	1278971	0	0	84.753479	15.246521	0.000000	0.000000
rsqrt_day_r2	0.5094854385	1	8353865	34744	0	0	99.585819	0.414181	0.000000	0.000000
rsqrt_wmc_r1	0.8518467862	1	7110734	1277875	0	0	84.766545	15.233455	0.000000	0.000000
rsqrt_sse_hm	0.5011967756	1	8387934	675	0	0	99.991953	0.008047	0.000000	0.000000
rsqrt_day_hm	0.5000192411	1	8388567	42	0	0	99.999499	0.000501	0.000000	0.000000
rsqrt_wmc_r2	0.5000088997	1	8388575	34	0	0	99.999595	0.000405	0.000000	0.000000
rsqrt_wmc_hm	0.5000000857	1	8388608	1	0	0	99.999988	0.000012	0.000000	0.000000
rsqrt_cb	0.4999999218	0	8388609	0	0	0	100.000000	0.000000	0.000000	0.000000
rsqrt_day_cr	0.4999999218	0	8388609	0	0	0	100.000000	0.000000	0.000000	0.000000
rsqrt_sse_cr	0.4999999218	0	8388609	0	0	0	100.000000	0.000000	0.000000	0.000000
func	e	max ULP	CR	FR	2 ULP	> 2 ULP	CR%	FR%	2 ULP%	> 2 ULP%
vrsqrte_f32	--	4947968	115	251	252	8387991	0.001371	0.002992	0.003004	99.992633
FRSR_Mon0	--	564177	6	12	11	8388580	0.000072	0.000143	0.000131	99.999654
FRSR_Deg0	--	286518	11	26	24	8388548	0.000131	0.000310	0.000286	99.999273
FRSR_Mon1	--	9631	85	129	167	8388228	0.001013	0.001538	0.001991	99.995458
FRSR_Deg1	--	7339	409	644	745	8386811	0.004876	0.007677	0.008881	99.978566
FRSR_Deg1Alt	--	7338	362	716	723	8386808	0.004315	0.008535	0.008619	99.978530
rsqrt_sse	--	4096	2692	5396	5376	8375145	0.032091	0.064325	0.064087	99.839497
rsqrt_msh	--	827	4366	8757	8701	8366785	0.052047	0.104392	0.103724	99.739838
rsqrt_day	--	213	86538	175997	184719	7941355	1.031613	2.098047	2.202022	94.668317
rsqrt_wmc	8.0949323997	8	399307	816037	899005	6274260	4.760110	9.727918	10.716973	74.794999
rsqrt_sse_r2	2.3655200340	2	6681325	1680687	26597	0	79.647591	20.035348	0.317061	0.000000
rsqrt_sse_r1	2.3163667079	2	6717604	1652745	18260	0	80.080070	19.702253	0.217676	0.000000
rsqrt_day_r1	0.7492158674	1	7484765	903844	0	0	89.225341	10.774659	0.000000	0.000000
f32_rsqrt	0.7497656401	1	7485042	903567	0	0	89.228643	10.771357	0.000000	0.000000
rsqrt_day_r2	0.5062385015	1	8374395	14214	0	0	99.830556	0.169444	0.000000	0.000000
rsqrt_wmc_r1	0.7494520154	1	7485268	903341	0	0	89.231337	10.768663	0.000000	0.000000
rsqrt_sse_hm	0.5008518882	1	8388060	549	0	0	99.993455	0.006545	0.000000	0.000000
rsqrt_day_hm	0.5000102632	1	8388582	27	0	0	99.999678	0.000322	0.000000	0.000000
rsqrt_wmc_r2	0.5000051744	1	8388583	26	0	0	99.999690	0.000310	0.000000	0.000000
rsqrt_wmc_hm	0.5000000447	1	8388607	2	0	0	99.999976	0.000024	0.000000	0.000000
rsqrt_cb	0.4999999981	0	8388609	0	0	0	100.000000	0.000000	0.000000	0.000000
rsqrt_day_cr	0.4999999981	0	8388609	0	0	0	100.000000	0.000000	0.000000	0.000000
rsqrt_sse_cr	0.4999999981	0	8388609	0	0	0	100.000000	0.000000	0.000000	0.000000
func	e	max ULP	CR	FR	2 ULP	> 2 ULP	CR%	FR%	2 ULP%	> 2 ULP%
vrsqrte_f32	--	4947968	218	476	468	16776056	0.001299	0.002837	0.002789	99.993074
FRSR_Mon0	--	564177	9	20	17	16777172	0.000054	0.000119	0.000101	99.999726
FRSR_Deg0	--	403258	11	26	24	16777157	0.000066	0.000155	0.000143	99.999636
FRSR_Mon1	--	14751	315	593	633	16775677	0.001878	0.003535	0.003773	99.990815
FRSR_Deg1	--	10215	730	1246	1354	16773888	0.004351	0.007427	0.008070	99.980152
FRSR_Deg1Alt	--	10219	667	1318	1312	16773921	0.003976	0.007856	0.007820	99.980348
rsqrt_sse	--	4980	4608	9188	9211	16754211	0.027466	0.054765	0.054902	99.862868
rsqrt_msh	--	1169	7433	14949	14904	16739932	0.044304	0.089103	0.088835	99.777758
rsqrt_day	--	317	96098	195138	203828	16282154	0.572789	1.163113	1.214909	97.049189
rsqrt_wmc	--	11	858998	1814391	2025100	12078729	5.120026	10.814612	12.070535	71.994827
rsqrt_sse_r2	2.7195733655	3	13024049	3654960	98097	112	77.629372	21.785257	0.584704	0.000668
rsqrt_sse_r1	2.7000876479	3	13219820	3482258	75076	64	78.796258	20.755873	0.447488	0.000381
rsqrt_day_r1	0.8535512891	1	14594385	2182833	0	0	86.989303	13.010697	0.000000	0.000000
f32_rsqrt	0.8499174770	1	14594680	2182538	0	0	86.991061	13.008939	0.000000	0.000000
rsqrt_day_r2	0.5094854385	1	16728260	48958	0	0	99.708188	0.291812	0.000000	0.000000
rsqrt_wmc_r1	0.8518467862	1	14596002	2181216	0	0	86.998941	13.001059	0.000000	0.000000
rsqrt_sse_hm	0.5011967756	1	16775994	1224	0	0	99.992704	0.007296	0.000000	0.000000
rsqrt_day_hm	0.5000192411	1	16777149	69	0	0	99.999589	0.000411	0.000000	0.000000
rsqrt_wmc_r2	0.5000088997	1	16777158	60	0	0	99.999642	0.000358	0.000000	0.000000
rsqrt_wmc_hm	0.5000000857	1	16777215	3	0	0	99.999982	0.000018	0.000000	0.000000
rsqrt_cb	0.4999999981	0	16777218	0	0	0	100.000000	0.000000	0.000000	0.000000
rsqrt_day_cr	0.4999999981	0	16777218	0	0	0	100.000000	0.000000	0.000000	0.000000
rsqrt_sse_cr	0.4999999981	0	16777218	0	0	0	100.000000	0.000000	0.000000	0.000000
	// Public Domain under http://unlicense.org, see link for details.

	// compile with (or equiv):
	// clang -O3 -march=native -Wall -Wextra -Wconversion -Wpedantic -Wno-unused-function -fno-math-errno rsqrt.c -o rsqrt -lm

	// VALIDATE_CR options additionally requires MPFR: -lmpfr

	// This is intended to be a hackable testbed. Only a subset of
	// included routines are ran as is (those that seem potentially
	// interesting to me). Also note that a number of implementations
	// differ from those in the original papers.


	//********************************************************
	// CONFIG STUFF HERE

	// not really exhaustive (add notes). actually I removed this
	// because I'm too lazy to check the domains of the inputs.
	// (general slightly smaller than all normal inputs). yeah
	// this is useful information.
	//#define EXHAUSTIVE_TEST

	// following options requires: MPFR installed (https://www.mpfr.org/)
	// if defined.

	// validate correct rounding of double promotion method (for normal
	// and denormal input)
	//#define VALIDATE_CR

	// if defined (with VALIDATE_CR) then tests all (about 10 minutes)
	//#define VALIDATE_CR_ALL

	// END OF CONFIG
	//********************************************************


	#if defined(VALIDATE_CR)
	#define NEEDS_MPFR
	#endif

	#include <stdint.h>
	#include <stdlib.h>
	#include <stdio.h>
	#include <math.h>
	#include <string.h>
	#include <assert.h>


	//********************************************************

	const float f32_min_normal = 0x1.0p-126f;
	const float f32_max_normal = 0x1.fffffep+127f;

	static inline uint32_t f32_to_bits(float x)
	{
	uint32_t u; memcpy(&u,&x,4); return u;
	}

	inline float f32_from_bits(uint32_t x)
	{
	float u; memcpy(&u,&x,4); return u;
	}

	static inline uint64_t f64_to_bits(double x)
	{
	uint64_t u; memcpy(&u, &x, 8); return u;
	}

	static inline double f64_from_bits(uint64_t x)
	{
	double f; memcpy(&f, &x, 8); return f;
	}

	// JUST SAY NO TO MATH ERRNO

	// to cut some of the pain of math errno not being disabled
	// (-fno-math-errno). But you really should do that. Unless
	// your using visual-c: Then I feel your pain and I can't
	// think of anything better to do than this. (and nag
	// MS to make a work around)
	inline float f32_sqrt(float a)
	{
	#if defined(__NO_MATH_ERRNO__) \|\| defined(_M_FP_FAST)
	return sqrtf(a);
	#elif defined(F32_INTEL)
	return F32_SSE_WRAP(_mm_sqrt_ss, a);
	#elif defined(__ARM_ARCH)
	return __sqrtf(a);
	#else
	// do nothing for warning ATM
	#endif
	}

	//********************************************************

	// faithfully rounded.
	static inline float f32_rsqrt(float x) { return f32_sqrt(1.f/x); }

	// promote/demote double is correctly rounded
	static inline float f32_rsqrt_cr(float x)
	{
	return (float)sqrt(1.0/(double)x);
	}

	// DEAD
	// dep-chain len: 2
	// correctly: 74.013561% (12417415)
	// faithfully: 25.986439% (4359801)
	// uiCA predicted throughput: 23.00
	float rsqrt_n1(float x) { return f32_sqrt(x)/x; }

	// DEAD
	// dep-chain len: 2
	// correctly: 73.995733% (12414424)
	// faithfully: 26.004267% (4362792)
	// uiCA predicted throughput: 23.00
	float rsqrt_n2(float x) { return 1.f/f32_sqrt(x); }

	// MOSTLY DEAD
	// NOTE: if sqrt(x) and 1/x "can" be computed in parallel
	// then this becomes potentially interesting. This is
	// not the case in current mainstream CPUs.

	// dep-chain len: 2
	// correctly: 64.843583% (10878948)
	// faithfully: 34.837395% (5844745)
	// 2 ulp: 0.319022% (53523)
	float rsqrt_n3(float x) { return f32_sqrt(x)*(1.f/x); }

	//********************************************************

	// Newton-Raphson step for 1/sqrt(x) using FMA when
	// we haven't computed (or approximated) 1/x
	// y = estimated in (y_n)
	// h = -x/2
	// { don't use see: XX}
	inline float f32_rsqrt_nr_step_common(float y, float h)
	{
	return yfmaf(h,yy, 1.5f);
	}

	// rename
	static inline float f32_rsqrt_nr_fma_step(float y, float h)
	{
	float r = y*y;
	float v = fmaf(h,r, 0.5f);
	return fmaf(y,v,y);
	}

	// Newton-Raphson step for 1/sqrt(x) using FMA when
	// y = estimate in (y_n)
	// h = -x/2
	// r = ~1/x
	// RENAME AND NO
	inline float f32_rsqrt_nr_step_cb(float y, float h, float r)
	{
	float s = fmaf(h,r,0.5f);
	float t = fmaf(y,y, -r);
	float v = fmaf(h,t, s);
	return fmaf(y,v,y);
	}

	// Halley method step for 1/sqrt(x)
	// y = estimate in (y_n)
	// h = -x/2
	// r = ~1/x
	inline float f32_rsqrt_hm_step(float y, float h, float r)
	{
	float s = fmaf(r,h,0.5f);
	float t = fmaf(y,y,-r);
	float v = fmaf(h,t, s);
	float w = fmaf(1.5f*v,v,v);
	return fmaf(y,w,y);
	}

	//********************************************************
	// expansion helpers

	static inline float rsqrt_r1_apply(float (*f)(float), float x)
	{
	float y = f(x);
	return f32_rsqrt_nr_fma_step(y, -0.5f*x);
	}

	static inline float rsqrt_r2_apply(float (*f)(float), float x)
	{
	float y = f(x);
	return f32_rsqrt_nr_step_cb(y, -0.5fx, yy);
	}

	static inline float rsqrt_hm_apply(float (*f)(float), float x)
	{
	float y = f(x);
	return f32_rsqrt_hm_step(y, -0.5fx, yy);
	}

	//********************************************************

	// Portable versions by Carlos F. Borges

	// "Fast Compensated Algorithms for the Reciprocal Square Root, the Reciprocal Hypotenuse, and Givens Rotations", Carlos F. Borges, 2021
	// https://arxiv.org/abs/2103.08694
	// --
	// additional analysis:
	// "High-level algorithms for correctly-rounded reciprocal square roots", Borges, Jeannerod & Muller, 2022
	// https://hal.inria.fr/hal-03728088

	// dep-chain len: 5
	// correctly: 99.999994% (16777215)
	// faithfully: 0.000006% (1)
	// Borges Algorithm 2 (rsqrtNewton in Borges/Jeannerod/Muller)
	// uiCA: https://bit.ly/3OkNAk5
	// predicted throughput: 35.00
	float rsqrt_hq(float x)
	{
	float r = 1.f/x;
	float h = -0.5f*x;

	x = f32_sqrt(r);
	x = f32_rsqrt_nr_step_cb(x,h,r);

	return x;
	}

	// correctly rounded (in round to even/ties to nearest)
	// on [0x1p-125, 0x1.fffffcp+127] (almost all normal numbers)
	// Borges Algorithm 3 (rsqrtHalley in Borges/Jeannerod/Muller)
	// uiCA: https://bit.ly/3AzVQXJ
	// predicted throughput: 40.00
	float rsqrt_cb(float x)
	{
	float r = 1.f/x;
	float h = -0.5f*x;

	x = f32_sqrt(r);
	x = f32_rsqrt_hm_step(x,h,r);

	return x;
	}


	//********************************************************
	// so-call "bithack" initial steps

	static inline float f32_rsqrt_initial(float x, uint32_t K)
	{
	return f32_from_bits(K - (f32_to_bits(x) >> 1));
	}

	// Day observes there's a second choice which gives some
	// extra wiggle room in optimizing the "magic" K
	static inline float f32_rsqrt_initial_md(float x, uint32_t K)
	{
	return f32_from_bits((K-f32_to_bits(x)) >> 1);
	}

	//********************************************************

	// Mike Day core routines as in paper. Without star is paper
	// and contractions are disabled. Starred versions are explict FMAs
	// where version choosen uses the greedy measure to minimize
	// the counts of >2 ulp results.
	// \| func \|max ULP\| CR\| FR\| 2 ULP\| > 2 ULP\|
	// \| ---: \| ---:\| ---:\| ---:\| ---:\| ---:\|
	// \| FRSR_Mon0 \| 564177\| 12\| 28\| 23\| 25165761\|
	// \| FRSR_Deg0 \| 403258\| 11\| 26\| 24\| 25165763\|
	// \| FRSR_Mon1 \| 14751\| 557\| 1070\| 1063\| 25163134\|
	// \| FRSR_Mon1*\| 14751\| 545\| 1057\| 1099\| 25163123\|
	// \| FRSR_Deg1 \| 10215\| 941\| 1802\| 1939\| 25161142\|
	// \| FRSR_Deg1*\| 10215\| 1051\| 1848\| 1963\| 25160962\|
	// \| FRSR_Deg1Alt \| 10219\| 874\| 1858\| 2033\| 25161059\|
	// \| FRSR_Deg1Alt*\| 10219\| 972\| 1920\| 1901\| 25161031\|
	// \| FRSR_Mon2 \| 317\|105658\| 214279\| 222937\| 24622950\|
	// \| FRSR_Mon2*\| 317\|105658\| 214279\| 222937\| 24622950\|

	// listing 2 (FRSR_Mon0)
	// max-ulp: 564177
	float FRSR_Mon0(float x)
	{
	return f32_rsqrt_initial(x, 0x5F37642F);
	}

	// listing 3 (FRSR_Deg0)
	// max-ulp: 403258
	float FRSR_Deg0(float x)
	{
	return 0.79247999f * f32_rsqrt_initial_md(x, 0xBEBFFDAA);
	}

	// listing 4 (FRSR_Mon1)
	// max-ulp: 14751
	// \| CR\| FR \| 2 ULP\| > 2 ULP \|
	// \| 557\| 1070\| 1063\| 25163134\| 1
	// \| 497\| 1094\| 1097\| 25163136\| 2
	// \| 545\| 1057\| 1099\| 25163123\| 3
	// 3 fma/mul
	float FRSR_Mon1(float x)
	{
	float y = f32_rsqrt_initial_md(x, 0xBE167122);

	//return y * (1.8909901f - xyy); // 1
	//return y * fmaf(-x, y*y, 1.8909901f); // 2
	return y * fmaf(-y, x*y, 1.8909901f); // 3
	}

	// listing 5 (FRSR_Deg1)
	// max-ulp: 10215
	// \| CR\| FR \| 2 ULP\| > 2 ULP \|
	// \| 941\| 1802\| 1939\| 25161142\| 1
	// \| 992\| 1907\| 1882\| 25161043\| 2
	// \| 1051\| 1848\| 1963\| 25160962\| 3
	// \| 976\| 1907\| 1894\| 25161047\| 4
	// 4 fma/mul
	float FRSR_Deg1(float x)
	{
	static const float c0 = 1.1893165f;
	static const float c1 = -0.24889956f;

	float y = f32_rsqrt_initial(x, 0x5F5FFF00);

	//return y * (c0 + xyy*c1); // 1
	//return y * fmaf(x, yyc1, c0); // 2
	return y * fmaf(xy, yc1, c0); // 3
	//return y * fmaf(xc1,yy, c0); // 4
	}

	// listing 6 (FRSR_Deg1Alt)
	// max-ulp: 10219
	// \| CR\| FR \| 2 ULP\| > 2 ULP \|
	// \| 874\| 1858\| 2033\| 25161059\| 1
	// \| 972\| 1920\| 1901\| 25161031\| 2
	// \| 950\| 1991\| 1908\| 25160975\| 3
	// \| 972\| 1920\| 1901\| 25161031\| 4
	// 4 fma/mul
	float FRSR_Deg1Alt(float x)
	{
	static const float c0 = 1.1891762f;
	static const float c1 = -0.24881148f;

	float y = f32_rsqrt_initial(x, 0x5F6004CC);

	//return y * (c0 + yyx*c1); // 1
	//return y * fmaf(yy, xc1, c0); // 2
	//return y * fmaf(xy, yc1, c0); // 3
	return y * fmaf(xc1,yy, c0); // 4
	}

	// listing 7 (FRSR_Mon2)
	// max-ulp: 317
	// \| CR\| FR \| 2 ULP\| > 2 ULP \|
	// \| 105658\| 214279\| 222937\| 24622950\| 1
	// \| 105658\| 214279\| 222937\| 24622950\| 2
	// \| 106447\| 213978\| 221946\| 24623453\| 3
	// 4 fma/mul, 1 add
	float rsqrt_day(float x)
	{
	static const float c0 = 2.2825186f;
	static const float c1 = -2.253305f;

	float y = f32_rsqrt_initial(x, 0x5F11107D);
	float z = xyy;

	//return y * (c0 + z *(z+c1)); // 1
	return y * fmaf(z, z+c1, c0); // 2
	//return y * fmaf(z,z,fmaf(z,c1,c0)); // 3
	}

	// max-ulp: 2
	// correctly: 70.604507% (17768206)
	// faithfully: 29.345020% (7384916)
	// 2 ulp: 0.050473% (12702)
	// 8 fma/mul, 1 add
	float rsqrt_day_r1(float x) { return rsqrt_r1_apply(&rsqrt_day, x); }

	// max-ulp: 1
	// correctly: 99.667398% (25082122)
	// faithfully: 0.332602% (83702)
	// 10 fma/mul, 1 add
	float rsqrt_day_r2(float x) { return rsqrt_r2_apply(&rsqrt_day, x); }

	// max-ulp: 1
	// correctly: 99.999559% (25165713)
	// faithfully: 0.000441% (111)
	// 12 fma/mul, 1 add
	float rsqrt_day_hm(float x) { return rsqrt_hm_apply(&rsqrt_day, x); }

	float rsqrt_day_cr(float x)
	{
	float y = rsqrt_day(x);
	float h = -0.5f*x;
	y = f32_rsqrt_hm_step(y, h, y*y);
	//y = f32_rsqrt_nr_fma_step(y, -0.5f*x); // no good with this step
	y = f32_rsqrt_nr_step_cb(y, h, y*y); // need this
	return y;
	}

	//*********************************************************************

	// Walczyk, Moroz, Cieslinski, Cezary:
	// "InvSqrt2" - modified to use FMA
	// no constant refinement
	// max-ulp: 11
	// correctly: 5.120015% (858996)
	// faithfully: 11.176844% (1814391)
	// 2 ulp: 12.521724% (3151195)
	// >2 ulp: 71.061440% (17883197)
	// 7 fma/mul
	float rsqrt_wmc(float x)
	{
	float h = -0.5f*x;

	x = f32_rsqrt_initial(x, 0x5F376908);
	x = xfmaf(h,xx, 1.5008789f);
	x = xfmaf(h,xx, 1.5000006f);
	return x;
	}

	float rsqrt_wmc_r1(float x) { return rsqrt_r1_apply(&rsqrt_wmc, x); }
	float rsqrt_wmc_r2(float x) { return rsqrt_r2_apply(&rsqrt_wmc, x); }
	float rsqrt_wmc_hm(float x) { return rsqrt_hm_apply(&rsqrt_wmc, x); }

	//*********************************************************************
	// "Modified Fast Inverse Square Root and Square Root Approximation
	// Algorithms: The Method of Switching Magic Constants"
	// Moroz, Volodymyr, Samotyy and Horyachyy, 2021

	float rsqrt_msh(float x)
	{
	static const uint32_t R[2] = {0x5f19e8b7, 0x5ed9e8b7};
	static const float A[2] = {2.1499474f, 1.0749737f};
	static const float M[2] = {0.824218631f, 2.33124256f};

	uint32_t id = (f32_to_bits(x) & 0x00800000) == 0 ? 0 : 1;

	float y;

	y = f32_rsqrt_initial(x, R[id]);
	y = M[id]yfmaf(-x, y*y, A[id]);

	return y;
	}

	float rsqrt_msh_r1(float x) { return rsqrt_r1_apply(&rsqrt_msh, x); }
	float rsqrt_msh_r2(float x) { return rsqrt_r2_apply(&rsqrt_msh, x); }
	float rsqrt_msh_hm(float x) { return rsqrt_hm_apply(&rsqrt_msh, x); }

	//*********************************************************************
	// original SSE approximation

	#include <x86intrin.h>

	// return hardware 1/sqrt(x) approx
	static inline float rsqrt_sse(float x)
	{
	return _mm_cvtss_f32(_mm_rsqrt_ss(_mm_set_ss(x)));
	}

	// 4 fma/mul
	float rsqrt_sse_r1(float x)
	{
	float y = rsqrt_sse(x);
	return f32_rsqrt_nr_fma_step(y, -0.5f*x);
	}

	// 8 fma/mul
	float rsqrt_sse_r2(float x)
	{
	float y = rsqrt_sse(x);
	return f32_rsqrt_nr_step_cb(y, -0.5fx, yy);
	}

	// 8 fma/mul
	float rsqrt_sse_hm(float x)
	{
	float y = rsqrt_sse(x);
	return f32_rsqrt_hm_step(y, -0.5fx, yy);
	}

	// 13 fma/mul
	float rsqrt_sse_cr(float x)
	{
	float y = rsqrt_sse(x);
	float h = -0.5f*x;
	y = f32_rsqrt_hm_step(y, h, y*y);
	y = f32_rsqrt_nr_step_cb(y, h, y*y);
	return y;
	}

	//*********************************************************************
	// Use hardware approximation: neon

	// emulate neon approximation ops (std & not increased precision ones)

	// ignore all special cases (including 0): don't
	// need them for local testing. written to be eyeball-able to spec
	float vrecpe_f32(float x)
	{
	// extract the parts
	uint32_t ix = f32_to_bits(x);
	uint32_t sx = ((ix >> 31) & 0x01);
	uint32_t ex = ((ix >> 23) & 0xff);
	uint32_t mx = ((ix ) & 0x7fffff);
	uint32_t t;

	mx = 0x100\|(mx >> 15); // add implied bit and top 8 explict
	ex = 253-ex; // adjust the exp

	// SEE: ARM Spec - FPRecipEstimate
	t = (mx<<1)+1;
	t = (((1<<19)/t)+1)>>1;
	mx = (t & 0xff) << 15;

	// concat the parts back together
	ix = (sx & 0x01) << 31;
	ix \|= (ex & 0xff) << 23;
	ix \|= (mx & 0x7fffff);

	return f32_from_bits(ix);
	}

	// max-ulp: 4947968
	// correctly: 0.001293% (217)
	// faithfully: 0.002837% (476)
	// written to be eyeball-able to spec
	float vrsqrte_f32(float x)
	{
	// yeah: lots of reductions are possible & don't care
	uint32_t ix = f32_to_bits(x);
	uint32_t ex = ((ix >> 23) & 0xff);
	uint32_t mx = ((ix ) & 0x7fffff);
	uint32_t s = (mx & 1);

	mx >>= (15+s);
	mx \|= 1 << (8-s);

	if (mx < 256)
	mx = mx * 2 + 1;
	else
	mx = (mx\|1) << 1;

	uint32_t t = 512;

	while (mx * (t+1)*(t+1) < (1<<28)) { t++; }

	mx = (t+1)/2;

	ex = (380-ex)/2;
	ix = (ex & 0xff) << 23;
	ix \|= (mx & 0xff) << 15;

	return f32_from_bits(ix);
	}

	float vrsqrts_f32(float a, float b)
	{
	return -0.5f*fmaf(a,b,-3.f);
	}

	float vrecps_f32(float a, float b)
	{
	return -fmaf(a,b,-2.f);
	}


	//********************************************************
	// start of testing related stuff
	//********************************************************

	static inline uint32_t u32_abs(uint32_t x)
	{
	return (int32_t)x >= 0 ? x : -x;
	}

	// ulp distance provided a & b are finite
	// and have the same sign
	static inline uint32_t f32_ulp_dist_ss(float a, float b)
	{
	uint32_t ua = f32_to_bits(a);
	uint32_t ub = f32_to_bits(b);
	return u32_abs(ua-ub);
	}

	static inline uint64_t u64_abs(uint64_t x)
	{
	return (int64_t)x >= 0 ? x : -x;
	}

	// ulp distance provided a & b are finite
	// and have the same sign
	static inline uint64_t f64_ulp_dist_ss(double a, double b)
	{
	uint64_t ua = f64_to_bits(a);
	uint64_t ub = f64_to_bits(b);
	return u64_abs(ua-ub);
	}


	typedef struct {
	float (*f)(float);
	char* name;
	char* notes;
	uint32_t flags;
	uint32_t type;
	} func_entry_t;

	#define LENGTHOF(X) (sizeof(X)/sizeof(X[0]))
	#define STRINGIFY(S) STRINGIFY_(S)
	#define STRINGIFY_(S) #S
	#define ENTRY(X) { .f=&X, .name=STRINGIFY(X) }

	func_entry_t func_table[] =
	{
	#if 1
	ENTRY(vrsqrte_f32),
	ENTRY(FRSR_Mon0),
	ENTRY(FRSR_Deg0),
	ENTRY(FRSR_Mon1),
	ENTRY(FRSR_Deg1),
	ENTRY(FRSR_Deg1Alt),
	ENTRY(rsqrt_sse),
	ENTRY(rsqrt_msh),
	ENTRY(rsqrt_day),
	ENTRY(rsqrt_wmc),
	ENTRY(rsqrt_sse_r2),
	ENTRY(rsqrt_sse_r1),
	ENTRY(rsqrt_day_r1),
	ENTRY(f32_rsqrt),
	ENTRY(rsqrt_day_r2),
	ENTRY(rsqrt_wmc_r1),
	ENTRY(rsqrt_sse_hm),
	ENTRY(rsqrt_day_hm),
	ENTRY(rsqrt_wmc_r2),
	ENTRY(rsqrt_wmc_hm),
	//ENTRY(f32_rsqrt_bar),
	// find ranges of these
	ENTRY(rsqrt_cb),
	ENTRY(rsqrt_day_cr),
	ENTRY(rsqrt_sse_cr),
	#else
	// hack for testing one
	#endif
	};


	typedef struct {
	double abs;
	uint32_t max;
	uint32_t ulp[4];
	} func_error_t;

	// tracks total error data across multiple intervals
	func_error_t func_error[LENGTHOF(func_table)] = {{0}};


	// add error data from current interval to the totals
	void error_to_totals(func_error_t* e)
	{
	for(uint32_t fi=0; fi < LENGTHOF(func_table); fi++, e++) {
	func_error[fi].ulp[0] += e->ulp[0];
	func_error[fi].ulp[1] += e->ulp[1];
	func_error[fi].ulp[2] += e->ulp[2];
	func_error[fi].ulp[3] += e->ulp[3];

	if (e->max > func_error[fi].max) func_error[fi].max = e->max;
	if (e->abs > func_error[fi].abs) func_error[fi].abs = e->abs;
	}
	}

	void error_dump_i(func_error_t* e)
	{
	printf("\|%15s\|%12s\|%10s\|%10s\|%10s\|%10s\|%10s\|%10s\|%10s\|%10s\|%10s\|\n",
	"func", "e", "max ULP", "CR", "FR", "2 ULP", "> 2 ULP",
	"CR%", "FR%", "2 ULP%","> 2 ULP%");

	printf("\|%15s\|%12s\|%10s\|%10s\|%10s\|%10s\|%10s\|%10s\|%10s\|%10s\|%10s\|\n",
	"---:", ":---:", "---:", "---:", "---:", "---:",
	"---:", "---:", "---:","---:", "---:");

	for(uint32_t fi=0; fi < LENGTHOF(func_table); fi++, e++) {
	uint32_t u0 = e->ulp[0];
	uint32_t u1 = e->ulp[1];
	uint32_t u2 = e->ulp[2];
	uint32_t u3 = e->ulp[3];
	uint32_t t = (u0+u1+u2+u3);
	double s = 100.0/(double)t;

	if (e->abs < 9.0)
	printf("\|%15s\|%12.10f\|%10u\|%10u\|%10u\|%10u\|%10u\|%10f\|%10f\|%10f\|%10f\|\n",
	func_table[fi].name, e->abs, e->max,
	u0, u1, u2, u3,
	su0, su1, su2, su3
	);
	else
	printf("\|%15s\| -- \|%10u\|%10u\|%10u\|%10u\|%10u\|%10f\|%10f\|%10f\|%10f\|\n",
	func_table[fi].name, e->max,
	u0, u1, u2, u3,
	su0, su1, su2, su3
	);

	}
	}

	void error_dump(void)
	{
	printf("\nTOTAL:\n");
	error_dump_i(func_error);
	}

	void test_range(uint32_t x0, uint32_t x1)
	{
	float f0 = f32_from_bits(x0);
	float f1 = f32_from_bits(x1);

	printf("\nchecking on [%08x,%08x] [%e,%e]\n", x0,x1,f0,f1);

	func_error_t error[LENGTHOF(func_table)] = {{0}};

	for(uint32_t ix=x0; ix<=x1; ix++) {
	float x = f32_from_bits(ix);
	double d = sqrt(1.0/(double)x);
	float cr = (float)d;

	for(uint32_t fi=0; fi < LENGTHOF(func_table); fi++) {
	float r = func_table[fi].f(x);
	double e = fabs(d-(double)r) * 0x1.0p24;

	//if (e == 0.f) { error[fi].ulp[0]++; continue; };

	uint32_t ulp = f32_ulp_dist_ss(cr,r);

	if (ulp > error[fi].max) { error[fi].max = ulp; }
	if (e > error[fi].abs) { error[fi].abs = e; }

	if (ulp > 3) ulp = 3;

	error[fi].ulp[ulp]++;
	}
	}

	error_to_totals(error);
	error_dump_i(error);
	}

	//********************************************************
	// validating double promotion method is correctly rounded

	#if defined(NEEDS_MPFR)
	#include <mpfr.h>

	#if defined(VALIDATE_CR)
	// worker for vaidate_double_promote
	void validate_cr_i(float s, float e)
	{
	mpfr_t mx,mr;

	uint32_t xs = f32_to_bits(s);
	uint32_t xe = f32_to_bits(e);

	mpfr_init2(mx, 24);
	mpfr_init2(mr, 24);

	for(uint32_t xi=xs; xi<=xe; xi++) {
	float x = f32_from_bits(xi);

	// compute correctly rounded
	mpfr_set_flt(mx, x, MPFR_RNDN);
	mpfr_rec_sqrt(mr, mx, MPFR_RNDN);

	float cr = mpfr_get_flt(mr, MPFR_RNDN);
	float r = f32_rsqrt_cr(x);

	if (r == cr) continue;

	printf(" ERROR: not correctly rounded at: x=%a %a got %a\n", x, cr, r);

	xe = xi-1;
	break;
	}

	mpfr_clear(mr);
	mpfr_clear(mx);
	}

	// double promotion is correctly rounded for all normal
	// inputs.
	void vaidate_double_promote(void)
	{

	#if defined(VALIDATE_CR_ALL)
	printf("starting: validate f32_sqrt_cr: should take around 10 mins and I'm not going to give you any visual feedback of progress.\n");
	validate_cr_i(0, f32_max_normal);
	#else
	printf("starting: validate f32_sqrt_cr: (fast/non-exhastive)\n");
	printf(" denormals\n");
	validate_cr_i(0, f32_min_normal);
	printf(" bottom of normal range\n");
	validate_cr_i(f32_min_normal, f32_min_normal*4.f);
	printf(" top of normal range\n");
	validate_cr_i(f32_max_normal/4.f, f32_max_normal);
	printf(" around 1.0\n");
	validate_cr_i(1.f/4.f, 4.f);
	#endif
	}
	#endif

	//********************************************************


	#endif



	int main(void)
	{
	#if defined(VALIDATE_CR)
	vaidate_double_promote();
	#endif

	printf("<details markdown=\"1\"><summary>click for range breakdown</summary>\n\n");

	#if defined(EXHAUSTIVE_TEST)
	printf("fill-in again\n");
	#else
	test_range(f32_to_bits(1.f),f32_to_bits(2.f));
	test_range(f32_to_bits(2.f),f32_to_bits(4.f));
	#endif

	printf("\n</details>\n\n");
	error_dump();

	return 0;
	}