Marc-B-Reynolds/output.txt

## output.txt
----------------------------------------------------------------------------
starting: vrecpe_f32 on [3f800000, 40000000]
// min/max ulp   -45502 44978
// abs max ulp   45502.375051
// correctly:    0.002611% (219)
// faithfully:   0.005555% (466)
//   2 ulp:      0.005436% (456)
//   3 ulp:      0.005460% (458)
//   4 ulp:      0.005329% (447)
//   5 ulp:      0.005531% (464)
//  >5 ulp:     99.970078% (8386099)

----------------------------------------------------------------------------
starting: arm_nr_op_1 on [3f800000, 40000000]
// min/max ulp   -1 129
// abs max ulp   128.903094
// correctly:   10.676442% (895605)
// faithfully:  11.139952% (934487)
//   2 ulp:      6.347620% (532477)
//   3 ulp:      4.953718% (415548)
//   4 ulp:      4.155027% (348549)
//   5 ulp:      3.647220% (305951)
//  >5 ulp:     59.080021% (4955992)

----------------------------------------------------------------------------
starting: arm_nr_f2_1 on [3f800000, 40000000]
// min/max ulp   0 128
// abs max ulp   128.090785
// correctly:   12.101339% (1015134)
// faithfully:   9.859215% (827051)
//   2 ulp:      6.272411% (526168)
//   3 ulp:      4.919719% (412696)
//   4 ulp:      4.147815% (347944)
//   5 ulp:      3.636097% (305018)
//  >5 ulp:     59.063404% (4954598)

----------------------------------------------------------------------------
starting: arm_nr_op_2 on [3f800000, 40000000]
// min/max ulp   -1 1
// abs max ulp   1.487251
// correctly:   67.627648% (5673019)
// faithfully:  32.372352% (2715590)

----------------------------------------------------------------------------
starting: arm_nr_f2_2 on [3f800000, 40000000]
// min/max ulp   0 1
// abs max ulp   0.500814
// correctly:   99.994457% (8388144)
// faithfully:   0.005543% (465)

----------------------------------------------------------------------------
starting: arm_hm_1 on [3f800000, 40000000]
// min/max ulp   -1 1
// abs max ulp   0.855433
// correctly:   97.301376% (8162232)
// faithfully:   2.698624% (226377)

----------------------------------------------------------------------------
starting: sse_rcp_approx on [3f800000, 40000000]
// min/max ulp   -4996 4989
// abs max ulp   4995.549501
// correctly:    0.028241% (2369)
// faithfully:   0.057113% (4791)
//   2 ulp:      0.056624% (4750)
//   3 ulp:      0.057054% (4786)
//   4 ulp:      0.056839% (4768)
//   5 ulp:      0.057185% (4797)
//  >5 ulp:     99.686945% (8362348)

----------------------------------------------------------------------------
starting: sse_nr_s1_1 on [3f800000, 40000000]
// min/max ulp   -2 3
// abs max ulp   3.327252
// correctly:   58.715444% (4925409)
// faithfully:  39.751287% (3334580)
//   2 ulp:      1.516497% (127213)
//   3 ulp:      0.016773% (1407)

----------------------------------------------------------------------------
starting: sse_nr_s2_1 on [3f800000, 40000000]
// min/max ulp   -1 3
// abs max ulp   2.756861
// correctly:   68.920032% (5781432)
// faithfully:  30.598839% (2566817)
//   2 ulp:      0.480151% (40278)
//   3 ulp:      0.000978% (82)

----------------------------------------------------------------------------
starting: sse_nr_fp_1 on [3f800000, 40000000]
// min/max ulp   -1 3
// abs max ulp   2.848300
// correctly:   68.949548% (5783908)
// faithfully:  30.556294% (2563248)
//   2 ulp:      0.493121% (41366)
//   3 ulp:      0.001037% (87)

----------------------------------------------------------------------------
starting: sse_nr_f2_1 on [3f800000, 40000000]
// min/max ulp   0 2
// abs max ulp   1.960132
// correctly:   84.322967% (7073524)
// faithfully:  15.620623% (1310353)
//   2 ulp:      0.056410% (4732)

----------------------------------------------------------------------------
starting: sse_nr_s1_2 on [3f800000, 40000000]
// min/max ulp   -2 1
// abs max ulp   1.811487
// correctly:   70.040599% (5875432)
// faithfully:  29.943165% (2511815)
//   2 ulp:      0.016236% (1362)

----------------------------------------------------------------------------
starting: sse_nr_s2_2 on [3f800000, 40000000]
// min/max ulp   -1 1
// abs max ulp   1.248996
// correctly:   83.297946% (6987539)
// faithfully:  16.702054% (1401070)

----------------------------------------------------------------------------
starting: sse_nr_fp_2 on [3f800000, 40000000]
// min/max ulp   -1 1
// abs max ulp   1.249992
// correctly:   78.509786% (6585879)
// faithfully:  21.490214% (1802730)

----------------------------------------------------------------------------
starting: sse_nr_f2_2 on [3f800000, 40000000]
// min/max ulp   0 1
// abs max ulp   0.500000
// correctly:   99.999988% (8388608)
// faithfully:   0.000012% (1)

----------------------------------------------------------------------------
starting: sse_hm_1 on [3f800000, 40000000]
// min/max ulp   -1 1
// abs max ulp   0.500378
// correctly:   99.996150% (8388286)
// faithfully:   0.003850% (323)

----------------------------------------------------------------------------
starting: bit_rcp_approx on [3f800000, 40000000]
// min/max ulp   -159498 1279760
// abs max ulp   1279760.000000
// correctly:    0.000107% (9)
// faithfully:   0.000203% (17)
//   2 ulp:      0.000203% (17)
//   3 ulp:      0.000215% (18)
//   4 ulp:      0.000203% (17)
//   5 ulp:      0.000203% (17)
//  >5 ulp:     99.998868% (8388514)

----------------------------------------------------------------------------
starting: bit_nr_s1_1 on [3f800000, 40000000]
// min/max ulp   -2 97621
// abs max ulp   97621.000002
// correctly:    0.375867% (31530)
// faithfully:   0.473118% (39688)
//   2 ulp:      0.263309% (22088)
//   3 ulp:      0.208855% (17520)
//   4 ulp:      0.178814% (15000)
//   5 ulp:      0.159001% (13338)
//  >5 ulp:     98.341036% (8249445)

----------------------------------------------------------------------------
starting: bit_nr_s2_1 on [3f800000, 40000000]
// min/max ulp   -1 97620
// abs max ulp   97620.000002
// correctly:    0.414848% (34800)
// faithfully:   0.442111% (37087)
//   2 ulp:      0.258505% (21685)
//   3 ulp:      0.207484% (17405)
//   4 ulp:      0.178170% (14946)
//   5 ulp:      0.159025% (13340)
//  >5 ulp:     98.339856% (8249346)

----------------------------------------------------------------------------
starting: bit_nr_fp_1 on [3f800000, 40000000]
// min/max ulp   -1 97620
// abs max ulp   97620.000009
// correctly:    0.414467% (34768)
// faithfully:   0.442910% (37154)
//   2 ulp:      0.257874% (21632)
//   3 ulp:      0.207269% (17387)
//   4 ulp:      0.178623% (14984)
//   5 ulp:      0.158823% (13323)
//  >5 ulp:     98.340035% (8249361)

----------------------------------------------------------------------------
starting: bit_nr_f2_1 on [3f800000, 40000000]
// min/max ulp   0 97620
// abs max ulp   97620.000000
// correctly:    0.472355% (39624)
// faithfully:   0.391078% (32806)
//   2 ulp:      0.254703% (21366)
//   3 ulp:      0.206232% (17300)
//   4 ulp:      0.177562% (14895)
//   5 ulp:      0.158906% (13330)
//  >5 ulp:     98.339164% (8249288)

----------------------------------------------------------------------------
starting: bit_nr_s1_2 on [3f800000, 40000000]
// min/max ulp   -2 569
// abs max ulp   569.072379
// correctly:   26.625213% (2233485)
// faithfully:  23.364255% (1959936)
//   2 ulp:      2.729594% (228975)
//   3 ulp:      1.450908% (121711)
//   4 ulp:      1.125371% (94403)
//   5 ulp:      0.922000% (77343)
//  >5 ulp:     43.782658% (3672756)

----------------------------------------------------------------------------
starting: bit_nr_s2_2 on [3f800000, 40000000]
// min/max ulp   -1 568
// abs max ulp   568.072642
// correctly:   36.899193% (3095329)
// faithfully:  13.862108% (1162838)
//   2 ulp:      2.033031% (170543)
//   3 ulp:      1.418936% (119029)
//   4 ulp:      1.106083% (92785)
//   5 ulp:      0.915563% (76803)
//  >5 ulp:     43.765087% (3671282)

----------------------------------------------------------------------------
starting: bit_nr_fp_2 on [3f800000, 40000000]
// min/max ulp   -1 569
// abs max ulp   569.015503
// correctly:   30.147156% (2528927)
// faithfully:  20.481131% (1718082)
//   2 ulp:      2.129233% (178613)
//   3 ulp:      1.432490% (120166)
//   4 ulp:      1.116001% (93617)
//   5 ulp:      0.920248% (77196)
//  >5 ulp:     43.773741% (3672008)

----------------------------------------------------------------------------
starting: bit_nr_f2_2 on [3f800000, 40000000]
// min/max ulp   0 568
// abs max ulp   568.022911
// correctly:   41.278727% (3462711)
// faithfully:   9.518491% (798469)
//   2 ulp:      2.011239% (168715)
//   3 ulp:      1.407313% (118054)
//   4 ulp:      1.107311% (92888)
//   5 ulp:      0.915146% (76768)
//  >5 ulp:     43.761773% (3671004)

----------------------------------------------------------------------------
starting: bit_nr_s1_3 on [3f800000, 40000000]
// min/max ulp   -2 2
// abs max ulp   2.005560
// correctly:   62.444310% (5238209)
// faithfully:  36.752136% (3082993)
//   2 ulp:      0.803554% (67407)

----------------------------------------------------------------------------
starting: bit_nr_s2_3 on [3f800000, 40000000]
// min/max ulp   -1 1
// abs max ulp   1.248562
// correctly:   83.570685% (7010418)
// faithfully:  16.429315% (1378191)

----------------------------------------------------------------------------
starting: bit_nr_fp_3 on [3f800000, 40000000]
// min/max ulp   -1 1
// abs max ulp   1.499907
// correctly:   72.277358% (6063065)
// faithfully:  27.722642% (2325544)

----------------------------------------------------------------------------
starting: bit_nr_f2_3 on [3f800000, 40000000]
// min/max ulp   0 1
// abs max ulp   0.518618
// correctly:   99.795926% (8371490)
// faithfully:   0.204074% (17119)

----------------------------------------------------------------------------
starting: bit_hm_1 on [3f800000, 40000000]
// min/max ulp   -29 7446
// abs max ulp   7446.000748
// correctly:    8.088600% (678521)
// faithfully:   4.319632% (362357)
//   2 ulp:      2.463150% (206624)
//   3 ulp:      1.901841% (159538)
//   4 ulp:      1.596606% (133933)
//   5 ulp:      1.409435% (118232)
//  >5 ulp:     80.220737% (6729404)

----------------------------------------------------------------------------
starting: bit_hm_2 on [3f800000, 40000000]
// min/max ulp   -1 1
// abs max ulp   0.501463
// correctly:   99.984026% (8387269)
// faithfully:   0.015974% (1340)

----------------------------------------------------------------------------
starting: bit_hm_3 on [3f800000, 40000000]
// min/max ulp   0 1
// abs max ulp   0.500000
// correctly:   99.999988% (8388608)
// faithfully:   0.000012% (1)

----------------------------------------------------------------------------
starting: day_rcp_approx on [3f800000, 40000000]
// min/max ulp   -1768 1874
// abs max ulp   1874.000126
// correctly:    0.025952% (2177)
// faithfully:   0.052655% (4417)
//   2 ulp:      0.052273% (4385)
//   3 ulp:      0.052178% (4377)
//   4 ulp:      0.052631% (4415)
//   5 ulp:      0.052297% (4387)
//  >5 ulp:     99.712014% (8364451)

----------------------------------------------------------------------------
starting: day_nr_s1_1 on [3f800000, 40000000]
// min/max ulp   -2 2
// abs max ulp   2.133561
// correctly:   61.393754% (5150082)
// faithfully:  38.129146% (3198505)
//   2 ulp:      0.477099% (40022)

----------------------------------------------------------------------------
starting: day_nr_s2_1 on [3f800000, 40000000]
// min/max ulp   -1 2
// abs max ulp   1.637964
// correctly:   72.490064% (6080908)
// faithfully:  27.491185% (2306128)
//   2 ulp:      0.018752% (1573)

----------------------------------------------------------------------------
starting: day_nr_fp_1 on [3f800000, 40000000]
// min/max ulp   -1 2
// abs max ulp   1.657846
// correctly:   74.109808% (6216782)
// faithfully:  25.886568% (2171523)
//   2 ulp:      0.003624% (304)

----------------------------------------------------------------------------
starting: day_nr_f2_1 on [3f800000, 40000000]
// min/max ulp   0 1
// abs max ulp   0.701917
// correctly:   92.858673% (7789551)
// faithfully:   7.141327% (599058)

----------------------------------------------------------------------------
starting: day_nr_s1_2 on [3f800000, 40000000]
// min/max ulp   -1 1
// abs max ulp   1.499220
// correctly:   71.204535% (5973070)
// faithfully:  28.795465% (2415539)

----------------------------------------------------------------------------
starting: day_nr_s2_2 on [3f800000, 40000000]
// min/max ulp   -1 1
// abs max ulp   1.249477
// correctly:   82.748487% (6941447)
// faithfully:  17.251513% (1447162)

----------------------------------------------------------------------------
starting: day_nr_fp_2 on [3f800000, 40000000]
// min/max ulp   -1 1
// abs max ulp   1.249469
// correctly:   81.148519% (6807232)
// faithfully:  18.851481% (1581377)

----------------------------------------------------------------------------
starting: day_nr_f2_2 on [3f800000, 40000000]
// min/max ulp   0 1
// abs max ulp   0.500000
// correctly:   99.999988% (8388608)
// faithfully:   0.000012% (1)

----------------------------------------------------------------------------
starting: day_hm_1 on [3f800000, 40000000]
// min/max ulp   -1 1
// abs max ulp   0.500096
// correctly:   99.997318% (8388384)
// faithfully:   0.002682% (225)

## recip.c
// Public Domain under http://unlicense.org, see link for details.

// compile with (or equiv):
// clang -O3 -march=native -Wall -Wextra -Wconversion -Wpedantic -Wno-unused-function -fno-math-errno -ffp-contract=off recip.c -o recip -lm

#include <stdint.h>
#include <stdio.h>
#include <math.h>
#include <string.h>

#if defined(__FAST_MATH__) || defined(_M_FP_FAST)
#define FP_NAUGHTY_OPTIONS
#endif

// configuration section:

// additionally use MPFR to measure max error (requires -lmpfr to cmdline)
#define USE_MPFR

// run numbers for portable bithacking versions: recip_approx_{n}
#define INCLUDE_BITHACK

// run numbers for ARM inital approximation and NR step opcodes.
// WARNING: I wrote these from the spec and they could be incorrect.
// Too lazy to setup and test on real hardware.
#define INCLUDE_ARM


// to avoid using a compiler extension. sigh.
#define max_ulp_dist 5

#define STRINGIFY(S)  STRINGIFY_(S)
#define STRINGIFY_(S) #S
#define LENGTHOF(X) (sizeof(X)/sizeof(X[0]))


float recip_ref(float x) { return 1.f/x; }

//********************************************************
// Newton-Ralphson steps

static inline float nr_step_s1(float y, float x)
{
  return y*(2.f - x*y);
}

static inline float nr_step_s2(float y, float x)
{
  return (y+y) - x*y*y;
}

static inline float nr_step_fp(float y, float x)
{
  return -y*fmaf(x,y,-2.f);
}

static inline float nr_step_f2(float y, float x)
{
  return fmaf(-fmaf(x,y,-1.f), y,y);
}


//********************************************************
// Halley's method

static inline float hm_step(float y, float x)
{
  float t;

  t = fmaf(y, -x, 1.0f);
  y = fmaf(fmaf(t,t,t), y, y);

  return y;
}

//********************************************************
// hardware approximation: SSE

#if defined(__x86_64__) || defined(__x86_64) || defined(_M_X64) || defined(_M_AMD64)
// wrap an intel intrinsic
#include <x86intrin.h>

#define F32_SSE_WRAP(F,X) _mm_cvtss_f32(F(_mm_set_ss(X)))

// return hardware 1/x approx
static inline float sse_rcp_approx(float x)
{
  return _mm_cvtss_f32(_mm_rcp_ss(_mm_set_ss(x)));
}

#endif


//********************************************************
// hardware approximation: neon

inline uint32_t f32_to_bits(float x)
{
  uint32_t u; memcpy(&u,&x,4); return u;
}

inline float f32_from_bits(uint32_t x)
{
  float u; memcpy(&u,&x,4); return u;
}

// emulate neon approximation ops (std & not increased precision ones)

// ignore all special cases (including 0): don't
// need them for local testing. written to be eyeball-able to spec
float vrecpe_f32(float x)
{
  // extract the parts
  uint32_t ix = f32_to_bits(x);
  uint32_t sx = ((ix >> 31) & 0x01);
  uint32_t ex = ((ix >> 23) & 0xff);
  uint32_t mx = ((ix      ) & 0x7fffff);
  uint32_t t;

  mx = 0x100|(mx >> 15);    // add implied bit and top 8 explict
  ex = 253-ex;              // adjust the exp

  // SEE: ARM Spec - FPRecipEstimate
  t  = (mx<<1)+1;
  t  = (((1<<19)/t)+1)>>1;
  mx = (t & 0xff) << 15;

  // concat the parts back together
  ix  = (sx & 0x01) << 31;
  ix |= (ex & 0xff) << 23;
  ix |= (mx & 0x7fffff);

  return f32_from_bits(ix);
}

// as the hardware op RECPS
static inline float vrecps_f32(float a, float b)
{
  return -fmaf(a,b,-2.f);
}

// how you'd perform a 'step' with RECPS
static inline float arm_step(float y, float x)
{
  return y*vrecps_f32(x,y);
}


//********************************************************
// bit hack version (well. it's portable!)

static inline float recip_inital_step(uint32_t K, float x)
{
  return f32_from_bits(K-f32_to_bits(x));
}

// magic constant isn't optimal. building an optimal constant requires
// knowing the number and type of correction steps.
static inline float bit_rcp_approx(float x) { return recip_inital_step(0x7eec78f0,x); }


// "Generalising the Fast Reciprocal Square Root Algorithm", Mike Day, 2023
// (PDF: https://arxiv.org/abs/2307.15600)

// listing 8:
static inline float day_rcp_approx(float x)
{
  static const float c0 =  0.6966215f;
  static const float c1 = -0.12130684f;

  float y = recip_inital_step(0x7fb504ec, x);

  // ordering that minimizes >5 ulp results
  return y * fmaf(x,  y*c1, c0);
}


//********************************************************
// let the compiler expand the variants

// expand: 'inital' approximation followed by 'n' rounds of 'step'
inline float simple(float x, float (*initial)(float), float (*step)(float, float), uint32_t n)
{
  float y = initial(x);

  for(uint32_t i=0; i<n; i++)
    y = step(y,x);

  return y;
}


#if defined(INCLUDE_ARM)
// using hardware step
float arm_nr_op_1(float x) { return simple(x, &vrecpe_f32,     &arm_step,   1); }
float arm_nr_op_2(float x) { return simple(x, &vrecpe_f32,     &arm_step,   2); }

// using 2 fma NR step
float arm_nr_f2_1(float x) { return simple(x, &vrecpe_f32,     &nr_step_f2, 1); }
float arm_nr_f2_2(float x) { return simple(x, &vrecpe_f32,     &nr_step_f2, 2); }

float arm_hm_1(float x)    { return simple(x, &vrecpe_f32,     &hm_step,    1); }
#endif

// expand SSE
float sse_nr_s1_1(float x) { return simple(x, &sse_rcp_approx, &nr_step_s1, 1); }
float sse_nr_s2_1(float x) { return simple(x, &sse_rcp_approx, &nr_step_s2, 1); }
float sse_nr_fp_1(float x) { return simple(x, &sse_rcp_approx, &nr_step_fp, 1); }
float sse_nr_f2_1(float x) { return simple(x, &sse_rcp_approx, &nr_step_f2, 1); }
float sse_nr_s1_2(float x) { return simple(x, &sse_rcp_approx, &nr_step_s1, 2); }
float sse_nr_s2_2(float x) { return simple(x, &sse_rcp_approx, &nr_step_s2, 2); }
float sse_nr_fp_2(float x) { return simple(x, &sse_rcp_approx, &nr_step_fp, 2); }
float sse_nr_f2_2(float x) { return simple(x, &sse_rcp_approx, &nr_step_f2, 2); }
float sse_hm_1(float x)    { return simple(x, &sse_rcp_approx, &hm_step,    1); }

#if defined(INCLUDE_BITHACK)
float bit_nr_s1_1(float x) { return simple(x, &bit_rcp_approx, &nr_step_s1, 1); }
float bit_nr_s2_1(float x) { return simple(x, &bit_rcp_approx, &nr_step_s2, 1); }
float bit_nr_fp_1(float x) { return simple(x, &bit_rcp_approx, &nr_step_fp, 1); }
float bit_nr_f2_1(float x) { return simple(x, &bit_rcp_approx, &nr_step_f2, 1); }
float bit_nr_s1_2(float x) { return simple(x, &bit_rcp_approx, &nr_step_s1, 2); }
float bit_nr_s2_2(float x) { return simple(x, &bit_rcp_approx, &nr_step_s2, 2); }
float bit_nr_fp_2(float x) { return simple(x, &bit_rcp_approx, &nr_step_fp, 2); }
float bit_nr_f2_2(float x) { return simple(x, &bit_rcp_approx, &nr_step_f2, 2); }
float bit_nr_s1_3(float x) { return simple(x, &bit_rcp_approx, &nr_step_s1, 3); }
float bit_nr_s2_3(float x) { return simple(x, &bit_rcp_approx, &nr_step_s2, 3); }
float bit_nr_fp_3(float x) { return simple(x, &bit_rcp_approx, &nr_step_fp, 3); }
float bit_nr_f2_3(float x) { return simple(x, &bit_rcp_approx, &nr_step_f2, 3); }
float bit_hm_1(float x)    { return simple(x, &bit_rcp_approx, &hm_step,    1); }
float bit_hm_2(float x)    { return simple(x, &bit_rcp_approx, &hm_step,    2); }
float bit_hm_3(float x)    { return simple(x, &bit_rcp_approx, &hm_step,    3); }
#endif

float day_nr_s1_1(float x) { return simple(x, &day_rcp_approx, &nr_step_s1, 1); }
float day_nr_s2_1(float x) { return simple(x, &day_rcp_approx, &nr_step_s2, 1); }
float day_nr_fp_1(float x) { return simple(x, &day_rcp_approx, &nr_step_fp, 1); }
float day_nr_f2_1(float x) { return simple(x, &day_rcp_approx, &nr_step_f2, 1); }
float day_nr_s1_2(float x) { return simple(x, &day_rcp_approx, &nr_step_s1, 2); }
float day_nr_s2_2(float x) { return simple(x, &day_rcp_approx, &nr_step_s2, 2); }
float day_nr_fp_2(float x) { return simple(x, &day_rcp_approx, &nr_step_fp, 2); }
float day_nr_f2_2(float x) { return simple(x, &day_rcp_approx, &nr_step_f2, 2); }
float day_hm_1(float x)    { return simple(x, &day_rcp_approx, &hm_step,    1); }


//********************************************************


typedef struct {
  float (*f)(float);
  char*   name;
} recip_entry_t;

#define ENTRY(X) { .f=&X, .name=STRINGIFY(X) }

recip_entry_t recip_table[] =
{
#if defined(INCLUDE_ARM)
  ENTRY(vrecpe_f32),
  ENTRY(arm_nr_op_1),
  ENTRY(arm_nr_f2_1),
  ENTRY(arm_nr_op_2),
  ENTRY(arm_nr_f2_2),
  ENTRY(arm_hm_1),
#endif

  // SSE expansions
  ENTRY(sse_rcp_approx),
  ENTRY(sse_nr_s1_1),
  ENTRY(sse_nr_s2_1),
  ENTRY(sse_nr_fp_1),
  ENTRY(sse_nr_f2_1),
  ENTRY(sse_nr_s1_2),
  ENTRY(sse_nr_s2_2),
  ENTRY(sse_nr_fp_2),
  ENTRY(sse_nr_f2_2),
  ENTRY(sse_hm_1),

#if defined(INCLUDE_BITHACK)
  ENTRY(bit_rcp_approx),
  ENTRY(bit_nr_s1_1),
  ENTRY(bit_nr_s2_1),
  ENTRY(bit_nr_fp_1),
  ENTRY(bit_nr_f2_1),
  ENTRY(bit_nr_s1_2),
  ENTRY(bit_nr_s2_2),
  ENTRY(bit_nr_fp_2),
  ENTRY(bit_nr_f2_2),
  ENTRY(bit_nr_s1_3),
  ENTRY(bit_nr_s2_3),
  ENTRY(bit_nr_fp_3),
  ENTRY(bit_nr_f2_3),
  ENTRY(bit_hm_1),
  ENTRY(bit_hm_2),
  ENTRY(bit_hm_3),
#endif

  ENTRY(day_rcp_approx),
  ENTRY(day_nr_s1_1),
  ENTRY(day_nr_s2_1),
  ENTRY(day_nr_fp_1),
  ENTRY(day_nr_f2_1),
  ENTRY(day_nr_s1_2),
  ENTRY(day_nr_s2_2),
  ENTRY(day_nr_fp_2),
  ENTRY(day_nr_f2_2),
  ENTRY(day_hm_1)
};

#undef ENTRY

//********************************************************

inline uint32_t u32_abs(uint32_t x)
{
  return (int32_t)x >= 0 ? x : -x;
}

// signed distance: a & b must have same sign
inline int32_t f32_ulp_sdist_ss(float a, float b)
{
  uint32_t ua = f32_to_bits(a);
  uint32_t ub = f32_to_bits(b);
  return (int32_t)(ua-ub);
}

#if defined(USE_MPFR)
#include <mpfr.h>
#endif

void accuracy_test(recip_entry_t* entry)
{
  const uint32_t xs = f32_to_bits(1.f);
  const uint32_t xe = f32_to_bits(2.f);
  const uint32_t total = xe-xs+1;

#if defined(USE_MPFR)
  mpfr_t m1,mx,mr;
  double error_u = 0.0; // in rounding units

  mpfr_init2(m1, 24);
  mpfr_init2(mx, 24);
  mpfr_init2(mr, 128);

  mpfr_set_flt(m1, 1.f, MPFR_RNDN);
#endif

  int32_t  error_max = 0;
  int32_t  error_min = 0;
  uint32_t error_cnt = 0;

  printf("\n----------------------------------------------------------------------------\n");
  printf("starting: %s on [%08x, %08x]\n", entry->name, xs,xe);

  float (*f)(float) = entry->f;

  uint32_t error[max_ulp_dist+1] = {0};

  for(uint32_t xi=xs; xi<=xe; xi++) {
    float x  = f32_from_bits(xi);
    float cr = 1.f/x;
    float r0 = f(x);

    if (r0 == cr)
      error[0]++;
    else {
      int32_t u = f32_ulp_sdist_ss(cr,r0);

      if (u > error_max) error_max = u;
      if (u < error_min) error_min = u;

      uint32_t u0 = u32_abs((uint32_t)u);

      // track it if small enough for the buckets
      if (u0 <= max_ulp_dist)
	error[u0]++;
      else
	error_cnt++;

#if defined(USE_MPFR)
      // recompute in MP
      mpfr_set_flt(mx, x,  MPFR_RNDN);
      mpfr_div(mr,m1,mx, MPFR_RNDN);

      // absolute error
      mpfr_set_flt(mx, r0,  MPFR_RNDN);
      mpfr_sub(mr,mr,mx, MPFR_RNDA);

      // WARNING: cheating: only works on (1,2) - the
      // product. should be correct by exp of result
      double merror = 0x1p24f*fabs(mpfr_get_d(mr, MPFR_RNDA));

      if (merror > error_u) error_u = merror;

#endif
    }
  }


  printf("// min/max ulp   %d %d\n", error_min, error_max);
#if defined(USE_MPFR)
  printf("// abs max ulp   %f\n", error_u);
#endif
  printf("// correctly:  %10f%% (%d)\n", 100.0*(double)(error[0])/(double)(total), error[0]);

  if (error[1])
    printf("// faithfully: %10f%% (%d)\n", 100.0*(double)(error[1])/(double)(total), error[1]);

  for(uint32_t i=2; i<=max_ulp_dist; i++)
    if (error[i])
      printf("// %3d ulp:    %10f%% (%d)\n", i,100.0*(double)(error[i])/(double)(total), error[i]);

  if (error_cnt)
    printf("//  >%d ulp:    %10f%% (%d)\n", max_ulp_dist,100.0*(double)(error_cnt)/(double)(total), error_cnt);

}

//********************************************************

int main(void)
{
#if !defined(FP_NAUGHTY_OPTIONS)
  for(uint32_t i=0; i<LENGTHOF(recip_table); i++)
    accuracy_test(recip_table+i);
#else
  printf("LOL! No\n");
#endif
  return 0;
}
	----------------------------------------------------------------------------
	starting: vrecpe_f32 on [3f800000, 40000000]
	// min/max ulp -45502 44978
	// abs max ulp 45502.375051
	// correctly: 0.002611% (219)
	// faithfully: 0.005555% (466)
	// 2 ulp: 0.005436% (456)
	// 3 ulp: 0.005460% (458)
	// 4 ulp: 0.005329% (447)
	// 5 ulp: 0.005531% (464)
	// >5 ulp: 99.970078% (8386099)

	----------------------------------------------------------------------------
	starting: arm_nr_op_1 on [3f800000, 40000000]
	// min/max ulp -1 129
	// abs max ulp 128.903094
	// correctly: 10.676442% (895605)
	// faithfully: 11.139952% (934487)
	// 2 ulp: 6.347620% (532477)
	// 3 ulp: 4.953718% (415548)
	// 4 ulp: 4.155027% (348549)
	// 5 ulp: 3.647220% (305951)
	// >5 ulp: 59.080021% (4955992)

	----------------------------------------------------------------------------
	starting: arm_nr_f2_1 on [3f800000, 40000000]
	// min/max ulp 0 128
	// abs max ulp 128.090785
	// correctly: 12.101339% (1015134)
	// faithfully: 9.859215% (827051)
	// 2 ulp: 6.272411% (526168)
	// 3 ulp: 4.919719% (412696)
	// 4 ulp: 4.147815% (347944)
	// 5 ulp: 3.636097% (305018)
	// >5 ulp: 59.063404% (4954598)

	----------------------------------------------------------------------------
	starting: arm_nr_op_2 on [3f800000, 40000000]
	// min/max ulp -1 1
	// abs max ulp 1.487251
	// correctly: 67.627648% (5673019)
	// faithfully: 32.372352% (2715590)

	----------------------------------------------------------------------------
	starting: arm_nr_f2_2 on [3f800000, 40000000]
	// min/max ulp 0 1
	// abs max ulp 0.500814
	// correctly: 99.994457% (8388144)
	// faithfully: 0.005543% (465)

	----------------------------------------------------------------------------
	starting: arm_hm_1 on [3f800000, 40000000]
	// min/max ulp -1 1
	// abs max ulp 0.855433
	// correctly: 97.301376% (8162232)
	// faithfully: 2.698624% (226377)

	----------------------------------------------------------------------------
	starting: sse_rcp_approx on [3f800000, 40000000]
	// min/max ulp -4996 4989
	// abs max ulp 4995.549501
	// correctly: 0.028241% (2369)
	// faithfully: 0.057113% (4791)
	// 2 ulp: 0.056624% (4750)
	// 3 ulp: 0.057054% (4786)
	// 4 ulp: 0.056839% (4768)
	// 5 ulp: 0.057185% (4797)
	// >5 ulp: 99.686945% (8362348)

	----------------------------------------------------------------------------
	starting: sse_nr_s1_1 on [3f800000, 40000000]
	// min/max ulp -2 3
	// abs max ulp 3.327252
	// correctly: 58.715444% (4925409)
	// faithfully: 39.751287% (3334580)
	// 2 ulp: 1.516497% (127213)
	// 3 ulp: 0.016773% (1407)

	----------------------------------------------------------------------------
	starting: sse_nr_s2_1 on [3f800000, 40000000]
	// min/max ulp -1 3
	// abs max ulp 2.756861
	// correctly: 68.920032% (5781432)
	// faithfully: 30.598839% (2566817)
	// 2 ulp: 0.480151% (40278)
	// 3 ulp: 0.000978% (82)

	----------------------------------------------------------------------------
	starting: sse_nr_fp_1 on [3f800000, 40000000]
	// min/max ulp -1 3
	// abs max ulp 2.848300
	// correctly: 68.949548% (5783908)
	// faithfully: 30.556294% (2563248)
	// 2 ulp: 0.493121% (41366)
	// 3 ulp: 0.001037% (87)

	----------------------------------------------------------------------------
	starting: sse_nr_f2_1 on [3f800000, 40000000]
	// min/max ulp 0 2
	// abs max ulp 1.960132
	// correctly: 84.322967% (7073524)
	// faithfully: 15.620623% (1310353)
	// 2 ulp: 0.056410% (4732)

	----------------------------------------------------------------------------
	starting: sse_nr_s1_2 on [3f800000, 40000000]
	// min/max ulp -2 1
	// abs max ulp 1.811487
	// correctly: 70.040599% (5875432)
	// faithfully: 29.943165% (2511815)
	// 2 ulp: 0.016236% (1362)

	----------------------------------------------------------------------------
	starting: sse_nr_s2_2 on [3f800000, 40000000]
	// min/max ulp -1 1
	// abs max ulp 1.248996
	// correctly: 83.297946% (6987539)
	// faithfully: 16.702054% (1401070)

	----------------------------------------------------------------------------
	starting: sse_nr_fp_2 on [3f800000, 40000000]
	// min/max ulp -1 1
	// abs max ulp 1.249992
	// correctly: 78.509786% (6585879)
	// faithfully: 21.490214% (1802730)

	----------------------------------------------------------------------------
	starting: sse_nr_f2_2 on [3f800000, 40000000]
	// min/max ulp 0 1
	// abs max ulp 0.500000
	// correctly: 99.999988% (8388608)
	// faithfully: 0.000012% (1)

	----------------------------------------------------------------------------
	starting: sse_hm_1 on [3f800000, 40000000]
	// min/max ulp -1 1
	// abs max ulp 0.500378
	// correctly: 99.996150% (8388286)
	// faithfully: 0.003850% (323)

	----------------------------------------------------------------------------
	starting: bit_rcp_approx on [3f800000, 40000000]
	// min/max ulp -159498 1279760
	// abs max ulp 1279760.000000
	// correctly: 0.000107% (9)
	// faithfully: 0.000203% (17)
	// 2 ulp: 0.000203% (17)
	// 3 ulp: 0.000215% (18)
	// 4 ulp: 0.000203% (17)
	// 5 ulp: 0.000203% (17)
	// >5 ulp: 99.998868% (8388514)

	----------------------------------------------------------------------------
	starting: bit_nr_s1_1 on [3f800000, 40000000]
	// min/max ulp -2 97621
	// abs max ulp 97621.000002
	// correctly: 0.375867% (31530)
	// faithfully: 0.473118% (39688)
	// 2 ulp: 0.263309% (22088)
	// 3 ulp: 0.208855% (17520)
	// 4 ulp: 0.178814% (15000)
	// 5 ulp: 0.159001% (13338)
	// >5 ulp: 98.341036% (8249445)

	----------------------------------------------------------------------------
	starting: bit_nr_s2_1 on [3f800000, 40000000]
	// min/max ulp -1 97620
	// abs max ulp 97620.000002
	// correctly: 0.414848% (34800)
	// faithfully: 0.442111% (37087)
	// 2 ulp: 0.258505% (21685)
	// 3 ulp: 0.207484% (17405)
	// 4 ulp: 0.178170% (14946)
	// 5 ulp: 0.159025% (13340)
	// >5 ulp: 98.339856% (8249346)

	----------------------------------------------------------------------------
	starting: bit_nr_fp_1 on [3f800000, 40000000]
	// min/max ulp -1 97620
	// abs max ulp 97620.000009
	// correctly: 0.414467% (34768)
	// faithfully: 0.442910% (37154)
	// 2 ulp: 0.257874% (21632)
	// 3 ulp: 0.207269% (17387)
	// 4 ulp: 0.178623% (14984)
	// 5 ulp: 0.158823% (13323)
	// >5 ulp: 98.340035% (8249361)

	----------------------------------------------------------------------------
	starting: bit_nr_f2_1 on [3f800000, 40000000]
	// min/max ulp 0 97620
	// abs max ulp 97620.000000
	// correctly: 0.472355% (39624)
	// faithfully: 0.391078% (32806)
	// 2 ulp: 0.254703% (21366)
	// 3 ulp: 0.206232% (17300)
	// 4 ulp: 0.177562% (14895)
	// 5 ulp: 0.158906% (13330)
	// >5 ulp: 98.339164% (8249288)

	----------------------------------------------------------------------------
	starting: bit_nr_s1_2 on [3f800000, 40000000]
	// min/max ulp -2 569
	// abs max ulp 569.072379
	// correctly: 26.625213% (2233485)
	// faithfully: 23.364255% (1959936)
	// 2 ulp: 2.729594% (228975)
	// 3 ulp: 1.450908% (121711)
	// 4 ulp: 1.125371% (94403)
	// 5 ulp: 0.922000% (77343)
	// >5 ulp: 43.782658% (3672756)

	----------------------------------------------------------------------------
	starting: bit_nr_s2_2 on [3f800000, 40000000]
	// min/max ulp -1 568
	// abs max ulp 568.072642
	// correctly: 36.899193% (3095329)
	// faithfully: 13.862108% (1162838)
	// 2 ulp: 2.033031% (170543)
	// 3 ulp: 1.418936% (119029)
	// 4 ulp: 1.106083% (92785)
	// 5 ulp: 0.915563% (76803)
	// >5 ulp: 43.765087% (3671282)

	----------------------------------------------------------------------------
	starting: bit_nr_fp_2 on [3f800000, 40000000]
	// min/max ulp -1 569
	// abs max ulp 569.015503
	// correctly: 30.147156% (2528927)
	// faithfully: 20.481131% (1718082)
	// 2 ulp: 2.129233% (178613)
	// 3 ulp: 1.432490% (120166)
	// 4 ulp: 1.116001% (93617)
	// 5 ulp: 0.920248% (77196)
	// >5 ulp: 43.773741% (3672008)

	----------------------------------------------------------------------------
	starting: bit_nr_f2_2 on [3f800000, 40000000]
	// min/max ulp 0 568
	// abs max ulp 568.022911
	// correctly: 41.278727% (3462711)
	// faithfully: 9.518491% (798469)
	// 2 ulp: 2.011239% (168715)
	// 3 ulp: 1.407313% (118054)
	// 4 ulp: 1.107311% (92888)
	// 5 ulp: 0.915146% (76768)
	// >5 ulp: 43.761773% (3671004)

	----------------------------------------------------------------------------
	starting: bit_nr_s1_3 on [3f800000, 40000000]
	// min/max ulp -2 2
	// abs max ulp 2.005560
	// correctly: 62.444310% (5238209)
	// faithfully: 36.752136% (3082993)
	// 2 ulp: 0.803554% (67407)

	----------------------------------------------------------------------------
	starting: bit_nr_s2_3 on [3f800000, 40000000]
	// min/max ulp -1 1
	// abs max ulp 1.248562
	// correctly: 83.570685% (7010418)
	// faithfully: 16.429315% (1378191)

	----------------------------------------------------------------------------
	starting: bit_nr_fp_3 on [3f800000, 40000000]
	// min/max ulp -1 1
	// abs max ulp 1.499907
	// correctly: 72.277358% (6063065)
	// faithfully: 27.722642% (2325544)

	----------------------------------------------------------------------------
	starting: bit_nr_f2_3 on [3f800000, 40000000]
	// min/max ulp 0 1
	// abs max ulp 0.518618
	// correctly: 99.795926% (8371490)
	// faithfully: 0.204074% (17119)

	----------------------------------------------------------------------------
	starting: bit_hm_1 on [3f800000, 40000000]
	// min/max ulp -29 7446
	// abs max ulp 7446.000748
	// correctly: 8.088600% (678521)
	// faithfully: 4.319632% (362357)
	// 2 ulp: 2.463150% (206624)
	// 3 ulp: 1.901841% (159538)
	// 4 ulp: 1.596606% (133933)
	// 5 ulp: 1.409435% (118232)
	// >5 ulp: 80.220737% (6729404)

	----------------------------------------------------------------------------
	starting: bit_hm_2 on [3f800000, 40000000]
	// min/max ulp -1 1
	// abs max ulp 0.501463
	// correctly: 99.984026% (8387269)
	// faithfully: 0.015974% (1340)

	----------------------------------------------------------------------------
	starting: bit_hm_3 on [3f800000, 40000000]
	// min/max ulp 0 1
	// abs max ulp 0.500000
	// correctly: 99.999988% (8388608)
	// faithfully: 0.000012% (1)

	----------------------------------------------------------------------------
	starting: day_rcp_approx on [3f800000, 40000000]
	// min/max ulp -1768 1874
	// abs max ulp 1874.000126
	// correctly: 0.025952% (2177)
	// faithfully: 0.052655% (4417)
	// 2 ulp: 0.052273% (4385)
	// 3 ulp: 0.052178% (4377)
	// 4 ulp: 0.052631% (4415)
	// 5 ulp: 0.052297% (4387)
	// >5 ulp: 99.712014% (8364451)

	----------------------------------------------------------------------------
	starting: day_nr_s1_1 on [3f800000, 40000000]
	// min/max ulp -2 2
	// abs max ulp 2.133561
	// correctly: 61.393754% (5150082)
	// faithfully: 38.129146% (3198505)
	// 2 ulp: 0.477099% (40022)

	----------------------------------------------------------------------------
	starting: day_nr_s2_1 on [3f800000, 40000000]
	// min/max ulp -1 2
	// abs max ulp 1.637964
	// correctly: 72.490064% (6080908)
	// faithfully: 27.491185% (2306128)
	// 2 ulp: 0.018752% (1573)

	----------------------------------------------------------------------------
	starting: day_nr_fp_1 on [3f800000, 40000000]
	// min/max ulp -1 2
	// abs max ulp 1.657846
	// correctly: 74.109808% (6216782)
	// faithfully: 25.886568% (2171523)
	// 2 ulp: 0.003624% (304)

	----------------------------------------------------------------------------
	starting: day_nr_f2_1 on [3f800000, 40000000]
	// min/max ulp 0 1
	// abs max ulp 0.701917
	// correctly: 92.858673% (7789551)
	// faithfully: 7.141327% (599058)

	----------------------------------------------------------------------------
	starting: day_nr_s1_2 on [3f800000, 40000000]
	// min/max ulp -1 1
	// abs max ulp 1.499220
	// correctly: 71.204535% (5973070)
	// faithfully: 28.795465% (2415539)

	----------------------------------------------------------------------------
	starting: day_nr_s2_2 on [3f800000, 40000000]
	// min/max ulp -1 1
	// abs max ulp 1.249477
	// correctly: 82.748487% (6941447)
	// faithfully: 17.251513% (1447162)

	----------------------------------------------------------------------------
	starting: day_nr_fp_2 on [3f800000, 40000000]
	// min/max ulp -1 1
	// abs max ulp 1.249469
	// correctly: 81.148519% (6807232)
	// faithfully: 18.851481% (1581377)

	----------------------------------------------------------------------------
	starting: day_nr_f2_2 on [3f800000, 40000000]
	// min/max ulp 0 1
	// abs max ulp 0.500000
	// correctly: 99.999988% (8388608)
	// faithfully: 0.000012% (1)

	----------------------------------------------------------------------------
	starting: day_hm_1 on [3f800000, 40000000]
	// min/max ulp -1 1
	// abs max ulp 0.500096
	// correctly: 99.997318% (8388384)
	// faithfully: 0.002682% (225)
	// Public Domain under http://unlicense.org, see link for details.

	// compile with (or equiv):
	// clang -O3 -march=native -Wall -Wextra -Wconversion -Wpedantic -Wno-unused-function -fno-math-errno -ffp-contract=off recip.c -o recip -lm

	#include <stdint.h>
	#include <stdio.h>
	#include <math.h>
	#include <string.h>

	#if defined(__FAST_MATH__) \|\| defined(_M_FP_FAST)
	#define FP_NAUGHTY_OPTIONS
	#endif

	// configuration section:

	// additionally use MPFR to measure max error (requires -lmpfr to cmdline)
	#define USE_MPFR

	// run numbers for portable bithacking versions: recip_approx_{n}
	#define INCLUDE_BITHACK

	// run numbers for ARM inital approximation and NR step opcodes.
	// WARNING: I wrote these from the spec and they could be incorrect.
	// Too lazy to setup and test on real hardware.
	#define INCLUDE_ARM


	// to avoid using a compiler extension. sigh.
	#define max_ulp_dist 5

	#define STRINGIFY(S) STRINGIFY_(S)
	#define STRINGIFY_(S) #S
	#define LENGTHOF(X) (sizeof(X)/sizeof(X[0]))


	float recip_ref(float x) { return 1.f/x; }

	//********************************************************
	// Newton-Ralphson steps

	static inline float nr_step_s1(float y, float x)
	{
	return y(2.f - xy);
	}

	static inline float nr_step_s2(float y, float x)
	{
	return (y+y) - xyy;
	}

	static inline float nr_step_fp(float y, float x)
	{
	return -y*fmaf(x,y,-2.f);
	}

	static inline float nr_step_f2(float y, float x)
	{
	return fmaf(-fmaf(x,y,-1.f), y,y);
	}


	//********************************************************
	// Halley's method

	static inline float hm_step(float y, float x)
	{
	float t;

	t = fmaf(y, -x, 1.0f);
	y = fmaf(fmaf(t,t,t), y, y);

	return y;
	}

	//********************************************************
	// hardware approximation: SSE

	#if defined(__x86_64__) \|\| defined(__x86_64) \|\| defined(_M_X64) \|\| defined(_M_AMD64)
	// wrap an intel intrinsic
	#include <x86intrin.h>

	#define F32_SSE_WRAP(F,X) _mm_cvtss_f32(F(_mm_set_ss(X)))

	// return hardware 1/x approx
	static inline float sse_rcp_approx(float x)
	{
	return _mm_cvtss_f32(_mm_rcp_ss(_mm_set_ss(x)));
	}

	#endif


	//********************************************************
	// hardware approximation: neon

	inline uint32_t f32_to_bits(float x)
	{
	uint32_t u; memcpy(&u,&x,4); return u;
	}

	inline float f32_from_bits(uint32_t x)
	{
	float u; memcpy(&u,&x,4); return u;
	}

	// emulate neon approximation ops (std & not increased precision ones)

	// ignore all special cases (including 0): don't
	// need them for local testing. written to be eyeball-able to spec
	float vrecpe_f32(float x)
	{
	// extract the parts
	uint32_t ix = f32_to_bits(x);
	uint32_t sx = ((ix >> 31) & 0x01);
	uint32_t ex = ((ix >> 23) & 0xff);
	uint32_t mx = ((ix ) & 0x7fffff);
	uint32_t t;

	mx = 0x100\|(mx >> 15); // add implied bit and top 8 explict
	ex = 253-ex; // adjust the exp

	// SEE: ARM Spec - FPRecipEstimate
	t = (mx<<1)+1;
	t = (((1<<19)/t)+1)>>1;
	mx = (t & 0xff) << 15;

	// concat the parts back together
	ix = (sx & 0x01) << 31;
	ix \|= (ex & 0xff) << 23;
	ix \|= (mx & 0x7fffff);

	return f32_from_bits(ix);
	}

	// as the hardware op RECPS
	static inline float vrecps_f32(float a, float b)
	{
	return -fmaf(a,b,-2.f);
	}

	// how you'd perform a 'step' with RECPS
	static inline float arm_step(float y, float x)
	{
	return y*vrecps_f32(x,y);
	}


	//********************************************************
	// bit hack version (well. it's portable!)

	static inline float recip_inital_step(uint32_t K, float x)
	{
	return f32_from_bits(K-f32_to_bits(x));
	}

	// magic constant isn't optimal. building an optimal constant requires
	// knowing the number and type of correction steps.
	static inline float bit_rcp_approx(float x) { return recip_inital_step(0x7eec78f0,x); }


	// "Generalising the Fast Reciprocal Square Root Algorithm", Mike Day, 2023
	// (PDF: https://arxiv.org/abs/2307.15600)

	// listing 8:
	static inline float day_rcp_approx(float x)
	{
	static const float c0 = 0.6966215f;
	static const float c1 = -0.12130684f;

	float y = recip_inital_step(0x7fb504ec, x);

	// ordering that minimizes >5 ulp results
	return y * fmaf(x, y*c1, c0);
	}


	//********************************************************
	// let the compiler expand the variants

	// expand: 'inital' approximation followed by 'n' rounds of 'step'
	inline float simple(float x, float (initial)(float), float (step)(float, float), uint32_t n)
	{
	float y = initial(x);

	for(uint32_t i=0; i<n; i++)
	y = step(y,x);

	return y;
	}


	#if defined(INCLUDE_ARM)
	// using hardware step
	float arm_nr_op_1(float x) { return simple(x, &vrecpe_f32, &arm_step, 1); }
	float arm_nr_op_2(float x) { return simple(x, &vrecpe_f32, &arm_step, 2); }

	// using 2 fma NR step
	float arm_nr_f2_1(float x) { return simple(x, &vrecpe_f32, &nr_step_f2, 1); }
	float arm_nr_f2_2(float x) { return simple(x, &vrecpe_f32, &nr_step_f2, 2); }

	float arm_hm_1(float x) { return simple(x, &vrecpe_f32, &hm_step, 1); }
	#endif

	// expand SSE
	float sse_nr_s1_1(float x) { return simple(x, &sse_rcp_approx, &nr_step_s1, 1); }
	float sse_nr_s2_1(float x) { return simple(x, &sse_rcp_approx, &nr_step_s2, 1); }
	float sse_nr_fp_1(float x) { return simple(x, &sse_rcp_approx, &nr_step_fp, 1); }
	float sse_nr_f2_1(float x) { return simple(x, &sse_rcp_approx, &nr_step_f2, 1); }
	float sse_nr_s1_2(float x) { return simple(x, &sse_rcp_approx, &nr_step_s1, 2); }
	float sse_nr_s2_2(float x) { return simple(x, &sse_rcp_approx, &nr_step_s2, 2); }
	float sse_nr_fp_2(float x) { return simple(x, &sse_rcp_approx, &nr_step_fp, 2); }
	float sse_nr_f2_2(float x) { return simple(x, &sse_rcp_approx, &nr_step_f2, 2); }
	float sse_hm_1(float x) { return simple(x, &sse_rcp_approx, &hm_step, 1); }

	#if defined(INCLUDE_BITHACK)
	float bit_nr_s1_1(float x) { return simple(x, &bit_rcp_approx, &nr_step_s1, 1); }
	float bit_nr_s2_1(float x) { return simple(x, &bit_rcp_approx, &nr_step_s2, 1); }
	float bit_nr_fp_1(float x) { return simple(x, &bit_rcp_approx, &nr_step_fp, 1); }
	float bit_nr_f2_1(float x) { return simple(x, &bit_rcp_approx, &nr_step_f2, 1); }
	float bit_nr_s1_2(float x) { return simple(x, &bit_rcp_approx, &nr_step_s1, 2); }
	float bit_nr_s2_2(float x) { return simple(x, &bit_rcp_approx, &nr_step_s2, 2); }
	float bit_nr_fp_2(float x) { return simple(x, &bit_rcp_approx, &nr_step_fp, 2); }
	float bit_nr_f2_2(float x) { return simple(x, &bit_rcp_approx, &nr_step_f2, 2); }
	float bit_nr_s1_3(float x) { return simple(x, &bit_rcp_approx, &nr_step_s1, 3); }
	float bit_nr_s2_3(float x) { return simple(x, &bit_rcp_approx, &nr_step_s2, 3); }
	float bit_nr_fp_3(float x) { return simple(x, &bit_rcp_approx, &nr_step_fp, 3); }
	float bit_nr_f2_3(float x) { return simple(x, &bit_rcp_approx, &nr_step_f2, 3); }
	float bit_hm_1(float x) { return simple(x, &bit_rcp_approx, &hm_step, 1); }
	float bit_hm_2(float x) { return simple(x, &bit_rcp_approx, &hm_step, 2); }
	float bit_hm_3(float x) { return simple(x, &bit_rcp_approx, &hm_step, 3); }
	#endif

	float day_nr_s1_1(float x) { return simple(x, &day_rcp_approx, &nr_step_s1, 1); }
	float day_nr_s2_1(float x) { return simple(x, &day_rcp_approx, &nr_step_s2, 1); }
	float day_nr_fp_1(float x) { return simple(x, &day_rcp_approx, &nr_step_fp, 1); }
	float day_nr_f2_1(float x) { return simple(x, &day_rcp_approx, &nr_step_f2, 1); }
	float day_nr_s1_2(float x) { return simple(x, &day_rcp_approx, &nr_step_s1, 2); }
	float day_nr_s2_2(float x) { return simple(x, &day_rcp_approx, &nr_step_s2, 2); }
	float day_nr_fp_2(float x) { return simple(x, &day_rcp_approx, &nr_step_fp, 2); }
	float day_nr_f2_2(float x) { return simple(x, &day_rcp_approx, &nr_step_f2, 2); }
	float day_hm_1(float x) { return simple(x, &day_rcp_approx, &hm_step, 1); }


	//********************************************************


	typedef struct {
	float (*f)(float);
	char* name;
	} recip_entry_t;

	#define ENTRY(X) { .f=&X, .name=STRINGIFY(X) }

	recip_entry_t recip_table[] =
	{
	#if defined(INCLUDE_ARM)
	ENTRY(vrecpe_f32),
	ENTRY(arm_nr_op_1),
	ENTRY(arm_nr_f2_1),
	ENTRY(arm_nr_op_2),
	ENTRY(arm_nr_f2_2),
	ENTRY(arm_hm_1),
	#endif

	// SSE expansions
	ENTRY(sse_rcp_approx),
	ENTRY(sse_nr_s1_1),
	ENTRY(sse_nr_s2_1),
	ENTRY(sse_nr_fp_1),
	ENTRY(sse_nr_f2_1),
	ENTRY(sse_nr_s1_2),
	ENTRY(sse_nr_s2_2),
	ENTRY(sse_nr_fp_2),
	ENTRY(sse_nr_f2_2),
	ENTRY(sse_hm_1),

	#if defined(INCLUDE_BITHACK)
	ENTRY(bit_rcp_approx),
	ENTRY(bit_nr_s1_1),
	ENTRY(bit_nr_s2_1),
	ENTRY(bit_nr_fp_1),
	ENTRY(bit_nr_f2_1),
	ENTRY(bit_nr_s1_2),
	ENTRY(bit_nr_s2_2),
	ENTRY(bit_nr_fp_2),
	ENTRY(bit_nr_f2_2),
	ENTRY(bit_nr_s1_3),
	ENTRY(bit_nr_s2_3),
	ENTRY(bit_nr_fp_3),
	ENTRY(bit_nr_f2_3),
	ENTRY(bit_hm_1),
	ENTRY(bit_hm_2),
	ENTRY(bit_hm_3),
	#endif

	ENTRY(day_rcp_approx),
	ENTRY(day_nr_s1_1),
	ENTRY(day_nr_s2_1),
	ENTRY(day_nr_fp_1),
	ENTRY(day_nr_f2_1),
	ENTRY(day_nr_s1_2),
	ENTRY(day_nr_s2_2),
	ENTRY(day_nr_fp_2),
	ENTRY(day_nr_f2_2),
	ENTRY(day_hm_1)
	};

	#undef ENTRY

	//********************************************************

	inline uint32_t u32_abs(uint32_t x)
	{
	return (int32_t)x >= 0 ? x : -x;
	}

	// signed distance: a & b must have same sign
	inline int32_t f32_ulp_sdist_ss(float a, float b)
	{
	uint32_t ua = f32_to_bits(a);
	uint32_t ub = f32_to_bits(b);
	return (int32_t)(ua-ub);
	}

	#if defined(USE_MPFR)
	#include <mpfr.h>
	#endif

	void accuracy_test(recip_entry_t* entry)
	{
	const uint32_t xs = f32_to_bits(1.f);
	const uint32_t xe = f32_to_bits(2.f);
	const uint32_t total = xe-xs+1;

	#if defined(USE_MPFR)
	mpfr_t m1,mx,mr;
	double error_u = 0.0; // in rounding units

	mpfr_init2(m1, 24);
	mpfr_init2(mx, 24);
	mpfr_init2(mr, 128);

	mpfr_set_flt(m1, 1.f, MPFR_RNDN);
	#endif

	int32_t error_max = 0;
	int32_t error_min = 0;
	uint32_t error_cnt = 0;

	printf("\n----------------------------------------------------------------------------\n");
	printf("starting: %s on [%08x, %08x]\n", entry->name, xs,xe);

	float (*f)(float) = entry->f;

	uint32_t error[max_ulp_dist+1] = {0};

	for(uint32_t xi=xs; xi<=xe; xi++) {
	float x = f32_from_bits(xi);
	float cr = 1.f/x;
	float r0 = f(x);

	if (r0 == cr)
	error[0]++;
	else {
	int32_t u = f32_ulp_sdist_ss(cr,r0);

	if (u > error_max) error_max = u;
	if (u < error_min) error_min = u;

	uint32_t u0 = u32_abs((uint32_t)u);

	// track it if small enough for the buckets
	if (u0 <= max_ulp_dist)
	error[u0]++;
	else
	error_cnt++;

	#if defined(USE_MPFR)
	// recompute in MP
	mpfr_set_flt(mx, x, MPFR_RNDN);
	mpfr_div(mr,m1,mx, MPFR_RNDN);

	// absolute error
	mpfr_set_flt(mx, r0, MPFR_RNDN);
	mpfr_sub(mr,mr,mx, MPFR_RNDA);

	// WARNING: cheating: only works on (1,2) - the
	// product. should be correct by exp of result
	double merror = 0x1p24f*fabs(mpfr_get_d(mr, MPFR_RNDA));

	if (merror > error_u) error_u = merror;

	#endif
	}
	}


	printf("// min/max ulp %d %d\n", error_min, error_max);
	#if defined(USE_MPFR)
	printf("// abs max ulp %f\n", error_u);
	#endif
	printf("// correctly: %10f%% (%d)\n", 100.0*(double)(error[0])/(double)(total), error[0]);

	if (error[1])
	printf("// faithfully: %10f%% (%d)\n", 100.0*(double)(error[1])/(double)(total), error[1]);

	for(uint32_t i=2; i<=max_ulp_dist; i++)
	if (error[i])
	printf("// %3d ulp: %10f%% (%d)\n", i,100.0*(double)(error[i])/(double)(total), error[i]);

	if (error_cnt)
	printf("// >%d ulp: %10f%% (%d)\n", max_ulp_dist,100.0*(double)(error_cnt)/(double)(total), error_cnt);

	}

	//********************************************************

	int main(void)
	{
	#if !defined(FP_NAUGHTY_OPTIONS)
	for(uint32_t i=0; i<LENGTHOF(recip_table); i++)
	accuracy_test(recip_table+i);
	#else
	printf("LOL! No\n");
	#endif
	return 0;
	}