Skip to content

Instantly share code, notes, and snippets.

@Marc-B-Reynolds
Last active August 7, 2023 13:20
Show Gist options
  • Save Marc-B-Reynolds/bab0efc00c55be52f8663a762131ada2 to your computer and use it in GitHub Desktop.
Save Marc-B-Reynolds/bab0efc00c55be52f8663a762131ada2 to your computer and use it in GitHub Desktop.
Error bound checks for 1/x approximation: sse, arm and hacky initial value
----------------------------------------------------------------------------
starting: vrecpe_f32 on [3f800000, 40000000]
// min/max ulp -45502 44978
// abs max ulp 45502.375051
// correctly: 0.002611% (219)
// faithfully: 0.005555% (466)
// 2 ulp: 0.005436% (456)
// 3 ulp: 0.005460% (458)
// 4 ulp: 0.005329% (447)
// 5 ulp: 0.005531% (464)
// >5 ulp: 99.970078% (8386099)
----------------------------------------------------------------------------
starting: arm_nr_op_1 on [3f800000, 40000000]
// min/max ulp -1 129
// abs max ulp 128.903094
// correctly: 10.676442% (895605)
// faithfully: 11.139952% (934487)
// 2 ulp: 6.347620% (532477)
// 3 ulp: 4.953718% (415548)
// 4 ulp: 4.155027% (348549)
// 5 ulp: 3.647220% (305951)
// >5 ulp: 59.080021% (4955992)
----------------------------------------------------------------------------
starting: arm_nr_f2_1 on [3f800000, 40000000]
// min/max ulp 0 128
// abs max ulp 128.090785
// correctly: 12.101339% (1015134)
// faithfully: 9.859215% (827051)
// 2 ulp: 6.272411% (526168)
// 3 ulp: 4.919719% (412696)
// 4 ulp: 4.147815% (347944)
// 5 ulp: 3.636097% (305018)
// >5 ulp: 59.063404% (4954598)
----------------------------------------------------------------------------
starting: arm_nr_op_2 on [3f800000, 40000000]
// min/max ulp -1 1
// abs max ulp 1.487251
// correctly: 67.627648% (5673019)
// faithfully: 32.372352% (2715590)
----------------------------------------------------------------------------
starting: arm_nr_f2_2 on [3f800000, 40000000]
// min/max ulp 0 1
// abs max ulp 0.500814
// correctly: 99.994457% (8388144)
// faithfully: 0.005543% (465)
----------------------------------------------------------------------------
starting: arm_hm_1 on [3f800000, 40000000]
// min/max ulp -1 1
// abs max ulp 0.855433
// correctly: 97.301376% (8162232)
// faithfully: 2.698624% (226377)
----------------------------------------------------------------------------
starting: sse_rcp_approx on [3f800000, 40000000]
// min/max ulp -4996 4989
// abs max ulp 4995.549501
// correctly: 0.028241% (2369)
// faithfully: 0.057113% (4791)
// 2 ulp: 0.056624% (4750)
// 3 ulp: 0.057054% (4786)
// 4 ulp: 0.056839% (4768)
// 5 ulp: 0.057185% (4797)
// >5 ulp: 99.686945% (8362348)
----------------------------------------------------------------------------
starting: sse_nr_s1_1 on [3f800000, 40000000]
// min/max ulp -2 3
// abs max ulp 3.327252
// correctly: 58.715444% (4925409)
// faithfully: 39.751287% (3334580)
// 2 ulp: 1.516497% (127213)
// 3 ulp: 0.016773% (1407)
----------------------------------------------------------------------------
starting: sse_nr_s2_1 on [3f800000, 40000000]
// min/max ulp -1 3
// abs max ulp 2.756861
// correctly: 68.920032% (5781432)
// faithfully: 30.598839% (2566817)
// 2 ulp: 0.480151% (40278)
// 3 ulp: 0.000978% (82)
----------------------------------------------------------------------------
starting: sse_nr_fp_1 on [3f800000, 40000000]
// min/max ulp -1 3
// abs max ulp 2.848300
// correctly: 68.949548% (5783908)
// faithfully: 30.556294% (2563248)
// 2 ulp: 0.493121% (41366)
// 3 ulp: 0.001037% (87)
----------------------------------------------------------------------------
starting: sse_nr_f2_1 on [3f800000, 40000000]
// min/max ulp 0 2
// abs max ulp 1.960132
// correctly: 84.322967% (7073524)
// faithfully: 15.620623% (1310353)
// 2 ulp: 0.056410% (4732)
----------------------------------------------------------------------------
starting: sse_nr_s1_2 on [3f800000, 40000000]
// min/max ulp -2 1
// abs max ulp 1.811487
// correctly: 70.040599% (5875432)
// faithfully: 29.943165% (2511815)
// 2 ulp: 0.016236% (1362)
----------------------------------------------------------------------------
starting: sse_nr_s2_2 on [3f800000, 40000000]
// min/max ulp -1 1
// abs max ulp 1.248996
// correctly: 83.297946% (6987539)
// faithfully: 16.702054% (1401070)
----------------------------------------------------------------------------
starting: sse_nr_fp_2 on [3f800000, 40000000]
// min/max ulp -1 1
// abs max ulp 1.249992
// correctly: 78.509786% (6585879)
// faithfully: 21.490214% (1802730)
----------------------------------------------------------------------------
starting: sse_nr_f2_2 on [3f800000, 40000000]
// min/max ulp 0 1
// abs max ulp 0.500000
// correctly: 99.999988% (8388608)
// faithfully: 0.000012% (1)
----------------------------------------------------------------------------
starting: sse_hm_1 on [3f800000, 40000000]
// min/max ulp -1 1
// abs max ulp 0.500378
// correctly: 99.996150% (8388286)
// faithfully: 0.003850% (323)
----------------------------------------------------------------------------
starting: bit_rcp_approx on [3f800000, 40000000]
// min/max ulp -159498 1279760
// abs max ulp 1279760.000000
// correctly: 0.000107% (9)
// faithfully: 0.000203% (17)
// 2 ulp: 0.000203% (17)
// 3 ulp: 0.000215% (18)
// 4 ulp: 0.000203% (17)
// 5 ulp: 0.000203% (17)
// >5 ulp: 99.998868% (8388514)
----------------------------------------------------------------------------
starting: bit_nr_s1_1 on [3f800000, 40000000]
// min/max ulp -2 97621
// abs max ulp 97621.000002
// correctly: 0.375867% (31530)
// faithfully: 0.473118% (39688)
// 2 ulp: 0.263309% (22088)
// 3 ulp: 0.208855% (17520)
// 4 ulp: 0.178814% (15000)
// 5 ulp: 0.159001% (13338)
// >5 ulp: 98.341036% (8249445)
----------------------------------------------------------------------------
starting: bit_nr_s2_1 on [3f800000, 40000000]
// min/max ulp -1 97620
// abs max ulp 97620.000002
// correctly: 0.414848% (34800)
// faithfully: 0.442111% (37087)
// 2 ulp: 0.258505% (21685)
// 3 ulp: 0.207484% (17405)
// 4 ulp: 0.178170% (14946)
// 5 ulp: 0.159025% (13340)
// >5 ulp: 98.339856% (8249346)
----------------------------------------------------------------------------
starting: bit_nr_fp_1 on [3f800000, 40000000]
// min/max ulp -1 97620
// abs max ulp 97620.000009
// correctly: 0.414467% (34768)
// faithfully: 0.442910% (37154)
// 2 ulp: 0.257874% (21632)
// 3 ulp: 0.207269% (17387)
// 4 ulp: 0.178623% (14984)
// 5 ulp: 0.158823% (13323)
// >5 ulp: 98.340035% (8249361)
----------------------------------------------------------------------------
starting: bit_nr_f2_1 on [3f800000, 40000000]
// min/max ulp 0 97620
// abs max ulp 97620.000000
// correctly: 0.472355% (39624)
// faithfully: 0.391078% (32806)
// 2 ulp: 0.254703% (21366)
// 3 ulp: 0.206232% (17300)
// 4 ulp: 0.177562% (14895)
// 5 ulp: 0.158906% (13330)
// >5 ulp: 98.339164% (8249288)
----------------------------------------------------------------------------
starting: bit_nr_s1_2 on [3f800000, 40000000]
// min/max ulp -2 569
// abs max ulp 569.072379
// correctly: 26.625213% (2233485)
// faithfully: 23.364255% (1959936)
// 2 ulp: 2.729594% (228975)
// 3 ulp: 1.450908% (121711)
// 4 ulp: 1.125371% (94403)
// 5 ulp: 0.922000% (77343)
// >5 ulp: 43.782658% (3672756)
----------------------------------------------------------------------------
starting: bit_nr_s2_2 on [3f800000, 40000000]
// min/max ulp -1 568
// abs max ulp 568.072642
// correctly: 36.899193% (3095329)
// faithfully: 13.862108% (1162838)
// 2 ulp: 2.033031% (170543)
// 3 ulp: 1.418936% (119029)
// 4 ulp: 1.106083% (92785)
// 5 ulp: 0.915563% (76803)
// >5 ulp: 43.765087% (3671282)
----------------------------------------------------------------------------
starting: bit_nr_fp_2 on [3f800000, 40000000]
// min/max ulp -1 569
// abs max ulp 569.015503
// correctly: 30.147156% (2528927)
// faithfully: 20.481131% (1718082)
// 2 ulp: 2.129233% (178613)
// 3 ulp: 1.432490% (120166)
// 4 ulp: 1.116001% (93617)
// 5 ulp: 0.920248% (77196)
// >5 ulp: 43.773741% (3672008)
----------------------------------------------------------------------------
starting: bit_nr_f2_2 on [3f800000, 40000000]
// min/max ulp 0 568
// abs max ulp 568.022911
// correctly: 41.278727% (3462711)
// faithfully: 9.518491% (798469)
// 2 ulp: 2.011239% (168715)
// 3 ulp: 1.407313% (118054)
// 4 ulp: 1.107311% (92888)
// 5 ulp: 0.915146% (76768)
// >5 ulp: 43.761773% (3671004)
----------------------------------------------------------------------------
starting: bit_nr_s1_3 on [3f800000, 40000000]
// min/max ulp -2 2
// abs max ulp 2.005560
// correctly: 62.444310% (5238209)
// faithfully: 36.752136% (3082993)
// 2 ulp: 0.803554% (67407)
----------------------------------------------------------------------------
starting: bit_nr_s2_3 on [3f800000, 40000000]
// min/max ulp -1 1
// abs max ulp 1.248562
// correctly: 83.570685% (7010418)
// faithfully: 16.429315% (1378191)
----------------------------------------------------------------------------
starting: bit_nr_fp_3 on [3f800000, 40000000]
// min/max ulp -1 1
// abs max ulp 1.499907
// correctly: 72.277358% (6063065)
// faithfully: 27.722642% (2325544)
----------------------------------------------------------------------------
starting: bit_nr_f2_3 on [3f800000, 40000000]
// min/max ulp 0 1
// abs max ulp 0.518618
// correctly: 99.795926% (8371490)
// faithfully: 0.204074% (17119)
----------------------------------------------------------------------------
starting: bit_hm_1 on [3f800000, 40000000]
// min/max ulp -29 7446
// abs max ulp 7446.000748
// correctly: 8.088600% (678521)
// faithfully: 4.319632% (362357)
// 2 ulp: 2.463150% (206624)
// 3 ulp: 1.901841% (159538)
// 4 ulp: 1.596606% (133933)
// 5 ulp: 1.409435% (118232)
// >5 ulp: 80.220737% (6729404)
----------------------------------------------------------------------------
starting: bit_hm_2 on [3f800000, 40000000]
// min/max ulp -1 1
// abs max ulp 0.501463
// correctly: 99.984026% (8387269)
// faithfully: 0.015974% (1340)
----------------------------------------------------------------------------
starting: bit_hm_3 on [3f800000, 40000000]
// min/max ulp 0 1
// abs max ulp 0.500000
// correctly: 99.999988% (8388608)
// faithfully: 0.000012% (1)
----------------------------------------------------------------------------
starting: day_rcp_approx on [3f800000, 40000000]
// min/max ulp -1768 1874
// abs max ulp 1874.000126
// correctly: 0.025952% (2177)
// faithfully: 0.052655% (4417)
// 2 ulp: 0.052273% (4385)
// 3 ulp: 0.052178% (4377)
// 4 ulp: 0.052631% (4415)
// 5 ulp: 0.052297% (4387)
// >5 ulp: 99.712014% (8364451)
----------------------------------------------------------------------------
starting: day_nr_s1_1 on [3f800000, 40000000]
// min/max ulp -2 2
// abs max ulp 2.133561
// correctly: 61.393754% (5150082)
// faithfully: 38.129146% (3198505)
// 2 ulp: 0.477099% (40022)
----------------------------------------------------------------------------
starting: day_nr_s2_1 on [3f800000, 40000000]
// min/max ulp -1 2
// abs max ulp 1.637964
// correctly: 72.490064% (6080908)
// faithfully: 27.491185% (2306128)
// 2 ulp: 0.018752% (1573)
----------------------------------------------------------------------------
starting: day_nr_fp_1 on [3f800000, 40000000]
// min/max ulp -1 2
// abs max ulp 1.657846
// correctly: 74.109808% (6216782)
// faithfully: 25.886568% (2171523)
// 2 ulp: 0.003624% (304)
----------------------------------------------------------------------------
starting: day_nr_f2_1 on [3f800000, 40000000]
// min/max ulp 0 1
// abs max ulp 0.701917
// correctly: 92.858673% (7789551)
// faithfully: 7.141327% (599058)
----------------------------------------------------------------------------
starting: day_nr_s1_2 on [3f800000, 40000000]
// min/max ulp -1 1
// abs max ulp 1.499220
// correctly: 71.204535% (5973070)
// faithfully: 28.795465% (2415539)
----------------------------------------------------------------------------
starting: day_nr_s2_2 on [3f800000, 40000000]
// min/max ulp -1 1
// abs max ulp 1.249477
// correctly: 82.748487% (6941447)
// faithfully: 17.251513% (1447162)
----------------------------------------------------------------------------
starting: day_nr_fp_2 on [3f800000, 40000000]
// min/max ulp -1 1
// abs max ulp 1.249469
// correctly: 81.148519% (6807232)
// faithfully: 18.851481% (1581377)
----------------------------------------------------------------------------
starting: day_nr_f2_2 on [3f800000, 40000000]
// min/max ulp 0 1
// abs max ulp 0.500000
// correctly: 99.999988% (8388608)
// faithfully: 0.000012% (1)
----------------------------------------------------------------------------
starting: day_hm_1 on [3f800000, 40000000]
// min/max ulp -1 1
// abs max ulp 0.500096
// correctly: 99.997318% (8388384)
// faithfully: 0.002682% (225)
// Public Domain under http://unlicense.org, see link for details.
// compile with (or equiv):
// clang -O3 -march=native -Wall -Wextra -Wconversion -Wpedantic -Wno-unused-function -fno-math-errno -ffp-contract=off recip.c -o recip -lm
#include <stdint.h>
#include <stdio.h>
#include <math.h>
#include <string.h>
#if defined(__FAST_MATH__) || defined(_M_FP_FAST)
#define FP_NAUGHTY_OPTIONS
#endif
// configuration section:
// additionally use MPFR to measure max error (requires -lmpfr to cmdline)
#define USE_MPFR
// run numbers for portable bithacking versions: recip_approx_{n}
#define INCLUDE_BITHACK
// run numbers for ARM inital approximation and NR step opcodes.
// WARNING: I wrote these from the spec and they could be incorrect.
// Too lazy to setup and test on real hardware.
#define INCLUDE_ARM
// to avoid using a compiler extension. sigh.
#define max_ulp_dist 5
#define STRINGIFY(S) STRINGIFY_(S)
#define STRINGIFY_(S) #S
#define LENGTHOF(X) (sizeof(X)/sizeof(X[0]))
float recip_ref(float x) { return 1.f/x; }
//********************************************************
// Newton-Ralphson steps
static inline float nr_step_s1(float y, float x)
{
return y*(2.f - x*y);
}
static inline float nr_step_s2(float y, float x)
{
return (y+y) - x*y*y;
}
static inline float nr_step_fp(float y, float x)
{
return -y*fmaf(x,y,-2.f);
}
static inline float nr_step_f2(float y, float x)
{
return fmaf(-fmaf(x,y,-1.f), y,y);
}
//********************************************************
// Halley's method
static inline float hm_step(float y, float x)
{
float t;
t = fmaf(y, -x, 1.0f);
y = fmaf(fmaf(t,t,t), y, y);
return y;
}
//********************************************************
// hardware approximation: SSE
#if defined(__x86_64__) || defined(__x86_64) || defined(_M_X64) || defined(_M_AMD64)
// wrap an intel intrinsic
#include <x86intrin.h>
#define F32_SSE_WRAP(F,X) _mm_cvtss_f32(F(_mm_set_ss(X)))
// return hardware 1/x approx
static inline float sse_rcp_approx(float x)
{
return _mm_cvtss_f32(_mm_rcp_ss(_mm_set_ss(x)));
}
#endif
//********************************************************
// hardware approximation: neon
inline uint32_t f32_to_bits(float x)
{
uint32_t u; memcpy(&u,&x,4); return u;
}
inline float f32_from_bits(uint32_t x)
{
float u; memcpy(&u,&x,4); return u;
}
// emulate neon approximation ops (std & not increased precision ones)
// ignore all special cases (including 0): don't
// need them for local testing. written to be eyeball-able to spec
float vrecpe_f32(float x)
{
// extract the parts
uint32_t ix = f32_to_bits(x);
uint32_t sx = ((ix >> 31) & 0x01);
uint32_t ex = ((ix >> 23) & 0xff);
uint32_t mx = ((ix ) & 0x7fffff);
uint32_t t;
mx = 0x100|(mx >> 15); // add implied bit and top 8 explict
ex = 253-ex; // adjust the exp
// SEE: ARM Spec - FPRecipEstimate
t = (mx<<1)+1;
t = (((1<<19)/t)+1)>>1;
mx = (t & 0xff) << 15;
// concat the parts back together
ix = (sx & 0x01) << 31;
ix |= (ex & 0xff) << 23;
ix |= (mx & 0x7fffff);
return f32_from_bits(ix);
}
// as the hardware op RECPS
static inline float vrecps_f32(float a, float b)
{
return -fmaf(a,b,-2.f);
}
// how you'd perform a 'step' with RECPS
static inline float arm_step(float y, float x)
{
return y*vrecps_f32(x,y);
}
//********************************************************
// bit hack version (well. it's portable!)
static inline float recip_inital_step(uint32_t K, float x)
{
return f32_from_bits(K-f32_to_bits(x));
}
// magic constant isn't optimal. building an optimal constant requires
// knowing the number and type of correction steps.
static inline float bit_rcp_approx(float x) { return recip_inital_step(0x7eec78f0,x); }
// "Generalising the Fast Reciprocal Square Root Algorithm", Mike Day, 2023
// (PDF: https://arxiv.org/abs/2307.15600)
// listing 8:
static inline float day_rcp_approx(float x)
{
static const float c0 = 0.6966215f;
static const float c1 = -0.12130684f;
float y = recip_inital_step(0x7fb504ec, x);
// ordering that minimizes >5 ulp results
return y * fmaf(x, y*c1, c0);
}
//********************************************************
// let the compiler expand the variants
// expand: 'inital' approximation followed by 'n' rounds of 'step'
inline float simple(float x, float (*initial)(float), float (*step)(float, float), uint32_t n)
{
float y = initial(x);
for(uint32_t i=0; i<n; i++)
y = step(y,x);
return y;
}
#if defined(INCLUDE_ARM)
// using hardware step
float arm_nr_op_1(float x) { return simple(x, &vrecpe_f32, &arm_step, 1); }
float arm_nr_op_2(float x) { return simple(x, &vrecpe_f32, &arm_step, 2); }
// using 2 fma NR step
float arm_nr_f2_1(float x) { return simple(x, &vrecpe_f32, &nr_step_f2, 1); }
float arm_nr_f2_2(float x) { return simple(x, &vrecpe_f32, &nr_step_f2, 2); }
float arm_hm_1(float x) { return simple(x, &vrecpe_f32, &hm_step, 1); }
#endif
// expand SSE
float sse_nr_s1_1(float x) { return simple(x, &sse_rcp_approx, &nr_step_s1, 1); }
float sse_nr_s2_1(float x) { return simple(x, &sse_rcp_approx, &nr_step_s2, 1); }
float sse_nr_fp_1(float x) { return simple(x, &sse_rcp_approx, &nr_step_fp, 1); }
float sse_nr_f2_1(float x) { return simple(x, &sse_rcp_approx, &nr_step_f2, 1); }
float sse_nr_s1_2(float x) { return simple(x, &sse_rcp_approx, &nr_step_s1, 2); }
float sse_nr_s2_2(float x) { return simple(x, &sse_rcp_approx, &nr_step_s2, 2); }
float sse_nr_fp_2(float x) { return simple(x, &sse_rcp_approx, &nr_step_fp, 2); }
float sse_nr_f2_2(float x) { return simple(x, &sse_rcp_approx, &nr_step_f2, 2); }
float sse_hm_1(float x) { return simple(x, &sse_rcp_approx, &hm_step, 1); }
#if defined(INCLUDE_BITHACK)
float bit_nr_s1_1(float x) { return simple(x, &bit_rcp_approx, &nr_step_s1, 1); }
float bit_nr_s2_1(float x) { return simple(x, &bit_rcp_approx, &nr_step_s2, 1); }
float bit_nr_fp_1(float x) { return simple(x, &bit_rcp_approx, &nr_step_fp, 1); }
float bit_nr_f2_1(float x) { return simple(x, &bit_rcp_approx, &nr_step_f2, 1); }
float bit_nr_s1_2(float x) { return simple(x, &bit_rcp_approx, &nr_step_s1, 2); }
float bit_nr_s2_2(float x) { return simple(x, &bit_rcp_approx, &nr_step_s2, 2); }
float bit_nr_fp_2(float x) { return simple(x, &bit_rcp_approx, &nr_step_fp, 2); }
float bit_nr_f2_2(float x) { return simple(x, &bit_rcp_approx, &nr_step_f2, 2); }
float bit_nr_s1_3(float x) { return simple(x, &bit_rcp_approx, &nr_step_s1, 3); }
float bit_nr_s2_3(float x) { return simple(x, &bit_rcp_approx, &nr_step_s2, 3); }
float bit_nr_fp_3(float x) { return simple(x, &bit_rcp_approx, &nr_step_fp, 3); }
float bit_nr_f2_3(float x) { return simple(x, &bit_rcp_approx, &nr_step_f2, 3); }
float bit_hm_1(float x) { return simple(x, &bit_rcp_approx, &hm_step, 1); }
float bit_hm_2(float x) { return simple(x, &bit_rcp_approx, &hm_step, 2); }
float bit_hm_3(float x) { return simple(x, &bit_rcp_approx, &hm_step, 3); }
#endif
float day_nr_s1_1(float x) { return simple(x, &day_rcp_approx, &nr_step_s1, 1); }
float day_nr_s2_1(float x) { return simple(x, &day_rcp_approx, &nr_step_s2, 1); }
float day_nr_fp_1(float x) { return simple(x, &day_rcp_approx, &nr_step_fp, 1); }
float day_nr_f2_1(float x) { return simple(x, &day_rcp_approx, &nr_step_f2, 1); }
float day_nr_s1_2(float x) { return simple(x, &day_rcp_approx, &nr_step_s1, 2); }
float day_nr_s2_2(float x) { return simple(x, &day_rcp_approx, &nr_step_s2, 2); }
float day_nr_fp_2(float x) { return simple(x, &day_rcp_approx, &nr_step_fp, 2); }
float day_nr_f2_2(float x) { return simple(x, &day_rcp_approx, &nr_step_f2, 2); }
float day_hm_1(float x) { return simple(x, &day_rcp_approx, &hm_step, 1); }
//********************************************************
typedef struct {
float (*f)(float);
char* name;
} recip_entry_t;
#define ENTRY(X) { .f=&X, .name=STRINGIFY(X) }
recip_entry_t recip_table[] =
{
#if defined(INCLUDE_ARM)
ENTRY(vrecpe_f32),
ENTRY(arm_nr_op_1),
ENTRY(arm_nr_f2_1),
ENTRY(arm_nr_op_2),
ENTRY(arm_nr_f2_2),
ENTRY(arm_hm_1),
#endif
// SSE expansions
ENTRY(sse_rcp_approx),
ENTRY(sse_nr_s1_1),
ENTRY(sse_nr_s2_1),
ENTRY(sse_nr_fp_1),
ENTRY(sse_nr_f2_1),
ENTRY(sse_nr_s1_2),
ENTRY(sse_nr_s2_2),
ENTRY(sse_nr_fp_2),
ENTRY(sse_nr_f2_2),
ENTRY(sse_hm_1),
#if defined(INCLUDE_BITHACK)
ENTRY(bit_rcp_approx),
ENTRY(bit_nr_s1_1),
ENTRY(bit_nr_s2_1),
ENTRY(bit_nr_fp_1),
ENTRY(bit_nr_f2_1),
ENTRY(bit_nr_s1_2),
ENTRY(bit_nr_s2_2),
ENTRY(bit_nr_fp_2),
ENTRY(bit_nr_f2_2),
ENTRY(bit_nr_s1_3),
ENTRY(bit_nr_s2_3),
ENTRY(bit_nr_fp_3),
ENTRY(bit_nr_f2_3),
ENTRY(bit_hm_1),
ENTRY(bit_hm_2),
ENTRY(bit_hm_3),
#endif
ENTRY(day_rcp_approx),
ENTRY(day_nr_s1_1),
ENTRY(day_nr_s2_1),
ENTRY(day_nr_fp_1),
ENTRY(day_nr_f2_1),
ENTRY(day_nr_s1_2),
ENTRY(day_nr_s2_2),
ENTRY(day_nr_fp_2),
ENTRY(day_nr_f2_2),
ENTRY(day_hm_1)
};
#undef ENTRY
//********************************************************
inline uint32_t u32_abs(uint32_t x)
{
return (int32_t)x >= 0 ? x : -x;
}
// signed distance: a & b must have same sign
inline int32_t f32_ulp_sdist_ss(float a, float b)
{
uint32_t ua = f32_to_bits(a);
uint32_t ub = f32_to_bits(b);
return (int32_t)(ua-ub);
}
#if defined(USE_MPFR)
#include <mpfr.h>
#endif
void accuracy_test(recip_entry_t* entry)
{
const uint32_t xs = f32_to_bits(1.f);
const uint32_t xe = f32_to_bits(2.f);
const uint32_t total = xe-xs+1;
#if defined(USE_MPFR)
mpfr_t m1,mx,mr;
double error_u = 0.0; // in rounding units
mpfr_init2(m1, 24);
mpfr_init2(mx, 24);
mpfr_init2(mr, 128);
mpfr_set_flt(m1, 1.f, MPFR_RNDN);
#endif
int32_t error_max = 0;
int32_t error_min = 0;
uint32_t error_cnt = 0;
printf("\n----------------------------------------------------------------------------\n");
printf("starting: %s on [%08x, %08x]\n", entry->name, xs,xe);
float (*f)(float) = entry->f;
uint32_t error[max_ulp_dist+1] = {0};
for(uint32_t xi=xs; xi<=xe; xi++) {
float x = f32_from_bits(xi);
float cr = 1.f/x;
float r0 = f(x);
if (r0 == cr)
error[0]++;
else {
int32_t u = f32_ulp_sdist_ss(cr,r0);
if (u > error_max) error_max = u;
if (u < error_min) error_min = u;
uint32_t u0 = u32_abs((uint32_t)u);
// track it if small enough for the buckets
if (u0 <= max_ulp_dist)
error[u0]++;
else
error_cnt++;
#if defined(USE_MPFR)
// recompute in MP
mpfr_set_flt(mx, x, MPFR_RNDN);
mpfr_div(mr,m1,mx, MPFR_RNDN);
// absolute error
mpfr_set_flt(mx, r0, MPFR_RNDN);
mpfr_sub(mr,mr,mx, MPFR_RNDA);
// WARNING: cheating: only works on (1,2) - the
// product. should be correct by exp of result
double merror = 0x1p24f*fabs(mpfr_get_d(mr, MPFR_RNDA));
if (merror > error_u) error_u = merror;
#endif
}
}
printf("// min/max ulp %d %d\n", error_min, error_max);
#if defined(USE_MPFR)
printf("// abs max ulp %f\n", error_u);
#endif
printf("// correctly: %10f%% (%d)\n", 100.0*(double)(error[0])/(double)(total), error[0]);
if (error[1])
printf("// faithfully: %10f%% (%d)\n", 100.0*(double)(error[1])/(double)(total), error[1]);
for(uint32_t i=2; i<=max_ulp_dist; i++)
if (error[i])
printf("// %3d ulp: %10f%% (%d)\n", i,100.0*(double)(error[i])/(double)(total), error[i]);
if (error_cnt)
printf("// >%d ulp: %10f%% (%d)\n", max_ulp_dist,100.0*(double)(error_cnt)/(double)(total), error_cnt);
}
//********************************************************
int main(void)
{
#if !defined(FP_NAUGHTY_OPTIONS)
for(uint32_t i=0; i<LENGTHOF(recip_table); i++)
accuracy_test(recip_table+i);
#else
printf("LOL! No\n");
#endif
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment