Last active
August 7, 2023 13:20
-
-
Save Marc-B-Reynolds/bab0efc00c55be52f8663a762131ada2 to your computer and use it in GitHub Desktop.
Error bound checks for 1/x approximation: sse, arm and hacky initial value
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
---------------------------------------------------------------------------- | |
starting: vrecpe_f32 on [3f800000, 40000000] | |
// min/max ulp -45502 44978 | |
// abs max ulp 45502.375051 | |
// correctly: 0.002611% (219) | |
// faithfully: 0.005555% (466) | |
// 2 ulp: 0.005436% (456) | |
// 3 ulp: 0.005460% (458) | |
// 4 ulp: 0.005329% (447) | |
// 5 ulp: 0.005531% (464) | |
// >5 ulp: 99.970078% (8386099) | |
---------------------------------------------------------------------------- | |
starting: arm_nr_op_1 on [3f800000, 40000000] | |
// min/max ulp -1 129 | |
// abs max ulp 128.903094 | |
// correctly: 10.676442% (895605) | |
// faithfully: 11.139952% (934487) | |
// 2 ulp: 6.347620% (532477) | |
// 3 ulp: 4.953718% (415548) | |
// 4 ulp: 4.155027% (348549) | |
// 5 ulp: 3.647220% (305951) | |
// >5 ulp: 59.080021% (4955992) | |
---------------------------------------------------------------------------- | |
starting: arm_nr_f2_1 on [3f800000, 40000000] | |
// min/max ulp 0 128 | |
// abs max ulp 128.090785 | |
// correctly: 12.101339% (1015134) | |
// faithfully: 9.859215% (827051) | |
// 2 ulp: 6.272411% (526168) | |
// 3 ulp: 4.919719% (412696) | |
// 4 ulp: 4.147815% (347944) | |
// 5 ulp: 3.636097% (305018) | |
// >5 ulp: 59.063404% (4954598) | |
---------------------------------------------------------------------------- | |
starting: arm_nr_op_2 on [3f800000, 40000000] | |
// min/max ulp -1 1 | |
// abs max ulp 1.487251 | |
// correctly: 67.627648% (5673019) | |
// faithfully: 32.372352% (2715590) | |
---------------------------------------------------------------------------- | |
starting: arm_nr_f2_2 on [3f800000, 40000000] | |
// min/max ulp 0 1 | |
// abs max ulp 0.500814 | |
// correctly: 99.994457% (8388144) | |
// faithfully: 0.005543% (465) | |
---------------------------------------------------------------------------- | |
starting: arm_hm_1 on [3f800000, 40000000] | |
// min/max ulp -1 1 | |
// abs max ulp 0.855433 | |
// correctly: 97.301376% (8162232) | |
// faithfully: 2.698624% (226377) | |
---------------------------------------------------------------------------- | |
starting: sse_rcp_approx on [3f800000, 40000000] | |
// min/max ulp -4996 4989 | |
// abs max ulp 4995.549501 | |
// correctly: 0.028241% (2369) | |
// faithfully: 0.057113% (4791) | |
// 2 ulp: 0.056624% (4750) | |
// 3 ulp: 0.057054% (4786) | |
// 4 ulp: 0.056839% (4768) | |
// 5 ulp: 0.057185% (4797) | |
// >5 ulp: 99.686945% (8362348) | |
---------------------------------------------------------------------------- | |
starting: sse_nr_s1_1 on [3f800000, 40000000] | |
// min/max ulp -2 3 | |
// abs max ulp 3.327252 | |
// correctly: 58.715444% (4925409) | |
// faithfully: 39.751287% (3334580) | |
// 2 ulp: 1.516497% (127213) | |
// 3 ulp: 0.016773% (1407) | |
---------------------------------------------------------------------------- | |
starting: sse_nr_s2_1 on [3f800000, 40000000] | |
// min/max ulp -1 3 | |
// abs max ulp 2.756861 | |
// correctly: 68.920032% (5781432) | |
// faithfully: 30.598839% (2566817) | |
// 2 ulp: 0.480151% (40278) | |
// 3 ulp: 0.000978% (82) | |
---------------------------------------------------------------------------- | |
starting: sse_nr_fp_1 on [3f800000, 40000000] | |
// min/max ulp -1 3 | |
// abs max ulp 2.848300 | |
// correctly: 68.949548% (5783908) | |
// faithfully: 30.556294% (2563248) | |
// 2 ulp: 0.493121% (41366) | |
// 3 ulp: 0.001037% (87) | |
---------------------------------------------------------------------------- | |
starting: sse_nr_f2_1 on [3f800000, 40000000] | |
// min/max ulp 0 2 | |
// abs max ulp 1.960132 | |
// correctly: 84.322967% (7073524) | |
// faithfully: 15.620623% (1310353) | |
// 2 ulp: 0.056410% (4732) | |
---------------------------------------------------------------------------- | |
starting: sse_nr_s1_2 on [3f800000, 40000000] | |
// min/max ulp -2 1 | |
// abs max ulp 1.811487 | |
// correctly: 70.040599% (5875432) | |
// faithfully: 29.943165% (2511815) | |
// 2 ulp: 0.016236% (1362) | |
---------------------------------------------------------------------------- | |
starting: sse_nr_s2_2 on [3f800000, 40000000] | |
// min/max ulp -1 1 | |
// abs max ulp 1.248996 | |
// correctly: 83.297946% (6987539) | |
// faithfully: 16.702054% (1401070) | |
---------------------------------------------------------------------------- | |
starting: sse_nr_fp_2 on [3f800000, 40000000] | |
// min/max ulp -1 1 | |
// abs max ulp 1.249992 | |
// correctly: 78.509786% (6585879) | |
// faithfully: 21.490214% (1802730) | |
---------------------------------------------------------------------------- | |
starting: sse_nr_f2_2 on [3f800000, 40000000] | |
// min/max ulp 0 1 | |
// abs max ulp 0.500000 | |
// correctly: 99.999988% (8388608) | |
// faithfully: 0.000012% (1) | |
---------------------------------------------------------------------------- | |
starting: sse_hm_1 on [3f800000, 40000000] | |
// min/max ulp -1 1 | |
// abs max ulp 0.500378 | |
// correctly: 99.996150% (8388286) | |
// faithfully: 0.003850% (323) | |
---------------------------------------------------------------------------- | |
starting: bit_rcp_approx on [3f800000, 40000000] | |
// min/max ulp -159498 1279760 | |
// abs max ulp 1279760.000000 | |
// correctly: 0.000107% (9) | |
// faithfully: 0.000203% (17) | |
// 2 ulp: 0.000203% (17) | |
// 3 ulp: 0.000215% (18) | |
// 4 ulp: 0.000203% (17) | |
// 5 ulp: 0.000203% (17) | |
// >5 ulp: 99.998868% (8388514) | |
---------------------------------------------------------------------------- | |
starting: bit_nr_s1_1 on [3f800000, 40000000] | |
// min/max ulp -2 97621 | |
// abs max ulp 97621.000002 | |
// correctly: 0.375867% (31530) | |
// faithfully: 0.473118% (39688) | |
// 2 ulp: 0.263309% (22088) | |
// 3 ulp: 0.208855% (17520) | |
// 4 ulp: 0.178814% (15000) | |
// 5 ulp: 0.159001% (13338) | |
// >5 ulp: 98.341036% (8249445) | |
---------------------------------------------------------------------------- | |
starting: bit_nr_s2_1 on [3f800000, 40000000] | |
// min/max ulp -1 97620 | |
// abs max ulp 97620.000002 | |
// correctly: 0.414848% (34800) | |
// faithfully: 0.442111% (37087) | |
// 2 ulp: 0.258505% (21685) | |
// 3 ulp: 0.207484% (17405) | |
// 4 ulp: 0.178170% (14946) | |
// 5 ulp: 0.159025% (13340) | |
// >5 ulp: 98.339856% (8249346) | |
---------------------------------------------------------------------------- | |
starting: bit_nr_fp_1 on [3f800000, 40000000] | |
// min/max ulp -1 97620 | |
// abs max ulp 97620.000009 | |
// correctly: 0.414467% (34768) | |
// faithfully: 0.442910% (37154) | |
// 2 ulp: 0.257874% (21632) | |
// 3 ulp: 0.207269% (17387) | |
// 4 ulp: 0.178623% (14984) | |
// 5 ulp: 0.158823% (13323) | |
// >5 ulp: 98.340035% (8249361) | |
---------------------------------------------------------------------------- | |
starting: bit_nr_f2_1 on [3f800000, 40000000] | |
// min/max ulp 0 97620 | |
// abs max ulp 97620.000000 | |
// correctly: 0.472355% (39624) | |
// faithfully: 0.391078% (32806) | |
// 2 ulp: 0.254703% (21366) | |
// 3 ulp: 0.206232% (17300) | |
// 4 ulp: 0.177562% (14895) | |
// 5 ulp: 0.158906% (13330) | |
// >5 ulp: 98.339164% (8249288) | |
---------------------------------------------------------------------------- | |
starting: bit_nr_s1_2 on [3f800000, 40000000] | |
// min/max ulp -2 569 | |
// abs max ulp 569.072379 | |
// correctly: 26.625213% (2233485) | |
// faithfully: 23.364255% (1959936) | |
// 2 ulp: 2.729594% (228975) | |
// 3 ulp: 1.450908% (121711) | |
// 4 ulp: 1.125371% (94403) | |
// 5 ulp: 0.922000% (77343) | |
// >5 ulp: 43.782658% (3672756) | |
---------------------------------------------------------------------------- | |
starting: bit_nr_s2_2 on [3f800000, 40000000] | |
// min/max ulp -1 568 | |
// abs max ulp 568.072642 | |
// correctly: 36.899193% (3095329) | |
// faithfully: 13.862108% (1162838) | |
// 2 ulp: 2.033031% (170543) | |
// 3 ulp: 1.418936% (119029) | |
// 4 ulp: 1.106083% (92785) | |
// 5 ulp: 0.915563% (76803) | |
// >5 ulp: 43.765087% (3671282) | |
---------------------------------------------------------------------------- | |
starting: bit_nr_fp_2 on [3f800000, 40000000] | |
// min/max ulp -1 569 | |
// abs max ulp 569.015503 | |
// correctly: 30.147156% (2528927) | |
// faithfully: 20.481131% (1718082) | |
// 2 ulp: 2.129233% (178613) | |
// 3 ulp: 1.432490% (120166) | |
// 4 ulp: 1.116001% (93617) | |
// 5 ulp: 0.920248% (77196) | |
// >5 ulp: 43.773741% (3672008) | |
---------------------------------------------------------------------------- | |
starting: bit_nr_f2_2 on [3f800000, 40000000] | |
// min/max ulp 0 568 | |
// abs max ulp 568.022911 | |
// correctly: 41.278727% (3462711) | |
// faithfully: 9.518491% (798469) | |
// 2 ulp: 2.011239% (168715) | |
// 3 ulp: 1.407313% (118054) | |
// 4 ulp: 1.107311% (92888) | |
// 5 ulp: 0.915146% (76768) | |
// >5 ulp: 43.761773% (3671004) | |
---------------------------------------------------------------------------- | |
starting: bit_nr_s1_3 on [3f800000, 40000000] | |
// min/max ulp -2 2 | |
// abs max ulp 2.005560 | |
// correctly: 62.444310% (5238209) | |
// faithfully: 36.752136% (3082993) | |
// 2 ulp: 0.803554% (67407) | |
---------------------------------------------------------------------------- | |
starting: bit_nr_s2_3 on [3f800000, 40000000] | |
// min/max ulp -1 1 | |
// abs max ulp 1.248562 | |
// correctly: 83.570685% (7010418) | |
// faithfully: 16.429315% (1378191) | |
---------------------------------------------------------------------------- | |
starting: bit_nr_fp_3 on [3f800000, 40000000] | |
// min/max ulp -1 1 | |
// abs max ulp 1.499907 | |
// correctly: 72.277358% (6063065) | |
// faithfully: 27.722642% (2325544) | |
---------------------------------------------------------------------------- | |
starting: bit_nr_f2_3 on [3f800000, 40000000] | |
// min/max ulp 0 1 | |
// abs max ulp 0.518618 | |
// correctly: 99.795926% (8371490) | |
// faithfully: 0.204074% (17119) | |
---------------------------------------------------------------------------- | |
starting: bit_hm_1 on [3f800000, 40000000] | |
// min/max ulp -29 7446 | |
// abs max ulp 7446.000748 | |
// correctly: 8.088600% (678521) | |
// faithfully: 4.319632% (362357) | |
// 2 ulp: 2.463150% (206624) | |
// 3 ulp: 1.901841% (159538) | |
// 4 ulp: 1.596606% (133933) | |
// 5 ulp: 1.409435% (118232) | |
// >5 ulp: 80.220737% (6729404) | |
---------------------------------------------------------------------------- | |
starting: bit_hm_2 on [3f800000, 40000000] | |
// min/max ulp -1 1 | |
// abs max ulp 0.501463 | |
// correctly: 99.984026% (8387269) | |
// faithfully: 0.015974% (1340) | |
---------------------------------------------------------------------------- | |
starting: bit_hm_3 on [3f800000, 40000000] | |
// min/max ulp 0 1 | |
// abs max ulp 0.500000 | |
// correctly: 99.999988% (8388608) | |
// faithfully: 0.000012% (1) | |
---------------------------------------------------------------------------- | |
starting: day_rcp_approx on [3f800000, 40000000] | |
// min/max ulp -1768 1874 | |
// abs max ulp 1874.000126 | |
// correctly: 0.025952% (2177) | |
// faithfully: 0.052655% (4417) | |
// 2 ulp: 0.052273% (4385) | |
// 3 ulp: 0.052178% (4377) | |
// 4 ulp: 0.052631% (4415) | |
// 5 ulp: 0.052297% (4387) | |
// >5 ulp: 99.712014% (8364451) | |
---------------------------------------------------------------------------- | |
starting: day_nr_s1_1 on [3f800000, 40000000] | |
// min/max ulp -2 2 | |
// abs max ulp 2.133561 | |
// correctly: 61.393754% (5150082) | |
// faithfully: 38.129146% (3198505) | |
// 2 ulp: 0.477099% (40022) | |
---------------------------------------------------------------------------- | |
starting: day_nr_s2_1 on [3f800000, 40000000] | |
// min/max ulp -1 2 | |
// abs max ulp 1.637964 | |
// correctly: 72.490064% (6080908) | |
// faithfully: 27.491185% (2306128) | |
// 2 ulp: 0.018752% (1573) | |
---------------------------------------------------------------------------- | |
starting: day_nr_fp_1 on [3f800000, 40000000] | |
// min/max ulp -1 2 | |
// abs max ulp 1.657846 | |
// correctly: 74.109808% (6216782) | |
// faithfully: 25.886568% (2171523) | |
// 2 ulp: 0.003624% (304) | |
---------------------------------------------------------------------------- | |
starting: day_nr_f2_1 on [3f800000, 40000000] | |
// min/max ulp 0 1 | |
// abs max ulp 0.701917 | |
// correctly: 92.858673% (7789551) | |
// faithfully: 7.141327% (599058) | |
---------------------------------------------------------------------------- | |
starting: day_nr_s1_2 on [3f800000, 40000000] | |
// min/max ulp -1 1 | |
// abs max ulp 1.499220 | |
// correctly: 71.204535% (5973070) | |
// faithfully: 28.795465% (2415539) | |
---------------------------------------------------------------------------- | |
starting: day_nr_s2_2 on [3f800000, 40000000] | |
// min/max ulp -1 1 | |
// abs max ulp 1.249477 | |
// correctly: 82.748487% (6941447) | |
// faithfully: 17.251513% (1447162) | |
---------------------------------------------------------------------------- | |
starting: day_nr_fp_2 on [3f800000, 40000000] | |
// min/max ulp -1 1 | |
// abs max ulp 1.249469 | |
// correctly: 81.148519% (6807232) | |
// faithfully: 18.851481% (1581377) | |
---------------------------------------------------------------------------- | |
starting: day_nr_f2_2 on [3f800000, 40000000] | |
// min/max ulp 0 1 | |
// abs max ulp 0.500000 | |
// correctly: 99.999988% (8388608) | |
// faithfully: 0.000012% (1) | |
---------------------------------------------------------------------------- | |
starting: day_hm_1 on [3f800000, 40000000] | |
// min/max ulp -1 1 | |
// abs max ulp 0.500096 | |
// correctly: 99.997318% (8388384) | |
// faithfully: 0.002682% (225) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Public Domain under http://unlicense.org, see link for details. | |
// compile with (or equiv): | |
// clang -O3 -march=native -Wall -Wextra -Wconversion -Wpedantic -Wno-unused-function -fno-math-errno -ffp-contract=off recip.c -o recip -lm | |
#include <stdint.h> | |
#include <stdio.h> | |
#include <math.h> | |
#include <string.h> | |
#if defined(__FAST_MATH__) || defined(_M_FP_FAST) | |
#define FP_NAUGHTY_OPTIONS | |
#endif | |
// configuration section: | |
// additionally use MPFR to measure max error (requires -lmpfr to cmdline) | |
#define USE_MPFR | |
// run numbers for portable bithacking versions: recip_approx_{n} | |
#define INCLUDE_BITHACK | |
// run numbers for ARM inital approximation and NR step opcodes. | |
// WARNING: I wrote these from the spec and they could be incorrect. | |
// Too lazy to setup and test on real hardware. | |
#define INCLUDE_ARM | |
// to avoid using a compiler extension. sigh. | |
#define max_ulp_dist 5 | |
#define STRINGIFY(S) STRINGIFY_(S) | |
#define STRINGIFY_(S) #S | |
#define LENGTHOF(X) (sizeof(X)/sizeof(X[0])) | |
float recip_ref(float x) { return 1.f/x; } | |
//******************************************************** | |
// Newton-Ralphson steps | |
static inline float nr_step_s1(float y, float x) | |
{ | |
return y*(2.f - x*y); | |
} | |
static inline float nr_step_s2(float y, float x) | |
{ | |
return (y+y) - x*y*y; | |
} | |
static inline float nr_step_fp(float y, float x) | |
{ | |
return -y*fmaf(x,y,-2.f); | |
} | |
static inline float nr_step_f2(float y, float x) | |
{ | |
return fmaf(-fmaf(x,y,-1.f), y,y); | |
} | |
//******************************************************** | |
// Halley's method | |
static inline float hm_step(float y, float x) | |
{ | |
float t; | |
t = fmaf(y, -x, 1.0f); | |
y = fmaf(fmaf(t,t,t), y, y); | |
return y; | |
} | |
//******************************************************** | |
// hardware approximation: SSE | |
#if defined(__x86_64__) || defined(__x86_64) || defined(_M_X64) || defined(_M_AMD64) | |
// wrap an intel intrinsic | |
#include <x86intrin.h> | |
#define F32_SSE_WRAP(F,X) _mm_cvtss_f32(F(_mm_set_ss(X))) | |
// return hardware 1/x approx | |
static inline float sse_rcp_approx(float x) | |
{ | |
return _mm_cvtss_f32(_mm_rcp_ss(_mm_set_ss(x))); | |
} | |
#endif | |
//******************************************************** | |
// hardware approximation: neon | |
inline uint32_t f32_to_bits(float x) | |
{ | |
uint32_t u; memcpy(&u,&x,4); return u; | |
} | |
inline float f32_from_bits(uint32_t x) | |
{ | |
float u; memcpy(&u,&x,4); return u; | |
} | |
// emulate neon approximation ops (std & not increased precision ones) | |
// ignore all special cases (including 0): don't | |
// need them for local testing. written to be eyeball-able to spec | |
float vrecpe_f32(float x) | |
{ | |
// extract the parts | |
uint32_t ix = f32_to_bits(x); | |
uint32_t sx = ((ix >> 31) & 0x01); | |
uint32_t ex = ((ix >> 23) & 0xff); | |
uint32_t mx = ((ix ) & 0x7fffff); | |
uint32_t t; | |
mx = 0x100|(mx >> 15); // add implied bit and top 8 explict | |
ex = 253-ex; // adjust the exp | |
// SEE: ARM Spec - FPRecipEstimate | |
t = (mx<<1)+1; | |
t = (((1<<19)/t)+1)>>1; | |
mx = (t & 0xff) << 15; | |
// concat the parts back together | |
ix = (sx & 0x01) << 31; | |
ix |= (ex & 0xff) << 23; | |
ix |= (mx & 0x7fffff); | |
return f32_from_bits(ix); | |
} | |
// as the hardware op RECPS | |
static inline float vrecps_f32(float a, float b) | |
{ | |
return -fmaf(a,b,-2.f); | |
} | |
// how you'd perform a 'step' with RECPS | |
static inline float arm_step(float y, float x) | |
{ | |
return y*vrecps_f32(x,y); | |
} | |
//******************************************************** | |
// bit hack version (well. it's portable!) | |
static inline float recip_inital_step(uint32_t K, float x) | |
{ | |
return f32_from_bits(K-f32_to_bits(x)); | |
} | |
// magic constant isn't optimal. building an optimal constant requires | |
// knowing the number and type of correction steps. | |
static inline float bit_rcp_approx(float x) { return recip_inital_step(0x7eec78f0,x); } | |
// "Generalising the Fast Reciprocal Square Root Algorithm", Mike Day, 2023 | |
// (PDF: https://arxiv.org/abs/2307.15600) | |
// listing 8: | |
static inline float day_rcp_approx(float x) | |
{ | |
static const float c0 = 0.6966215f; | |
static const float c1 = -0.12130684f; | |
float y = recip_inital_step(0x7fb504ec, x); | |
// ordering that minimizes >5 ulp results | |
return y * fmaf(x, y*c1, c0); | |
} | |
//******************************************************** | |
// let the compiler expand the variants | |
// expand: 'inital' approximation followed by 'n' rounds of 'step' | |
inline float simple(float x, float (*initial)(float), float (*step)(float, float), uint32_t n) | |
{ | |
float y = initial(x); | |
for(uint32_t i=0; i<n; i++) | |
y = step(y,x); | |
return y; | |
} | |
#if defined(INCLUDE_ARM) | |
// using hardware step | |
float arm_nr_op_1(float x) { return simple(x, &vrecpe_f32, &arm_step, 1); } | |
float arm_nr_op_2(float x) { return simple(x, &vrecpe_f32, &arm_step, 2); } | |
// using 2 fma NR step | |
float arm_nr_f2_1(float x) { return simple(x, &vrecpe_f32, &nr_step_f2, 1); } | |
float arm_nr_f2_2(float x) { return simple(x, &vrecpe_f32, &nr_step_f2, 2); } | |
float arm_hm_1(float x) { return simple(x, &vrecpe_f32, &hm_step, 1); } | |
#endif | |
// expand SSE | |
float sse_nr_s1_1(float x) { return simple(x, &sse_rcp_approx, &nr_step_s1, 1); } | |
float sse_nr_s2_1(float x) { return simple(x, &sse_rcp_approx, &nr_step_s2, 1); } | |
float sse_nr_fp_1(float x) { return simple(x, &sse_rcp_approx, &nr_step_fp, 1); } | |
float sse_nr_f2_1(float x) { return simple(x, &sse_rcp_approx, &nr_step_f2, 1); } | |
float sse_nr_s1_2(float x) { return simple(x, &sse_rcp_approx, &nr_step_s1, 2); } | |
float sse_nr_s2_2(float x) { return simple(x, &sse_rcp_approx, &nr_step_s2, 2); } | |
float sse_nr_fp_2(float x) { return simple(x, &sse_rcp_approx, &nr_step_fp, 2); } | |
float sse_nr_f2_2(float x) { return simple(x, &sse_rcp_approx, &nr_step_f2, 2); } | |
float sse_hm_1(float x) { return simple(x, &sse_rcp_approx, &hm_step, 1); } | |
#if defined(INCLUDE_BITHACK) | |
float bit_nr_s1_1(float x) { return simple(x, &bit_rcp_approx, &nr_step_s1, 1); } | |
float bit_nr_s2_1(float x) { return simple(x, &bit_rcp_approx, &nr_step_s2, 1); } | |
float bit_nr_fp_1(float x) { return simple(x, &bit_rcp_approx, &nr_step_fp, 1); } | |
float bit_nr_f2_1(float x) { return simple(x, &bit_rcp_approx, &nr_step_f2, 1); } | |
float bit_nr_s1_2(float x) { return simple(x, &bit_rcp_approx, &nr_step_s1, 2); } | |
float bit_nr_s2_2(float x) { return simple(x, &bit_rcp_approx, &nr_step_s2, 2); } | |
float bit_nr_fp_2(float x) { return simple(x, &bit_rcp_approx, &nr_step_fp, 2); } | |
float bit_nr_f2_2(float x) { return simple(x, &bit_rcp_approx, &nr_step_f2, 2); } | |
float bit_nr_s1_3(float x) { return simple(x, &bit_rcp_approx, &nr_step_s1, 3); } | |
float bit_nr_s2_3(float x) { return simple(x, &bit_rcp_approx, &nr_step_s2, 3); } | |
float bit_nr_fp_3(float x) { return simple(x, &bit_rcp_approx, &nr_step_fp, 3); } | |
float bit_nr_f2_3(float x) { return simple(x, &bit_rcp_approx, &nr_step_f2, 3); } | |
float bit_hm_1(float x) { return simple(x, &bit_rcp_approx, &hm_step, 1); } | |
float bit_hm_2(float x) { return simple(x, &bit_rcp_approx, &hm_step, 2); } | |
float bit_hm_3(float x) { return simple(x, &bit_rcp_approx, &hm_step, 3); } | |
#endif | |
float day_nr_s1_1(float x) { return simple(x, &day_rcp_approx, &nr_step_s1, 1); } | |
float day_nr_s2_1(float x) { return simple(x, &day_rcp_approx, &nr_step_s2, 1); } | |
float day_nr_fp_1(float x) { return simple(x, &day_rcp_approx, &nr_step_fp, 1); } | |
float day_nr_f2_1(float x) { return simple(x, &day_rcp_approx, &nr_step_f2, 1); } | |
float day_nr_s1_2(float x) { return simple(x, &day_rcp_approx, &nr_step_s1, 2); } | |
float day_nr_s2_2(float x) { return simple(x, &day_rcp_approx, &nr_step_s2, 2); } | |
float day_nr_fp_2(float x) { return simple(x, &day_rcp_approx, &nr_step_fp, 2); } | |
float day_nr_f2_2(float x) { return simple(x, &day_rcp_approx, &nr_step_f2, 2); } | |
float day_hm_1(float x) { return simple(x, &day_rcp_approx, &hm_step, 1); } | |
//******************************************************** | |
typedef struct { | |
float (*f)(float); | |
char* name; | |
} recip_entry_t; | |
#define ENTRY(X) { .f=&X, .name=STRINGIFY(X) } | |
recip_entry_t recip_table[] = | |
{ | |
#if defined(INCLUDE_ARM) | |
ENTRY(vrecpe_f32), | |
ENTRY(arm_nr_op_1), | |
ENTRY(arm_nr_f2_1), | |
ENTRY(arm_nr_op_2), | |
ENTRY(arm_nr_f2_2), | |
ENTRY(arm_hm_1), | |
#endif | |
// SSE expansions | |
ENTRY(sse_rcp_approx), | |
ENTRY(sse_nr_s1_1), | |
ENTRY(sse_nr_s2_1), | |
ENTRY(sse_nr_fp_1), | |
ENTRY(sse_nr_f2_1), | |
ENTRY(sse_nr_s1_2), | |
ENTRY(sse_nr_s2_2), | |
ENTRY(sse_nr_fp_2), | |
ENTRY(sse_nr_f2_2), | |
ENTRY(sse_hm_1), | |
#if defined(INCLUDE_BITHACK) | |
ENTRY(bit_rcp_approx), | |
ENTRY(bit_nr_s1_1), | |
ENTRY(bit_nr_s2_1), | |
ENTRY(bit_nr_fp_1), | |
ENTRY(bit_nr_f2_1), | |
ENTRY(bit_nr_s1_2), | |
ENTRY(bit_nr_s2_2), | |
ENTRY(bit_nr_fp_2), | |
ENTRY(bit_nr_f2_2), | |
ENTRY(bit_nr_s1_3), | |
ENTRY(bit_nr_s2_3), | |
ENTRY(bit_nr_fp_3), | |
ENTRY(bit_nr_f2_3), | |
ENTRY(bit_hm_1), | |
ENTRY(bit_hm_2), | |
ENTRY(bit_hm_3), | |
#endif | |
ENTRY(day_rcp_approx), | |
ENTRY(day_nr_s1_1), | |
ENTRY(day_nr_s2_1), | |
ENTRY(day_nr_fp_1), | |
ENTRY(day_nr_f2_1), | |
ENTRY(day_nr_s1_2), | |
ENTRY(day_nr_s2_2), | |
ENTRY(day_nr_fp_2), | |
ENTRY(day_nr_f2_2), | |
ENTRY(day_hm_1) | |
}; | |
#undef ENTRY | |
//******************************************************** | |
inline uint32_t u32_abs(uint32_t x) | |
{ | |
return (int32_t)x >= 0 ? x : -x; | |
} | |
// signed distance: a & b must have same sign | |
inline int32_t f32_ulp_sdist_ss(float a, float b) | |
{ | |
uint32_t ua = f32_to_bits(a); | |
uint32_t ub = f32_to_bits(b); | |
return (int32_t)(ua-ub); | |
} | |
#if defined(USE_MPFR) | |
#include <mpfr.h> | |
#endif | |
void accuracy_test(recip_entry_t* entry) | |
{ | |
const uint32_t xs = f32_to_bits(1.f); | |
const uint32_t xe = f32_to_bits(2.f); | |
const uint32_t total = xe-xs+1; | |
#if defined(USE_MPFR) | |
mpfr_t m1,mx,mr; | |
double error_u = 0.0; // in rounding units | |
mpfr_init2(m1, 24); | |
mpfr_init2(mx, 24); | |
mpfr_init2(mr, 128); | |
mpfr_set_flt(m1, 1.f, MPFR_RNDN); | |
#endif | |
int32_t error_max = 0; | |
int32_t error_min = 0; | |
uint32_t error_cnt = 0; | |
printf("\n----------------------------------------------------------------------------\n"); | |
printf("starting: %s on [%08x, %08x]\n", entry->name, xs,xe); | |
float (*f)(float) = entry->f; | |
uint32_t error[max_ulp_dist+1] = {0}; | |
for(uint32_t xi=xs; xi<=xe; xi++) { | |
float x = f32_from_bits(xi); | |
float cr = 1.f/x; | |
float r0 = f(x); | |
if (r0 == cr) | |
error[0]++; | |
else { | |
int32_t u = f32_ulp_sdist_ss(cr,r0); | |
if (u > error_max) error_max = u; | |
if (u < error_min) error_min = u; | |
uint32_t u0 = u32_abs((uint32_t)u); | |
// track it if small enough for the buckets | |
if (u0 <= max_ulp_dist) | |
error[u0]++; | |
else | |
error_cnt++; | |
#if defined(USE_MPFR) | |
// recompute in MP | |
mpfr_set_flt(mx, x, MPFR_RNDN); | |
mpfr_div(mr,m1,mx, MPFR_RNDN); | |
// absolute error | |
mpfr_set_flt(mx, r0, MPFR_RNDN); | |
mpfr_sub(mr,mr,mx, MPFR_RNDA); | |
// WARNING: cheating: only works on (1,2) - the | |
// product. should be correct by exp of result | |
double merror = 0x1p24f*fabs(mpfr_get_d(mr, MPFR_RNDA)); | |
if (merror > error_u) error_u = merror; | |
#endif | |
} | |
} | |
printf("// min/max ulp %d %d\n", error_min, error_max); | |
#if defined(USE_MPFR) | |
printf("// abs max ulp %f\n", error_u); | |
#endif | |
printf("// correctly: %10f%% (%d)\n", 100.0*(double)(error[0])/(double)(total), error[0]); | |
if (error[1]) | |
printf("// faithfully: %10f%% (%d)\n", 100.0*(double)(error[1])/(double)(total), error[1]); | |
for(uint32_t i=2; i<=max_ulp_dist; i++) | |
if (error[i]) | |
printf("// %3d ulp: %10f%% (%d)\n", i,100.0*(double)(error[i])/(double)(total), error[i]); | |
if (error_cnt) | |
printf("// >%d ulp: %10f%% (%d)\n", max_ulp_dist,100.0*(double)(error_cnt)/(double)(total), error_cnt); | |
} | |
//******************************************************** | |
int main(void) | |
{ | |
#if !defined(FP_NAUGHTY_OPTIONS) | |
for(uint32_t i=0; i<LENGTHOF(recip_table); i++) | |
accuracy_test(recip_table+i); | |
#else | |
printf("LOL! No\n"); | |
#endif | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment