Marc-B-Reynolds/cbrt.c

## cbrt.c
// Public Domain under http://unlicense.org, see link for details.
// except:
// * core-math function `cr_cbrtf`               (see license below)
// * musl flavored fdlib function `fdlibm_cbrtf` (see license below)

// code and test driver for cube root and it's reciprocal based on:
// "Fast Calculation of Cube and Inverse Cube Roots Using
// a Magic Constant and Its Implementation on Microcontrollers"
// Moroz, Samotyy, Walczyk, Cieslinski, 2021
// (PDF: https://www.mdpi.com/1996-1073/14/4/1058)
//
// Added some modification with higher powered devices in mind
// (standard PCs, consoles, phones, etc).

// NOW INCLUDES: for testing normal cube root instead of MPFR. MPFR will be
// used instead if USE_MPFR is defined. Can also "cheat" & test 1/cbrt
// using libm & doubles. Needs to be validate on a given libm & range
// So if USE_MPFR and USE_DOUBLES_LIKE_A_CHEATER are both defined then
// MPFR isn't needed.

/* Correctly-rounded cubic root of binary32 value.

Copyright (c) 2022 Alexei Sibidanov.

This file is part of the CORE-MATH project
(https://core-math.gitlabpages.inria.fr/).

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/

/* origin: FreeBSD /usr/src/lib/msun/src/s_cbrtf.c */
/*
 * Conversion to float by Ian Lance Taylor, Cygnus Support, ian@cygnus.com.
 * Debugged and optimized by Bruce D. Evans.
 */
/*
 * ====================================================
 * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
 *
 * Developed at SunPro, a Sun Microsystems, Inc. business.
 * Permission to use, copy, modify, and distribute this
 * software is freely granted, provided that this notice
 * is preserved.
 * ====================================================
 */

// if define uses MPFR for test cbrt instead of core-math's `cr_cbrtf`
//#define USE_MPFR

// if defined uses `lib_rcbrt_dp` instead of MPFR to test reciprocial
// cube root. On my system this happens to return correctly rounded results
// for the primary test range.  (beware! should validate the range with a given
// libm before using.)
#define USE_DOUBLES_LIKE_A_CHEATER

// compile with (or equiv). (requires MPFR installed..unless as noted above)
// clang -O3 -march=native -Wall -Wextra -Wconversion -Wpedantic -Wno-unused-function -fno-math-errno -ffp-contract=off cbrt.c -o cbrt -lm -lmpfr

// compile time macros:
//   NO_RCBRT  : bypass testing 1/cbrt
//   NO_CBRT   : bypass testing cbrt
//   WIP_CBRT  : test only the cbrt function defined
//   WIP_RCBRT : test only the 1/cbrt function defined


#include <stdint.h>
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <string.h>
#include <assert.h>

#if !defined(USE_MPFR) && !defined(USE_DOUBLES_LIKE_A_CHEATER)
#include <mpfr.h>
#endif

//********************************************************
// CONFIG STUFF HERE

// number of ULP errors to track individually
// (1 is used for post, 5 for source comments)
// using define to avoid using a compiler extension. sigh.
#define max_ulp_dist 5


// choose smallest value of the test interval
const float test_start_value = 0.25f;            // cover [1/4,2] (both sides of 1)

//const float test_start_value = 0.5f;             // boring, but useful
//const float test_start_value = 0x1.0p-126f;   // smallest mag. normal
//const float test_start_value = 0x1.0p+122f;   // big but no overflow
// max finite exponent is 127, we need to subtract 3
// to cover the test range (124) and compute values
// to the 4th power (another -2 = 122)


// full range variants and perform sign validation: cbrt(-x) = -cbrt(x)
#define TEST_FULL_RANGE

static inline uint32_t f32_to_bits(float x)
{
  uint32_t u; memcpy(&u,&x,4); return u;
}

static inline float f32_from_bits(uint32_t x)
{
  float u; memcpy(&u,&x,4); return u;
}

// if 'v' is float and 's' is all clear (except sign bit)
static inline float f32_mulsign(float v, uint32_t s)
{
  return f32_from_bits(f32_to_bits(v)^s);
}


//********************************************************
// variants based on:
// "Fast Calculation of Cube and Inverse Cube Roots Using
// a Magic Constant and Its Implementation on Microcontrollers"
// Moroz, Samotyy, Walczyk, Cieslinski, 2021
// SEE: algorithm 5 (A5) and algorithm 6


// common computation for x^(1/3) and x^(-1/3)
//   x  is positive
//   xi is IEEE bit pattern of x
//
// (reworked core portion of A5/A6 by Moroz,
//  Samotyy, Walczyk, Cieslinski)
//
// (1) unsigned div by constant faster than signed
// (2) breaks dependences up (y*x) & (y*y) can be
//     'in-flight' at the same time. The other
//     two choices the product chain is serial.
static inline float f32_cbrt_ki(float x, uint32_t xi)
{
  // authors optimized Halley's method first step
  // given their "magic" constant (in 1)
  const float c0 =  0x1.c09806p0f;
  const float c1 = -0x1.403e6cp0f;
  const float c2 =  0x1.04cdb2p-1f;

  // initial guess
  xi = 0x548c2b4b - xi/3;               // (1)

  // modified Halley's method
  float y = f32_from_bits(xi);
  float c = (x*y)*(y*y);                // (2)

  y = y*fmaf(c, fmaf(c2,c,c1), c0);

  return y;
}

// Newton step:
//   y = 1/cbrt(x)
//   f(y)    = 1/y^3 - x
//   f'(y)   = -3/y^4
//   y_{n+1} = y - f(y)/f'(y)
//           = y + 1/3 (1/y^3 - x) y^4
//           = y + 1/3 (y - xy^4)
//           = y(1 + 1/3 (1 - xy^3))
//           = y(1 + 1/3 c)  : c = 1 - xy^3
//           = y + 1/3 yc

// SEE: https://marc-b-reynolds.github.io/math/2019/03/12/FpDiv.html
// RN(1/3) = RU(1/3)
static const float f32_third    =  0x1.555556p-2f;
static const float f32_third_l  = -0x1.555556p-27f;

// 1/cbrt(x) newton step with correctly rounded div by 3
// y = y_n (estimate in)

static inline float f32_rcbrt_newton(float x, float y)
{
  float c = fmaf(x,y*y*y, -1.0f);                   // xy^3-1
  float t = fmaf(c, -f32_third, c*(-f32_third_l));  // -RN(c/3)

  return fmaf(y,t,y);
}

static inline float f32_rcbrt_newton_m(float x, float y)
{
  float c = fmaf(x*y,y*y, -1.0f);
  float t = fmaf(c, -f32_third, c*(-f32_third_l));

  return fmaf(y,t,y);
}

// 1/cbrt(x) newton step with div by 3 as mul by recip
// y = y_n (estimate in)
static inline float f32_rcbrt_newton_fast(float x, float y)
{
  float c = fmaf(x,y*y*y, -1.0f);

  y = y*fmaf(-f32_third,c,1.f);

  return y;
}

// dep-chain breakup: slightly hampers results
static inline float f32_rcbrt_newton_fast_m(float x, float y)
{
  float c = fmaf(x*y,y*y, -1.0f);

  y = y*fmaf(-f32_third,c,1.f);

  return y;
}

const double f64_third   = 0x1.5555555555555p-2;
const double f64_third_l = 0x1.5555555555555p-56;

// 1/cbrt(x) newton step with correctly rounded div by 3
// y = y_n (estimate in)
static inline double f64_rcbrt_newton(double x, double y)
{
  double c = fma(x,y*y*y, -1.0);
  double t = fma(c, -f64_third, c*(-f64_third_l));

  y = fma(y,t,y);

  return y;
}

static inline double f64_rcbrt_newton_m(double x, double y)
{
  double c = fma(x*y,y*y, -1.0);
  double t = fma(c, -f64_third, c*(-f64_third_l));

  y = fma(y,t,y);

  return y;
}

// 1/cbrt(x) newton step with div by 3 as mul by recip
// y = y_n (estimate in)
static inline double f64_rcbrt_newton_fast(double x, double y)
{
  double c = fma(x,y*y*y, -1.0);

  y = y*fma(-f64_third,c,1.0);

  return y;
}

static inline double f64_rcbrt_newton_fast_m(double x, double y)
{
  double c = fma(x*y,y*y, -1.0);

  y = y*fma(-f64_third,c,1.0);

  return y;
}

// Newton step:
//   y       = cbrt(x)
//   f(y)    = y^3 - x
//   f'(y)   = 3y^2
//   y_{n+1} = y - f(y)/f'(y)
//           = y - (y^3 - x)/3y^2


// cbrt(x) newton step. y = ~1/cbrt(x)
static inline float f32_cbrt_newton_fast_(float x, float y)
{
  float d = x*(y*y);
  float c = fmaf(d,y,-1.f);
  float t = fmaf(c, -f32_third, 1.f);
  y = d*(t*t);

  return y;
}

static inline float f32_cbrt_newton_fast(float x, float y)
{
  float a = x*y;
  float d = a*y;
  float c = fmaf(d,y,-1.f);
  float t = fmaf(c, -f32_third, 1.f);
  y = d*(t*t);

  return y;
}

static inline double f64_cbrt_newton_fast(double x, double y)
{
  double a = x*y;
  double d = a*y;
  double c = fma(d,y,-1.0);
  double t = fma(c, -f64_third, 1.0);
  y = d*(t*t);

  return y;
}

// temp hack (not reworked yet)
static inline float f32_cbrt_newton(float x, float y)
{
  float d = x*(y*y);
  float c = fmaf(d,y,-1.f);
  float s = fmaf(c, -f32_third, c*(-f32_third_l));  // -RN(c/3)
  float t = s + 1.f;
  y = d*(t*t);

  return y;
}

// ( currently unused )
// 1/cbrt(x) Halley step:
//   y_{n+1} = y (1 + c(1/3 + 2/9 c)) , c = 1-xy^3
//           = y (1 + (1/3c+ 2/9 c^2)),
//           = y (1 + (t + 2t^2)),   t = c/3
//           = y (1 + a),   t = c/3

static inline float f32_rcbrt_halley(float x, float y)
{
  float c = fmaf(x,y*y*y, -1.0f);                   // xy^3-1
  float t = fmaf(c, -f32_third, c*(-f32_third_l));  // -RN(c/3)
  float a = fmaf(2.f*t,t,t);
  return fmaf(y,a,y);
}

static inline float f32_rcbrt_halley_m(float x, float y)
{
  float c = fmaf(x*y,y*y, -1.0f);                   // xy^3-1
  float t = fmaf(c, -f32_third, c*(-f32_third_l));  // -RN(c/3)
  float a = fmaf(2.f*t,t,t);
  return fmaf(y,a,y);
}

//********************************************************
//

// expand a cbrt/rcbrt core method for positive only input
static inline float f32_cbrt_px(float x, float (*f)(float, uint32_t))
{
  return f(x, f32_to_bits(x));
}

// expand a cbrt/rcbrt core method for full range: cbrt(-x) = -cbrt(x)
static inline float f32_cbrt_fx(float x, float (*f)(float, uint32_t))
{
  uint32_t ix = f32_to_bits(x);     // bit pattern of x
  uint32_t ax = ix & 0x7fffffff;    // |x|     {bits}
  uint32_t sx = ix & 0x80000000;    // sign(x) {bit }
  float    a  = f32_from_bits(ax);  // |x|
  float    r  = f(a,ax);            // core approximation

  return f32_mulsign(r, sx);        // restore sign
}


//********************************************************
// cycle numbers spitball for full range expanded

// Author's corrected version (algorithm 6.1).
// (modified)
// ulp   3
// correctly:   52.266572% (13153314)
// faithfully:  45.591750% (11473540)
//   2 ulp:      2.141619% (538956)
//   3 ulp:      0.000060% (15)
static inline float f32_cbrt_k_1(float x, uint32_t ix)
{
  float y = f32_cbrt_ki(x,ix);

  return f32_cbrt_newton_fast(x,y);
}


// sqrt(x*x^(-1/3)) = sqrt(x^(2/3)) = x^(1/3)
// ulp   2
// correctly:   77.553369% (19516945)
// faithfully:  22.445829% (5648678)
//   2 ulp:      0.000803% (202)
static inline float f32_cbrt_k_2(float x, uint32_t ix)
{
  float y = f32_cbrt_ki(x,ix);

  return sqrtf(x * f32_rcbrt_newton_fast_m(x,y));
}

// ulp   1
// correctly:   80.880794% (20354319)
// faithfully:  19.119206% (4811506)
static inline float f32_cbrt_k_3(float x, uint32_t ix)
{
  float y = f32_cbrt_ki(x,ix);

  return sqrtf(x * f32_rcbrt_newton_m(x,y));
}

// correctly:   98.239267% (24722722)
// faithfully:   1.760733% (443103)
static inline float f32_cbrt_k_4(float x, uint32_t ix)
{
  double y = (double)f32_cbrt_ki(x,ix);
  double d = (double)x;

  y = f64_cbrt_newton_fast(d,y);

  return (float)y;
}

// correctly rounded : but questionable perf vs. fdlibm
static inline float f32_cbrt_k_5(float x, uint32_t ix)
{
  double y = (double)f32_cbrt_ki(x,ix);
  double d = (double)x;

  y = f64_rcbrt_newton_fast(d,y);
  y = f64_cbrt_newton_fast(d,y);

  return (float)y;
}


//********************************************************
// expand full range cbrt variants

float f32_cbrt_1(float x) { return f32_cbrt_fx(x, &f32_cbrt_k_1); }
float f32_cbrt_2(float x) { return f32_cbrt_fx(x, &f32_cbrt_k_2); }
float f32_cbrt_3(float x) { return f32_cbrt_fx(x, &f32_cbrt_k_3); }
float f32_cbrt_4(float x) { return f32_cbrt_fx(x, &f32_cbrt_k_4); }
float f32_cbrt_5(float x) { return f32_cbrt_fx(x, &f32_cbrt_k_5); }
//float f32_cbrt_6(float x) { return f32_cbrt_fx(x, &f32_cbrt_k_6); }
//float f32_cbrt_7(float x) { return f32_cbrt_fx(x, &f32_cbrt_k_7); }

// expand postive input cbrt variants
float f32_cbrt_1p(float x) { return f32_cbrt_px(x, &f32_cbrt_k_1); }
float f32_cbrt_2p(float x) { return f32_cbrt_px(x, &f32_cbrt_k_2); }
float f32_cbrt_3p(float x) { return f32_cbrt_px(x, &f32_cbrt_k_3); }
float f32_cbrt_4p(float x) { return f32_cbrt_px(x, &f32_cbrt_k_4); }
float f32_cbrt_5p(float x) { return f32_cbrt_px(x, &f32_cbrt_k_5); }
//float f32_cbrt_6p(float x) { return f32_cbrt_px(x, &f32_cbrt_k_6); }
//float f32_cbrt_7p(float x) { return f32_cbrt_px(x, &f32_cbrt_k_7); }


//********************************************************
// x^(-1/3) aka 1/cbrt(x)


// A5
// ulp   2
// correctly:   72.750709% (18308316)
// faithfully:  27.207556% (6847006)
//   2 ulp:      0.041735% (10503)
// ~17 cycles
static inline float f32_rcbrt_k_1(float x, uint32_t ix)
{
  float y = f32_cbrt_ki(x, ix);

  return f32_rcbrt_newton_fast(x,y);
//return f32_rcbrt_newton_fast_m(x,y);
}

// A5: modified by
// * correctly rounded divison by 3
// ulp   1
// correctly:   88.337028% (22230742)
// faithfully:  11.662972% (2935083)
// ~17 cycles
static inline float f32_rcbrt_k_2(float x, uint32_t ix)
{
  float y = f32_cbrt_ki(x,ix);

  return f32_rcbrt_newton(x,y);
//return f32_rcbrt_newton_m(x,y);
}

// ulp   1
// correctly:   99.150757% (24952106)
// faithfully:   0.849243% (213719)
static inline float f32_rcbrt_k_3(float x, uint32_t ix)
{
  double y = (double)f32_cbrt_ki(x,ix);
  double d = (double)x;

  y = f64_rcbrt_newton_fast(d,y);

  return (float)y;
}

// appears correctly rounded: also appears to underperform
// vs. locally compiled fdlibm version YMMV. fdlibm version
// performs 2 floating-point divisions
static inline float f32_rcbrt_k_4(float x, uint32_t ix)
{
  double y = (double)f32_cbrt_ki(x,ix);
  double d = (double)x;

  y = f64_rcbrt_newton_fast(d,y);
  y = f64_rcbrt_newton_fast(d,y);

  return (float)y;
}


//*****************************************************
// expand full/positive only 1/cbrt

float f32_rcbrt_1(float x)  { return f32_cbrt_fx(x, &f32_rcbrt_k_1); }
float f32_rcbrt_2(float x)  { return f32_cbrt_fx(x, &f32_rcbrt_k_2); }
float f32_rcbrt_3(float x)  { return f32_cbrt_fx(x, &f32_rcbrt_k_3); }
float f32_rcbrt_4(float x)  { return f32_cbrt_fx(x, &f32_rcbrt_k_4); }

float f32_rcbrt_1p(float x) { return f32_cbrt_px(x, &f32_rcbrt_k_1); }
float f32_rcbrt_2p(float x) { return f32_cbrt_px(x, &f32_rcbrt_k_2); }
float f32_rcbrt_3p(float x) { return f32_cbrt_px(x, &f32_rcbrt_k_3); }
float f32_rcbrt_4p(float x) { return f32_cbrt_px(x, &f32_rcbrt_k_4); }

//*****************************************************

// for fast testing
float lib_rcbrt_dp(float x)
{
  return (float)(1.0/cbrt((double)x));
}


// compiler provided libm cbrtf
// ulp   1
// correctly:   89.394180% (22496783)
// faithfully:  10.605820% (2669042)
// ~60.991279 cycle
float lib_cbrtf(float x)
{
  return cbrtf(x);
}

// ulp   1
// correctly:   85.348766% (21478721)
// faithfully:  14.651234% (3687104)
// ~35.4 cycles
// WARNING:  The error skyrockets for small/large inputs
// (well up to 15 ulp error)
float lol_cbrt(float x)
{
  return powf(x, 1.f/3.f);
}

// compiler provided libm cbrtf + divide
// ulp   2
// correctly:   72.071696% (18137437)
// faithfully:  27.859262% (7011013)
//   2 ulp:      0.069042% (17375)
// ~ 64 cycles
float lib_rcbrt(float x)
{
  return 1.f/cbrtf(x);
}

// this is much better than I expected
// ulp   1
// correctly:   88.660451% (22312134)
// faithfully:  11.339549% (2853691)
// ~35.4 cycles
// WARNING:  The error skyrockets for small/large inputs
// (well up to 15 ulp error)
// NOTE: positive only as written
float lol_rcbrt(float x)
{
  return powf(x, -f32_third);
}

//**************************** start of musl

// correctly rounded : ~26.540119 cycles

static const unsigned
B1 = 709958130, /* B1 = (127-127.0/3-0.03306235651)*2**23 */
B2 = 642849266; /* B2 = (127-127.0/3-24/3-0.03306235651)*2**23 */

float fdlibm_cbrtf(float x)
{
  double_t r,T;
  union {float f; uint32_t i;} u = {x};
  uint32_t hx = u.i & 0x7fffffff;

  if (hx >= 0x7f800000)  /* cbrt(NaN,INF) is itself */
    return x + x;

  /* rough cbrt to 5 bits */
  if (hx < 0x00800000) {  /* zero or subnormal? */
    if (hx == 0)
      return x;  /* cbrt(+-0) is itself */
    u.f = x*0x1p24f;
    hx = u.i & 0x7fffffff;
    hx = hx/3 + B2;
  } else {
    hx = hx/3 + B1;
  }

  u.i &= 0x80000000;
  u.i |= hx;

  /*
   * First step Newton iteration (solving t*t-x/t == 0) to 16 bits.  In
   * double precision so that its terms can be arranged for efficiency
   * without causing overflow or underflow.
   */
  T = u.f;
  r = T*T*T;
  T = T*((double_t)x+x+r)/(x+r+r);

  /*
   * Second step Newton iteration to 47 bits.  In double precision for
   * efficiency and accuracy.
   */
  r = T*T*T;
  T = T*((double_t)x+x+r)/(x+r+r);

  /* rounding to 24 bits is perfect in round-to-nearest mode */
  return (float)T;
}

//**************************** end of musl

//********************************************************
// driver and testing junk below here

#define LENGTHOF(X) (sizeof(X)/sizeof(X[0]))

typedef struct {
  float (*f)(float);
  char*   name;
} func_entry_t;

#define STRINGIFY(S)  STRINGIFY_(S)
#define STRINGIFY_(S) #S
#define ENTRY(X) { .f=&X, .name=STRINGIFY(X) }


func_entry_t rcbrt_table[] =
{
#if !defined(WIP_RCBRT)

  // this is only for validating it's OK
#if !defined(USE_DOUBLES_LIKE_A_CHEATER)
  ENTRY(lib_rcbrt_dp),
#endif

  ENTRY(lib_rcbrt),

#if defined(TEST_FULL_RANGE)
  ENTRY(f32_rcbrt_1),
  ENTRY(f32_rcbrt_2),
  ENTRY(f32_rcbrt_3),
  ENTRY(f32_rcbrt_4),
#else
  ENTRY(lol_rcbrt),
  ENTRY(f32_rcbrt_1p),
  ENTRY(f32_rcbrt_2p),
  ENTRY(f32_rcbrt_3p),
  ENTRY(f32_rcbrt_4p),
#endif

#else
  ENTRY(WIP_RCBRT)
#endif
};

func_entry_t cbrt_table[] =
{
#if !defined(WIP_CBRT)

#if defined(TEST_FULL_RANGE)
  ENTRY(lib_cbrtf),
  ENTRY(fdlibm_cbrtf),
  ENTRY(f32_cbrt_1),
  ENTRY(f32_cbrt_2),
  ENTRY(f32_cbrt_3),
  ENTRY(f32_cbrt_4),
  ENTRY(f32_cbrt_5),
  //ENTRY(f32_cbrt_6),
  //ENTRY(f32_cbrt_7),
#else
  ENTRY(lol_cbrt),
  ENTRY(f32_cbrt_1p),
  ENTRY(f32_cbrt_2p),
  ENTRY(f32_cbrt_3p),
  ENTRY(f32_cbrt_4p),
  ENTRY(f32_cbrt_5p),
  //ENTRY(f32_cbrt_6),
  //ENTRY(f32_cbrt_7),
#endif

#else
  ENTRY(WIP_CBRT)
#endif
};


#undef ENTRY

//********************************************************


static inline uint32_t u32_abs(uint32_t x)
{
  return (int32_t)x >= 0 ? x : -x;
}

// ulp distance provided a & b are finite
// and have the same sign
static inline uint32_t f32_ulp_dist_ss(float a, float b)
{
  uint32_t ua = f32_to_bits(a);
  uint32_t ub = f32_to_bits(b);
  return u32_abs(ua-ub);
}

//**************************** start of core-math
// SEE: https://core-math.gitlabpages.inria.fr
// and license info at top of file.

// oh my!
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wpragmas"
#pragma GCC diagnostic ignored "-Wpedantic"
#pragma GCC diagnostic ignored "-Wunknown-warning-option"
#pragma GCC diagnostic ignored "-Wconversion"
#pragma GCC diagnostic ignored "-Wsign-conversion"
#pragma GCC diagnostic ignored "-Wshorten-64-to-32"
#pragma GCC diagnostic ignored "-Wimplicit-float-conversion"
#pragma GCC diagnostic ignored "-Wimplicit-int-conversion"
#pragma GCC diagnostic ignored "-Wfloat-conversion"
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"

#define INEXACTFLAG 0

typedef union {float f; uint32_t u;} b32u32_u;
typedef union {double f; uint64_t u;} b64u64_u;


float cr_cbrtf (float x){
  static const double escale[3] = {1.0, 0x1.428a2f98d728bp+0/* 2^(1/3) */, 0x1.965fea53d6e3dp+0/* 2^(2/3) */};
#if INEXACTFLAG!=0
  volatile uint32_t flag = _mm_getcsr(); /* store MXCSR Control/Status Register */
#endif
  b32u32_u cvt0 = {.f = x};
  uint32_t hx = cvt0.u, ix = 0x7fffffff&hx, e = ix>>23, mant = hx&0x7fffff;
  long sign = hx>>31;
  if(__builtin_expect(((e+1)&0xff)<2, 0)){
    if(e==0xff||ix==0) return x + x; /* 0, inf, nan */
    int nz = __builtin_clz(ix) - 8;  /* denormals */
    mant <<= nz;
    mant &= 0x7fffff;
    e -= nz - 1;
  }
  e += 899;
  b64u64_u cvt1 = {.u = (uint64_t)mant<<29|(0x3fful<<52)};
  uint32_t et = e/3, it = e%3;
  uint64_t isc = ((const uint64_t*)escale)[it];
  isc += (long)(et - 342)<<52;
  isc |= sign<<63;
  b64u64_u cvt2 = {.u = isc};
  static const double c[] = {0x1.1b0babccfef9cp-1, 0x1.2c9a3e94d1da5p-1, -0x1.4dc30b1a1ddbap-3, 0x1.7a8d3e4ec9b07p-6};
  const double u0 = 0x1.5555555555555p-2, u1 = 0x1.c71c71c71c71cp-3, u2 = 0x1.61f9add3c0ca4p-3;
  double z = cvt1.f, r = 1/z, z2 = z*z;
  double c0 = c[0] + z*c[1], c2 = c[2] + z*c[3], y = c0 + z2*c2, y2 = y*y;
  double w0 = y*u0, w1 = y*u1, w2 = y*u2;
  double h = y2*(y*r) - 1, h2 = h*h;
  y -= h*((w0 - w1*h) + w2*h2);
  y *= cvt2.f;
  b64u64_u cvt3 = {.f = y};
  float yf = y;
  long m0 = cvt3.u<<19, m1 = m0>>63;
  if(__builtin_expect((m0^m1)<(1l<<31),0)){
    b64u64_u cvt4 = {.u = (cvt3.u + (1ul<<31))&0xffffffff00000000ul};
    yf = cvt4.f;
#if INEXACTFLAG!=0
    _mm_setcsr(flag); /* restore MXCSR Control/Status Register for exact roots to get rid of the inexact flag if risen inside the function */
#endif
  }
  return yf;
}

#pragma GCC diagnostic pop

//**************************** end of core-math

void cbrt_test(func_entry_t* entry)
{
  // with x on [1,8] -> cbrt(x) on [1,2]
  // assumming test_start_value = 1. otherwise likewise
  // scaled to cover all significand bit pattern range on
  // the output.
  const uint32_t xs = f32_to_bits(test_start_value);
  const uint32_t xe = f32_to_bits(8.f*test_start_value);

  const uint32_t total = xe-xs+1;

#if defined(USE_MPFR)
  mpfr_t mx,mr;

  mpfr_init2(mx, 24);
  mpfr_init2(mr, 24);
#endif

  uint32_t error_u = 0;
  uint32_t error_cnt = 0;

  printf("starting: %s on [%08x, %08x]\n", entry->name, xs,xe);

  uint32_t error[max_ulp_dist+1] = {0};

  float (*f)(float) = entry->f;

  for(uint32_t xi=xs; xi<=xe; xi++) {
    float x  = f32_from_bits(xi);

    // compute correctly rounded
#if defined(USE_MPFR)
    mpfr_set_flt(mx, x,  MPFR_RNDN);          // no error
    mpfr_cbrt(mr, mx, MPFR_RNDN);             // RN(x^(1/3))

    float cr = mpfr_get_flt(mr, MPFR_RNDN);   // no error
#else
    float cr = cr_cbrtf(x);                   // core-math correctly rounded
#endif

    float r0 = f( x);

#if defined(TEST_FULL_RANGE)
    float r1 = f(-x);

    if (r0 != -r1) {
      printf("sign handling broken %a %a\n", r0,r1);
      exit(0);
    }
#endif

    if (r0 == cr)
      error[0]++;
    else {
      uint32_t u0 = f32_ulp_dist_ss(cr,r0);

      // track it if small enough for the buckets
      if (u0 <= max_ulp_dist)
	error[u0]++;
      else
	error_cnt++;

      if (u0 > error_u) error_u = u0;
    }
  }

#if defined(USE_MPFR)
  mpfr_clear(mr);
  mpfr_clear(mx);
#endif

  printf("// ulp   %d\n", error_u);
  printf("// correctly:  %10f%% (%d)\n", 100.0*(double)(error[0])/(double)(total), error[0]);

  if (error[1])
    printf("// faithfully: %10f%% (%d)\n", 100.0*(double)(error[1])/(double)(total), error[1]);

  for(uint32_t i=2; i<=max_ulp_dist; i++)
    if (error[i])
      printf("// %3d ulp:    %10f%% (%d)\n", i,100.0*(double)(error[i])/(double)(total), error[i]);

  if (error_cnt)
    printf("//  >%d ulp:    %10f%% (%d)\n", max_ulp_dist,100.0*(double)(error_cnt)/(double)(total), error_cnt);

  printf("\n");
}


void rcbrt_test(func_entry_t* entry)
{
  // with x on [1,8] -> 1/cbrt(x) on [1/2,1]
  // assumming test_start_value = 1. otherwise likewise
  // scaled to cover all significand bit pattern range on
  // the output.
  const uint32_t xs = f32_to_bits(test_start_value);
  const uint32_t xe = f32_to_bits(8.f*test_start_value);

  const uint32_t total = xe-xs+1;

#if !defined(USE_DOUBLES_LIKE_A_CHEATER)
  mpfr_t m1,mx,mr;

  mpfr_init2(m1, 24);
  mpfr_init2(mx, 128);
  mpfr_init2(mr, 24);

  mpfr_set_flt(m1, 1.f, MPFR_RNDN);
#endif

  uint32_t error_u = 0;
  uint32_t error_cnt = 0;

  printf("starting: %s on [%08x, %08x]\n", entry->name, xs,xe);

  uint32_t error[max_ulp_dist+1] = {0};

  float (*f)(float) = entry->f;

  for(uint32_t xi=xs; xi<=xe; xi++) {
    float x  = f32_from_bits(xi);

#if !defined(USE_DOUBLES_LIKE_A_CHEATER)
    // compute correctly rounded (well..sloppy actually)
    // I "should" write a test that checks if the MP
    // computation rounded up & rounded down both round
    // to same single precision value.
    mpfr_set_flt(mx, x,  MPFR_RNDN);        // no error
    mpfr_cbrt(mx, mx, MPFR_RNDN);           // t = RN(x^(1/3)) {128 bits}
    mpfr_div(mr,m1,mx, MPFR_RNDN);          // r = RN(1/t)     {128 bits}

    float cr = mpfr_get_flt(mr, MPFR_RNDN); // RN(t) {single precision}
#else
    float cr = lib_rcbrt_dp(x);
#endif

    float r0 = f(x);

#if defined(TEST_FULL_RANGE)
    float r1 = f(-x);

    if (r0 != -r1) {
      printf("sign handling broken %a %a\n", r0,r1);
      exit(0);
    }
#endif

    if (r0 == cr)
      error[0]++;
    else {
      uint32_t u0 = f32_ulp_dist_ss(cr,r0);

      // track it if small enough for the buckets
      if (u0 <= max_ulp_dist)
	error[u0]++;
      else
	error_cnt++;

      if (u0 > error_u) error_u = u0;
    }
  }

#if !defined(USE_DOUBLES_LIKE_A_CHEATER)
  mpfr_clear(mr);
  mpfr_clear(mx);
#endif

  printf("// ulp   %d\n", error_u);
  printf("// correctly:  %10f%% (%d)\n", 100.0*(double)(error[0])/(double)(total), error[0]);

  if (error[1])
    printf("// faithfully: %10f%% (%d)\n", 100.0*(double)(error[1])/(double)(total), error[1]);

  for(uint32_t i=2; i<=max_ulp_dist; i++)
    if (error[i])
      printf("// %3d ulp:    %10f%% (%d)\n", i,100.0*(double)(error[i])/(double)(total), error[i]);

  if (error_cnt)
    printf("//  >%d ulp:    %10f%% (%d)\n", max_ulp_dist,100.0*(double)(error_cnt)/(double)(total), error_cnt);

  printf("\n");
}

//********************************************************

int main(void)
{
  // compile time macros to bypass
#if !defined(NO_RCBRT)
  for(uint32_t i=0; i<LENGTHOF(rcbrt_table); i++)
    rcbrt_test(rcbrt_table+i);
#endif

#if !defined(NO_CBRT)
  for(uint32_t i=0; i<LENGTHOF(cbrt_table); i++)
    cbrt_test(cbrt_table+i);
#endif

  return 0;
}


## summary.md

      
    Raw
  

              summary.md
            
          
    Does not consider denormal inputs
reciprocal cube root (x^-⅓)


function
cycles
max
CR
FR
>ULP
Notes


lib_rcbrt
63.0
2
72.071696%
27.859262%
0.069042%
glibc 1/cbrtf


f32_rcbrt_1
16.2
2
72.746727%
27.211490%
0.041783%
paper version w mods


f32_rcbrt_2
17.5
1
88.337028%
11.662972%
---
CR div by 3


f32_rcbrt_3
20.1
1
99.150757%
0.849243%
---
double promote


f32_rcbrt_4
26.1
0
100.000000%
---
---
double promote + extra step


cube root (x^⅓)


function
cycles
max
CR
FR
>ULP
Notes


lib_cbrtf
60.6
1
89.394180%
10.605820%
---
glib


fdlibm_cbrtf
26.2
0
100.000000%
---
---
in source copy


f32_cbrt_1
17.3
3
52.266572%
45.591750%
2.141679%
author's 6.1 with mods


f32_cbrt_2
18.5
2
77.553369%
22.445829%
0.000803%
sqrt variant


f32_cbrt_3
19.7
1
80.880794%
19.119206%
---
sqrt + CR div by 3


f32_cbrt_4
21.3
1
98.239267%
1.760733%
---
double promote


f32_cbrt_5
28.2
0
100.000000%
---
---
double promote + extra step


For large & small 'x' ULP error grows to 15. However results are tighter in the neighboorhold on '1'
since paper's 1/cbrt(x) has much tighter results than their cbrt(x) I make a left turn and use  x*x^-⅓ = x^-⅔ and then use sqrt.


cycles = estimate of throughput CPU cycles (lies, damn lies and benchmarks. don't trust this at all)
max = max ULP error (integer distance)
CR/CR/>ULP = percentage of results that are correctly, faithfully and beyond rounded.
function	cycles	max	CR	FR	>ULP	Notes
lib_rcbrt	63.0	2	72.071696%	27.859262%	0.069042%	glibc 1/cbrtf
f32_rcbrt_1	16.2	2	72.746727%	27.211490%	0.041783%	paper version w mods
f32_rcbrt_2	17.5	1	88.337028%	11.662972%	---	CR div by 3
f32_rcbrt_3	20.1	1	99.150757%	0.849243%	---	double promote
f32_rcbrt_4	26.1	0	100.000000%	---	---	double promote + extra step
function	cycles	max	CR	FR	>ULP	Notes
lib_cbrtf	60.6	1	89.394180%	10.605820%	---	glib
fdlibm_cbrtf	26.2	0	100.000000%	---	---	in source copy
f32_cbrt_1	17.3	3	52.266572%	45.591750%	2.141679%	author's 6.1 with mods
f32_cbrt_2	18.5	2	77.553369%	22.445829%	0.000803%	sqrt variant
f32_cbrt_3	19.7	1	80.880794%	19.119206%	---	sqrt + CR div by 3
f32_cbrt_4	21.3	1	98.239267%	1.760733%	---	double promote
f32_cbrt_5	28.2	0	100.000000%	---	---	double promote + extra step