Marc-B-Reynolds/tanpi_hack.c

## tanpi_hack.c
// Public Domain under http://unlicense.org, see link for details.

#include <stdint.h>
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <string.h>

#include "xmmintrin.h"
#include "wmmintrin.h"
#include "x86intrin.h"

#define RANDOMIZE

// number of trials
#define TLEN UINT64_C(0x000000000fffffff)

// gen 'x' on +/-RANGE
#define RANGE 1.f


// use
#define SHORTER

// external code: xoroshiro128+

inline uint32_t f32_to_bits(float x)
{
  uint32_t u; memcpy(&u,&x,4); return u;
}

inline float f32_from_bits(uint32_t x)
{
  float u; memcpy(&u,&x,4); return u;
}

inline uint64_t f64_to_bits(double x)
{
  uint64_t u; memcpy(&u,&x,8); return u;
}

inline double f64_from_bits(uint64_t x)
{
  double d; memcpy(&d,&x,8); return d;
}

//
inline uint64_t u64_abs(uint64_t x)
{
  return (int64_t)x >= 0 ? x : -x;
}
inline uint32_t u32_abs(uint32_t x)
{
  return (int32_t)x >= 0 ? x : -x;
}


// ulp distance provided a & b are finite
// and have the same sign
inline uint64_t f64_ulp_dist_ss(double a, double b)
{
  uint64_t ua = f64_to_bits(a);
  uint64_t ub = f64_to_bits(b);
  return u64_abs(ua-ub);
}

inline uint32_t f32_ulp_dist_ss(float a, float b)
{
  uint32_t ua = f32_to_bits(a);
  uint32_t ub = f32_to_bits(b);
  return u32_abs(ua-ub);
}


inline uint32_t f32_ulp_dist(float a, float b)
{
  uint32_t ua = f32_to_bits(a);
  uint32_t ub = f32_to_bits(b);
  uint32_t s  = ub^ua;

  if ((int32_t)s >= 0)
    return u32_abs(ua-ub);

  return ua+ub+0x80000000;
}

//*******************

uint64_t rng_state[2];

inline uint64_t rotl(const uint64_t v, int i)
{
  return (v << i)|(v >> (64-i));
}

inline uint64_t rng_u64(void)
{
  uint64_t s0 = rng_state[0];
  uint64_t s1 = rng_state[1];
  uint64_t r  = s0 + s1;

  s1 ^= s0;
  rng_state[0] = rotl(s0,55) ^ s1 ^ (s1<<14);
  rng_state[1] = rotl(s1,36);

  return r;
}
// end: xoroshiro128+

// unbiased uniform signed integer (in floating-point) [-2^p,2^p-1]
inline float  rng_f32_si() { return (float) ((int64_t)rng_u64() >> (64-25)); }
inline double rng_f64_si() { return (double)((int64_t)rng_u64() >> (64-54)); }

// unbiased uniform floating-point values on [-1,1)
inline float  rng_f32_s()  { return rng_f32_si() * 0x1p-24f; }
inline double rng_f64_s()  { return rng_f64_si() * 0x1p-53;  }

// unbiased uniform floating-point values on (-1,1]
inline float  rng_f32_sez() { return fmaf(rng_f32_si(), 0x1p-24f, 0x1p-24f); }
inline double rng_f64_sez() { return fma (rng_f64_si(), 0x1p-53,  0x1p-53 ); }


void reset_generators(uint64_t s0, uint64_t s1)
{
  rng_state[0] = s0;
  rng_state[1] = s1;
  rng_u64();
}

typedef struct {float a,b; } f32_pair_t;

inline float f32_xor(float a, float b) { return f32_from_bits(f32_to_bits(a) ^ f32_to_bits(b)); }

inline float f32_mulsign(float v, uint32_t s) { return f32_from_bits(f32_to_bits(v)^s); }


#if defined(SHORTER)
// constants for sin(pi x) and cos(pi x) for x on [-1/4,1/4]
const float f32_sinpi_7_k[] = { 0x1.921fb6p1f,  -0x1.4abbecp2f, 0x1.466b2p1f,  -0x1.2f5992p-1f };
const float f32_cospi_6_k[] = { 0x1.fffffep-1f, -0x1.3bd37ep2f, 0x1.03acccp2f, -0x1.4dfd3ap0f  };

void f32_sincospi(float* dst, float a)
{
  static const float* S = f32_sinpi_7_k;
  static const float* C = f32_cospi_6_k;

  float    c,s,a2,a3,r;
  uint32_t q,sx,sy;

  r  = rintf(a+a);
  a  = fmaf(r,-0.5f,a);
  q  = (uint32_t)r;
  a2 = a*a;
  sy = (q<<31); sx = (q>>1)<<31; sy ^= sx; q &= 1;

  c  = fmaf(C[3], a2, C[2]); s = fmaf(S[3], a2, S[2]); a3 = a2*a;
  c  = fmaf(c,    a2, C[1]); s = fmaf(s,    a2, S[1]);
  c  = fmaf(c,    a2, C[0]); s = a3 * s;
  c  = f32_mulsign(c,sx);    s = fmaf(a,    S[0], s);
                             s = f32_mulsign(s,sy);

  dst[q  ] = c;
  dst[q^1] = s;
}
#else
// constants for sin(pi x) and cos(pi x) for x on [-1/4,1/4]
const float f32_sinpi_7_k[] = { 0x1.921fb6p1f, -0x1.4abbecp2f,  0x1.466b2p1f, -0x1.2f5992p-1f };
const float f32_cospi_8_k[] = {-0x1.3bd3ccp2f,  0x1.03c1b8p2f, -0x1.55b7cep0f, 0x1.d684aap-3f };


// dst[0] = cos(pi a), dst[1] = sin(pi a)
void f32_sincospi(float* dst, float a)
{
  static const float* S = f32_sinpi_7_k;
  static const float* C = f32_cospi_8_k;

  float    c,s,a2,a3,r;
  uint32_t q,sx,sy;

  // range reduce
  r  = rintf(a+a);
  a  = fmaf(r,-0.5f,a);

  // setting up for reconstruct
  q  = (uint32_t)r;
  sy = (q<<31); sx = (q>>1)<<31; sy ^= sx; q &= 1;

  // compute the approximations
  a2 = a*a;
  c  = fmaf(C[3], a2, C[2]); s = fmaf(S[3], a2, S[2]); a3 = a2*a;
  c  = fmaf(c,    a2, C[1]); s = fmaf(s,    a2, S[1]);
  c  = fmaf(c,    a2, C[0]); s = a3 * s;
  c  = fmaf(c,    a2,  1.f); s = fmaf(a,    S[0], s);
  c  = f32_mulsign(c,sx);    s = f32_mulsign(s,sy);

  dst[q  ] = c;
  dst[q^1] = s;
}
#endif


#define HISTO_LEN 16

#define LENGTHOF(X) (sizeof(X)/sizeof(X[0]))


void tanfoo()
{
  uint32_t histo[HISTO_LEN] = {0};

  for(uint64_t i=0; i<TLEN; i++) {
    f32_pair_t p;
    float x0 = RANGE*(rng_f32_s());
    float t0;
    float t1;

    // just cheating a bit and treating 'double' computation as
    // correct. one gotch here is integer multiples of pi/2,
    // the double version can't do that and the approximation
    // well be correct.
    double x1 = M_PI*x0;
    t1 = (float)tan(x1);
    f32_sincospi((float*)&p,x0);
    t0 = p.b/p.a;

    uint32_t diff = f32_ulp_dist(t0,t1);
    uint32_t id   = (diff < HISTO_LEN) ? diff : HISTO_LEN-1;

    histo[id]++;

    // dump first 'n' example of each ulp diff
    if (histo[id] < 10) {

      // double's can't represent multiples of (pi/2) so filter out when 'x0' is.
      // (the issue is no tanpi function)
      if ((id != (HISTO_LEN-1)) || (floorf(x0+x0) != ceilf(x0+x0)))
	printf("%14f | % 15e % 15e | %15a %15a | %5u | %016lx%016lx\n", x0, t1,t0, t1,t0, diff,rng_state[0],rng_state[1]);
    }
  }

  printf("\ntan {");
  for(uint32_t i=0; i<HISTO_LEN; i++) {
    printf("%u,", histo[i]);
  }
  printf("}\n");
}


int main(void)
{
  uint64_t s0;
  uint64_t s1;

  s0  = UINT64_C(0xcb9c59b3f9f87d4d);
  s1  = UINT64_C(0x2545f4914f6cdd1d);
  s0 ^= _rdtsc();

  float t = -0.5f;

  printf("%f %f : %u\n", floorf(t+t),ceilf(t+t), floorf(t+t) != ceilf(t+t));


  printf("seed: %016lx:%016lx\n\n",s1,s0);
  printf("%14s | %15s %15s | %15s %15s | %5s\n","x","ref","approx","ref","approx","diff");

  reset_generators(s0,s1);

  tanfoo();

  return 0;
}
	// Public Domain under http://unlicense.org, see link for details.

	#include <stdint.h>
	#include <stdlib.h>
	#include <stdio.h>
	#include <math.h>
	#include <string.h>

	#include "xmmintrin.h"
	#include "wmmintrin.h"
	#include "x86intrin.h"

	#define RANDOMIZE

	// number of trials
	#define TLEN UINT64_C(0x000000000fffffff)

	// gen 'x' on +/-RANGE
	#define RANGE 1.f


	// use
	#define SHORTER

	// external code: xoroshiro128+

	inline uint32_t f32_to_bits(float x)
	{
	uint32_t u; memcpy(&u,&x,4); return u;
	}

	inline float f32_from_bits(uint32_t x)
	{
	float u; memcpy(&u,&x,4); return u;
	}

	inline uint64_t f64_to_bits(double x)
	{
	uint64_t u; memcpy(&u,&x,8); return u;
	}

	inline double f64_from_bits(uint64_t x)
	{
	double d; memcpy(&d,&x,8); return d;
	}

	//
	inline uint64_t u64_abs(uint64_t x)
	{
	return (int64_t)x >= 0 ? x : -x;
	}
	inline uint32_t u32_abs(uint32_t x)
	{
	return (int32_t)x >= 0 ? x : -x;
	}


	// ulp distance provided a & b are finite
	// and have the same sign
	inline uint64_t f64_ulp_dist_ss(double a, double b)
	{
	uint64_t ua = f64_to_bits(a);
	uint64_t ub = f64_to_bits(b);
	return u64_abs(ua-ub);
	}

	inline uint32_t f32_ulp_dist_ss(float a, float b)
	{
	uint32_t ua = f32_to_bits(a);
	uint32_t ub = f32_to_bits(b);
	return u32_abs(ua-ub);
	}


	inline uint32_t f32_ulp_dist(float a, float b)
	{
	uint32_t ua = f32_to_bits(a);
	uint32_t ub = f32_to_bits(b);
	uint32_t s = ub^ua;

	if ((int32_t)s >= 0)
	return u32_abs(ua-ub);

	return ua+ub+0x80000000;
	}

	//*******************

	uint64_t rng_state[2];

	inline uint64_t rotl(const uint64_t v, int i)
	{
	return (v << i)\|(v >> (64-i));
	}

	inline uint64_t rng_u64(void)
	{
	uint64_t s0 = rng_state[0];
	uint64_t s1 = rng_state[1];
	uint64_t r = s0 + s1;

	s1 ^= s0;
	rng_state[0] = rotl(s0,55) ^ s1 ^ (s1<<14);
	rng_state[1] = rotl(s1,36);

	return r;
	}
	// end: xoroshiro128+

	// unbiased uniform signed integer (in floating-point) [-2^p,2^p-1]
	inline float rng_f32_si() { return (float) ((int64_t)rng_u64() >> (64-25)); }
	inline double rng_f64_si() { return (double)((int64_t)rng_u64() >> (64-54)); }

	// unbiased uniform floating-point values on [-1,1)
	inline float rng_f32_s() { return rng_f32_si() * 0x1p-24f; }
	inline double rng_f64_s() { return rng_f64_si() * 0x1p-53; }

	// unbiased uniform floating-point values on (-1,1]
	inline float rng_f32_sez() { return fmaf(rng_f32_si(), 0x1p-24f, 0x1p-24f); }
	inline double rng_f64_sez() { return fma (rng_f64_si(), 0x1p-53, 0x1p-53 ); }


	void reset_generators(uint64_t s0, uint64_t s1)
	{
	rng_state[0] = s0;
	rng_state[1] = s1;
	rng_u64();
	}

	typedef struct {float a,b; } f32_pair_t;

	inline float f32_xor(float a, float b) { return f32_from_bits(f32_to_bits(a) ^ f32_to_bits(b)); }

	inline float f32_mulsign(float v, uint32_t s) { return f32_from_bits(f32_to_bits(v)^s); }


	#if defined(SHORTER)
	// constants for sin(pi x) and cos(pi x) for x on [-1/4,1/4]
	const float f32_sinpi_7_k[] = { 0x1.921fb6p1f, -0x1.4abbecp2f, 0x1.466b2p1f, -0x1.2f5992p-1f };
	const float f32_cospi_6_k[] = { 0x1.fffffep-1f, -0x1.3bd37ep2f, 0x1.03acccp2f, -0x1.4dfd3ap0f };

	void f32_sincospi(float* dst, float a)
	{
	static const float* S = f32_sinpi_7_k;
	static const float* C = f32_cospi_6_k;

	float c,s,a2,a3,r;
	uint32_t q,sx,sy;

	r = rintf(a+a);
	a = fmaf(r,-0.5f,a);
	q = (uint32_t)r;
	a2 = a*a;
	sy = (q<<31); sx = (q>>1)<<31; sy ^= sx; q &= 1;

	c = fmaf(C[3], a2, C[2]); s = fmaf(S[3], a2, S[2]); a3 = a2*a;
	c = fmaf(c, a2, C[1]); s = fmaf(s, a2, S[1]);
	c = fmaf(c, a2, C[0]); s = a3 * s;
	c = f32_mulsign(c,sx); s = fmaf(a, S[0], s);
	s = f32_mulsign(s,sy);

	dst[q ] = c;
	dst[q^1] = s;
	}
	#else
	// constants for sin(pi x) and cos(pi x) for x on [-1/4,1/4]
	const float f32_sinpi_7_k[] = { 0x1.921fb6p1f, -0x1.4abbecp2f, 0x1.466b2p1f, -0x1.2f5992p-1f };
	const float f32_cospi_8_k[] = {-0x1.3bd3ccp2f, 0x1.03c1b8p2f, -0x1.55b7cep0f, 0x1.d684aap-3f };


	// dst[0] = cos(pi a), dst[1] = sin(pi a)
	void f32_sincospi(float* dst, float a)
	{
	static const float* S = f32_sinpi_7_k;
	static const float* C = f32_cospi_8_k;

	float c,s,a2,a3,r;
	uint32_t q,sx,sy;

	// range reduce
	r = rintf(a+a);
	a = fmaf(r,-0.5f,a);

	// setting up for reconstruct
	q = (uint32_t)r;
	sy = (q<<31); sx = (q>>1)<<31; sy ^= sx; q &= 1;

	// compute the approximations
	a2 = a*a;
	c = fmaf(C[3], a2, C[2]); s = fmaf(S[3], a2, S[2]); a3 = a2*a;
	c = fmaf(c, a2, C[1]); s = fmaf(s, a2, S[1]);
	c = fmaf(c, a2, C[0]); s = a3 * s;
	c = fmaf(c, a2, 1.f); s = fmaf(a, S[0], s);
	c = f32_mulsign(c,sx); s = f32_mulsign(s,sy);

	dst[q ] = c;
	dst[q^1] = s;
	}
	#endif


	#define HISTO_LEN 16

	#define LENGTHOF(X) (sizeof(X)/sizeof(X[0]))


	void tanfoo()
	{
	uint32_t histo[HISTO_LEN] = {0};

	for(uint64_t i=0; i<TLEN; i++) {
	f32_pair_t p;
	float x0 = RANGE*(rng_f32_s());
	float t0;
	float t1;

	// just cheating a bit and treating 'double' computation as
	// correct. one gotch here is integer multiples of pi/2,
	// the double version can't do that and the approximation
	// well be correct.
	double x1 = M_PI*x0;
	t1 = (float)tan(x1);
	f32_sincospi((float*)&p,x0);
	t0 = p.b/p.a;

	uint32_t diff = f32_ulp_dist(t0,t1);
	uint32_t id = (diff < HISTO_LEN) ? diff : HISTO_LEN-1;

	histo[id]++;

	// dump first 'n' example of each ulp diff
	if (histo[id] < 10) {

	// double's can't represent multiples of (pi/2) so filter out when 'x0' is.
	// (the issue is no tanpi function)
	if ((id != (HISTO_LEN-1)) \|\| (floorf(x0+x0) != ceilf(x0+x0)))
	printf("%14f \| % 15e % 15e \| %15a %15a \| %5u \| %016lx%016lx\n", x0, t1,t0, t1,t0, diff,rng_state[0],rng_state[1]);
	}
	}

	printf("\ntan {");
	for(uint32_t i=0; i<HISTO_LEN; i++) {
	printf("%u,", histo[i]);
	}
	printf("}\n");
	}


	int main(void)
	{
	uint64_t s0;
	uint64_t s1;

	s0 = UINT64_C(0xcb9c59b3f9f87d4d);
	s1 = UINT64_C(0x2545f4914f6cdd1d);
	s0 ^= _rdtsc();

	float t = -0.5f;

	printf("%f %f : %u\n", floorf(t+t),ceilf(t+t), floorf(t+t) != ceilf(t+t));


	printf("seed: %016lx:%016lx\n\n",s1,s0);
	printf("%14s \| %15s %15s \| %15s %15s \| %5s\n","x","ref","approx","ref","approx","diff");

	reset_generators(s0,s1);

	tanfoo();

	return 0;
	}