Skip to content

Instantly share code, notes, and snippets.

@Marc-B-Reynolds
Created May 10, 2020 07:24
Show Gist options
  • Save Marc-B-Reynolds/707db8df879e5e34ea21d3c13b782221 to your computer and use it in GitHub Desktop.
Save Marc-B-Reynolds/707db8df879e5e34ea21d3c13b782221 to your computer and use it in GitHub Desktop.
hacky scalar branch-free tanpi (not intended for use)
// Public Domain under http://unlicense.org, see link for details.
#include <stdint.h>
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <string.h>
#include "xmmintrin.h"
#include "wmmintrin.h"
#include "x86intrin.h"
#define RANDOMIZE
// histogram size
#define HLEN 127
#define TLEN 0xFFFFFF
// external code: xoroshiro128+
inline uint32_t f32_to_bits(float x)
{
uint32_t u; memcpy(&u,&x,4); return u;
}
inline float f32_from_bits(uint32_t x)
{
float u; memcpy(&u,&x,4); return u;
}
inline uint32_t u32_abs(uint32_t x)
{
return (int32_t)x >= 0 ? x : -x;
}
inline uint32_t f32_ulp_dist(float a, float b)
{
uint32_t ua = f32_to_bits(a);
uint32_t ub = f32_to_bits(b);
uint32_t s = ub^ua;
if ((int32_t)s >= 0)
return u32_abs(ua-ub);
return ua+ub+0x80000000;
}
//*******************
uint64_t rng_state[2];
inline uint64_t rotl(const uint64_t v, int i)
{
return (v << i)|(v >> (64-i));
}
inline uint64_t rng_u64(void)
{
uint64_t s0 = rng_state[0];
uint64_t s1 = rng_state[1];
uint64_t r = s0 + s1;
s1 ^= s0;
rng_state[0] = rotl(s0,55) ^ s1 ^ (s1<<14);
rng_state[1] = rotl(s1,36);
return r;
}
// end: xoroshiro128+
// double on [0,1)
inline double rng_f64(void)
{
return (double)(rng_u64() >> (64-53))*0x1p-53;
}
inline float rng_f32(void)
{
return (float)(rng_u64() >> (64-24))*0x1p-24f;
}
void reset_generators(uint64_t s0, uint64_t s1)
{
rng_state[0] = s0;
rng_state[1] = s1;
rng_u64();
}
typedef struct {float a,b; } f32_pair_t;
inline float f32_xor(float a, float b) { return f32_from_bits(f32_to_bits(a) ^ f32_to_bits(b)); }
inline float f32_mulsign(float v, uint32_t s) { return f32_from_bits(f32_to_bits(v)^s); }
inline float f32_mask_select(uint32_t m, float a, float b)
{
return f32_from_bits((f32_to_bits(a) & m)|(f32_to_bits(b) & ~m));
}
float f32_sin_pi_3_k[] = {0x1.91f5aap1f, -0x1.408cf2p2f};
float f32_sin_pi_5_k[] = {0x1.921f8ep1f, -0x1.4aa618p2f, 0x1.3f3f3cp1f};
float f32_sin_pi_7_k[] = {0x1.921fb6p1f, -0x1.4abbecp2f, 0x1.466b2p1f, -0x1.2f5992p-1f};
float f32_cos_pi_4_k[] = {0x1.fffe74p-1f, -0x1.3ba0ecp2f, 0x1.f73ff8p1f};
float f32_cos_pi_6_k[] = {0x1.fffffep-1f, -0x1.3bd37ep2f, 0x1.03acccp2f, -0x1.4dfd3ap0f};
float f32_cos_pi_8_k[] = {0x1p0f, -0x1.3bd3ccp2f, 0x1.03c1b8p2f, -0x1.55b7cep0f, 0x1.d684aap-3f};
// tan(pi x)
// spitball of max relative error 2.38419*10^-7 (2 ulp)
float f32_tanpi(float a)
{
static const float* S = f32_sin_pi_7_k;
static const float* C = f32_cos_pi_6_k;
float c,s,a2,a3,r;
uint32_t q,sgn;
r = rintf(a+a);
a = fmaf(r,-0.5f,a);
q = (uint32_t)r; q = -(q&1);
sgn = q << 31;
a2 = a*a;
c = fmaf(C[3], a2, C[2]); s = fmaf(S[3], a2, S[2]); a3 = a2*a;
c = fmaf(c, a2, C[1]); s = fmaf(s, a2, S[1]);
c = fmaf(c, a2, C[0]); s = a3 * s;
c = f32_mulsign(c,sgn); s = fmaf(a, S[0], s);
float n = f32_mask_select(q,c,s);
float d = f32_mask_select(q,s,c);
return n/d;
}
// tan(pi x)
// spitball of max relative error ~0.0000207424 (174 ulp)
float f32_tanpi_r(float a)
{
static const float* S = f32_sin_pi_5_k;
static const float* C = f32_cos_pi_4_k;
float c,s,a2,a3,r;
uint32_t q,sgn;
r = rintf(a+a);
a = fmaf(r,-0.5f,a);
q = (uint32_t)r;
sgn = q << 31; q = -(q&1);
a2 = a*a;
c = fmaf(C[2], a2, C[1]); s = fmaf(S[2], a2, S[1]); a3 = a2*a;
c = fmaf(c, a2, C[0]); s = a3 * s;
c = f32_mulsign(c,sgn); s = fmaf(a, S[0], s);
float n = f32_mask_select(q,c,s);
float d = f32_mask_select(q,s,c);
return n/d;
}
#define HISTO_LEN 177
#define LENGTHOF(X) (sizeof(X)/sizeof(X[0]))
void test()
{
uint32_t histo[HISTO_LEN] = {0};
uint32_t maxD = 0;
// bordering on useless perf test
#if 0
double total=0.0;
double min = 0x1p23f;
uint64_t t0;
uint64_t t1;
for(uint32_t j=0; j<32; j++) {
t0 = _rdtsc();
float x0 = rng_f32(); x0 = 2.f*x0-1.f;
float t = 0.f;
for(uint32_t i=0; i<0xffff; i++) {
float x0 = rng_f32(); x0 = 2.f*x0-1.f;
float tr = f32_tanpi_r(x0);
t += tr;
}
t1 = _rdtsc();
double dt = (double)(t1-t0)*(1.0/65536.0);
if (min > dt) min = dt;
total += dt;
printf("time: %f (%f)\n", dt, t);
}
printf(" %f %f\n", total*(1.0/32.0), min);
#endif
// questionable (to be generous) error testing
// libm tan error bound is unknown but assumes
// tan(PI*x) is correctly rounded and ignores
// the double rounding when dropping to singles
for(uint32_t i=0; i<0xffffff; i++) {
#if 1
float x0 = rng_f32(); x0 = 2.f*x0-1.f;
// grow the range a bit
//x0 *= 16.f;
#else
float x0 = 0.25f*rng_f32();
#endif
float t0,t1;
double x1 = M_PI*x0;
t1 = (float)tan(x1);
t0 = f32_tanpi(x0);
// simple instead of ss version & fixup
uint32_t diff = f32_ulp_dist(t0,t1);
#if 1
if (diff > maxD /*|| diff >= HISTO_LEN*/) {
maxD = diff;
printf("%14a (%14e)| % f % f | %14a %14a | %2u\n", x0,x0,t1,t0,t1,t0,diff);
}
#endif
diff = (diff < HISTO_LEN) ? diff : HISTO_LEN-1;
histo[diff]++;
}
printf("\ntan {");
for(uint32_t i=0; i<HISTO_LEN; i++) {
printf("%u,", histo[i]);
}
printf("}\n");
}
int main(void)
{
uint64_t s0;
uint64_t s1;
#ifdef RANDOMIZE
s0 = _rdtsc();
#else
s0 = 0x77535345;
#endif
s1 = 0x1234567;
reset_generators(s0,s1);
test();
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment