Skip to content

Instantly share code, notes, and snippets.

@syoyo
Created June 12, 2015 04:24
Show Gist options
  • Save syoyo/ef68a9c5b46b040e88db to your computer and use it in GitHub Desktop.
Save syoyo/ef68a9c5b46b040e88db to your computer and use it in GitHub Desktop.
exp() approximate function on Epiphany.
The clock cycle count for "expf()" (reference) is 141645.
The clock cycle count for "expapprox()" is 74.
The clock cycle count for "expapprox4()" is 127 (127 / 4 ≈ 31 cycles per value).
// GCC spelling of the `restrict` pointer qualifier, wrapped in a macro.
// Note: expapprox4() in this file writes __restrict__ directly instead of
// going through this macro.
#define RESTRICT __restrict__
// Compile-time switch for the exp() approximations in this file.
// (0): the intermediate value is clamped before it is converted to int.
// Non-zero: the clamp is compiled out; the author notes this makes the
// evaluation of exp() faster, but out-of-range inputs are then not handled.
#define FMATH_EXP_DISABLE_RANGE_CHECK (0)
// Based on http://gallium.inria.fr/blog/fast-vectorizable-math-approx/
/*
 * Fast approximation of exp(val) for a single float.
 *
 * Method: val is scaled by 12102203.16 (about 2^23 / ln 2) and offset by
 * 1065353216 (0x3F800000, the bit pattern of 1.0f). Converted to an integer,
 * that number is treated as a float bit pattern: its exponent field gives a
 * power-of-two scale, and its mantissa field (re-biased into [1, 2)) feeds a
 * degree-4 polynomial that supplies the fractional part of the result.
 *
 * Accuracy (as stated by the original author): relative error bounded by
 * 1e-5 for normalized outputs. NaN inputs produce invalid outputs.
 *
 * Unless FMATH_EXP_DISABLE_RANGE_CHECK is non-zero, the intermediate value is
 * clamped to [0, 2139095040] (0x7F800000, the bit pattern of +infinity), so
 * very negative inputs give 0 and very large inputs give +infinity.
 */
float expapprox(float val) {
  union { int i; float f; } pow2_scale, mantissa;
  float bits_as_float;
  float poly;
  float b;
  int bits;

  bits_as_float = 12102203.1615614f * val + 1065353216.f;

#if !FMATH_EXP_DISABLE_RANGE_CHECK
  {
    /* Kept as named locals: the original notes this works around a lack
       of optimization in gcc. */
    const float upper_limit = 2139095040.f;
    const float lower_limit = 0.f;

    /* Written with negated comparisons so that a NaN falls through to the
       limit, exactly as a "x < hi ? x : hi" selection does. */
    if (!(bits_as_float < upper_limit)) {
      bits_as_float = upper_limit;
    }
    if (!(bits_as_float > lower_limit)) {
      bits_as_float = lower_limit;
    }
  }
#endif

  bits = (int) bits_as_float;

  /* Exponent field only: a pure power of two. */
  pow2_scale.i = bits & 0x7F800000;
  /* Mantissa field with the exponent of 1.0f: a value in [1, 2). */
  mantissa.i = (bits & 0x7FFFFF) | 0x3F800000;
  b = mantissa.f;

  /* Horner evaluation, highest-order coefficient first. */
  poly = -2.12528370320796966552734375e-3f + b * 1.3534179888665676116943359375e-2f;
  poly = 0.1666135489940643310546875f + b * poly;
  poly = 0.3120158612728118896484375f + b * poly;
  poly = 0.509964287281036376953125f + b * poly;

  return pow2_scale.f * poly;
}
/*
 * Approximates exp() for four floats at once: reads src[0..3] and writes the
 * results to dst[0..3]. This is a manual 4x expansion of expapprox(); each
 * lane performs the same steps:
 *   1. scale and offset the input so that, as an integer, it looks like a
 *      float bit pattern (12102203.16 is about 2^23 / ln 2, 1065353216 is
 *      0x3F800000, the bit pattern of 1.0f);
 *   2. unless FMATH_EXP_DISABLE_RANGE_CHECK is non-zero, clamp that value to
 *      [0, 2139095040] (0x7F800000, the bit pattern of +infinity);
 *   3. split the integer into its exponent field (a power-of-two scale) and
 *      its mantissa field (re-biased into [1, 2));
 *   4. multiply the scale by a degree-4 polynomial in the mantissa value.
 *
 * dst and src are __restrict__-qualified: the caller must pass
 * non-overlapping buffers, each holding at least four floats.
 */
void expapprox4(float* __restrict__ dst, const float* __restrict__ src) {
  /* Polynomial coefficients, lowest order first. */
  const float c0 = 0.509964287281036376953125f;
  const float c1 = 0.3120158612728118896484375f;
  const float c2 = 0.1666135489940643310546875f;
  const float c3 = -2.12528370320796966552734375e-3f;
  const float c4 = 1.3534179888665676116943359375e-2f;
#if !FMATH_EXP_DISABLE_RANGE_CHECK
  /* Workaround a lack of optimization in gcc (original author's note):
     the clamp limits are kept as named locals. Declared only when the
     range check is compiled in, so they are never unused. */
  const float exp_cst1 = 2139095040.f;
  const float exp_cst2 = 0.f;
#endif
  union { int i; float f; } xu_0, xu_1, xu_2, xu_3, xu2_0, xu2_1, xu2_2, xu2_3;
  float val4_0, val4_1, val4_2, val4_3;
  float b0, b1, b2, b3;
  int val4i_0, val4i_1, val4i_2, val4i_3;

  /* Step 1: scale + offset. */
  val4_0 = 12102203.1615614f*src[0]+1065353216.f;
  val4_1 = 12102203.1615614f*src[1]+1065353216.f;
  val4_2 = 12102203.1615614f*src[2]+1065353216.f;
  val4_3 = 12102203.1615614f*src[3]+1065353216.f;

#if !FMATH_EXP_DISABLE_RANGE_CHECK
  /* Step 2: clamp. The "x < hi ? x : hi" form sends NaN to the upper
     limit, matching expapprox(). */
  val4_0 = val4_0 < exp_cst1 ? val4_0 : exp_cst1;
  val4_1 = val4_1 < exp_cst1 ? val4_1 : exp_cst1;
  val4_2 = val4_2 < exp_cst1 ? val4_2 : exp_cst1;
  val4_3 = val4_3 < exp_cst1 ? val4_3 : exp_cst1;
  val4_0 = val4_0 > exp_cst2 ? val4_0 : exp_cst2;
  val4_1 = val4_1 > exp_cst2 ? val4_1 : exp_cst2;
  val4_2 = val4_2 > exp_cst2 ? val4_2 : exp_cst2;
  val4_3 = val4_3 > exp_cst2 ? val4_3 : exp_cst2;
#endif

  /* Step 3: convert once per lane, then split exponent and mantissa. */
  val4i_0 = (int)val4_0;
  val4i_1 = (int)val4_1;
  val4i_2 = (int)val4_2;
  val4i_3 = (int)val4_3;

  xu_0.i = val4i_0 & 0x7F800000;
  xu_1.i = val4i_1 & 0x7F800000;
  xu_2.i = val4i_2 & 0x7F800000;
  xu_3.i = val4i_3 & 0x7F800000;

  xu2_0.i = (val4i_0 & 0x7FFFFF) | 0x3F800000;
  xu2_1.i = (val4i_1 & 0x7FFFFF) | 0x3F800000;
  xu2_2.i = (val4i_2 & 0x7FFFFF) | 0x3F800000;
  xu2_3.i = (val4i_3 & 0x7FFFFF) | 0x3F800000;

  b0 = xu2_0.f;
  b1 = xu2_1.f;
  b2 = xu2_2.f;
  b3 = xu2_3.f;

  /* Step 4: Horner evaluation of the polynomial, scaled by the power of two. */
  dst[0] = xu_0.f * (c0 + b0 * (c1 + b0 * (c2 + b0 * (c3 + b0 * c4))));
  dst[1] = xu_1.f * (c0 + b1 * (c1 + b1 * (c2 + b1 * (c3 + b1 * c4))));
  dst[2] = xu_2.f * (c0 + b2 * (c1 + b2 * (c2 + b2 * (c3 + b2 * c4))));
  dst[3] = xu_3.f * (c0 + b3 * (c1 + b3 * (c2 + b3 * (c3 + b3 * c4))));
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment