Created
June 12, 2015 04:24
-
-
Save syoyo/ef68a9c5b46b040e88db to your computer and use it in GitHub Desktop.
exp() approximate function on Epiphany.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
The clock cycle count for "expf()" (reference) is 141645.
The clock cycle count for "expapprox()" is 74.
The clock cycle count for "expapprox4()" is 127 (127 / 4 = about 31 cycles per element).
// GCC spelling of the C99 `restrict` qualifier.
#define RESTRICT __restrict__

// Set to a non-zero value to skip the range check (clamping) in the exp()
// approximations. Skipping it makes evaluation faster, but the intermediate
// float is then converted to int without being clamped, which is undefined
// behavior in C when the value does not fit in an int.
// The #ifndef guard lets the flag be overridden from the compiler command line.
#ifndef FMATH_EXP_DISABLE_RANGE_CHECK
#define FMATH_EXP_DISABLE_RANGE_CHECK (0)
#endif
// Based on http://gallium.inria.fr/blog/fast-vectorizable-math-approx/
/*
 * Fast approximation of exp(val) for single-precision floats.
 *
 * The input is scaled by 2^23 / ln(2) and offset by 0x3F800000 (the bit
 * pattern of 1.0f), so that after conversion to int the exponent field holds
 * the integer part of val / ln(2) (biased) and the mantissa field holds its
 * fractional part. The result is 2^integer_part multiplied by a degree-4
 * polynomial evaluated on the mantissa reinterpreted as a float in [1, 2).
 *
 * Relative error bounded by 1e-5 for normalized outputs.
 *
 * With the range check enabled (FMATH_EXP_DISABLE_RANGE_CHECK == 0):
 *   - the scaled value is clamped to [0, 2139095040] before the int
 *     conversion, so very negative inputs return 0 and inputs that would
 *     overflow return +infinity;
 *   - NaN inputs give an invalid (non-NaN) output: every comparison with NaN
 *     is false, so the upper clamp value is selected.
 * With the range check disabled the scaled value is converted unclamped; the
 * caller must guarantee that it fits in an int.
 *
 * The type punning below assumes that int and float are both 32 bits wide
 * and that float uses the IEEE 754 binary32 layout.
 */
float expapprox(float val) {
  union { int i; float f; } xu, xu2;
  float val2, val4, b;
  int val4i;

  /* 12102203.1615614 = 2^23 / ln(2); 1065353216 = 0x3F800000 (1.0f). */
  val2 = 12102203.1615614f*val+1065353216.f;

#if FMATH_EXP_DISABLE_RANGE_CHECK
  val4 = val2;
#else
  {
    /* Workaround a lack of optimization in gcc */
    const float exp_cst1 = 2139095040.f; /* 0x7F800000: bit pattern of +inf */
    const float exp_cst2 = 0.f;
    float val3;

    val3 = val2 < exp_cst1 ? val2 : exp_cst1;
    val4 = val3 > exp_cst2 ? val3 : exp_cst2;
  }
#endif

  val4i = (int) val4;

  /* Exponent bits only: a power of two (2^floor(val / ln 2)). */
  xu.i = val4i & 0x7F800000;
  /* Mantissa bits with the exponent of 1.0f: a float in [1, 2). */
  xu2.i = (val4i & 0x7FFFFF) | 0x3F800000;
  b = xu2.f;

  /* Degree-4 polynomial in b, evaluated with Horner's scheme. */
  return
    xu.f * (0.509964287281036376953125f + b *
           (0.3120158612728118896484375f + b *
           (0.1666135489940643310546875f + b *
           (-2.12528370320796966552734375e-3f + b *
            1.3534179888665676116943359375e-2f))));
}
/*
 * Approximates exp() for four floats at once.
 *
 * Manual 4x code expansion of expapprox(): each step is written out for the
 * four lanes so that the independent operations sit next to each other.
 *
 * dst: output, receives four results in dst[0..3].
 * src: input, four values are read from src[0..3].
 * The two pointers are restrict-qualified, so the arrays must not overlap.
 *
 * With the range check enabled (FMATH_EXP_DISABLE_RANGE_CHECK == 0) the
 * scaled value is clamped to [0, 2139095040] before the int conversion, so
 * very negative inputs give 0 and inputs that would overflow give +infinity.
 * With the range check disabled the caller must guarantee that the scaled
 * value fits in an int.
 */
void expapprox4(float* __restrict__ dst, const float* __restrict__ src) {
  /* Polynomial coefficients, lowest degree first. */
  const float c0 = 0.509964287281036376953125f;
  const float c1 = 0.3120158612728118896484375f;
  const float c2 = 0.1666135489940643310546875f;
  const float c3 = -2.12528370320796966552734375e-3f;
  const float c4 = 1.3534179888665676116943359375e-2f;

  union { int i; float f; } xu_0, xu_1, xu_2, xu_3, xu2_0, xu2_1, xu2_2, xu2_3;
  float val2_0, val2_1, val2_2, val2_3;
  float val4_0, val4_1, val4_2, val4_3;
  float b0, b1, b2, b3;
  int val4i_0, val4i_1, val4i_2, val4i_3;

  /* 12102203.1615614 = 2^23 / ln(2); 1065353216 = 0x3F800000 (1.0f). */
  val2_0 = 12102203.1615614f*src[0]+1065353216.f;
  val2_1 = 12102203.1615614f*src[1]+1065353216.f;
  val2_2 = 12102203.1615614f*src[2]+1065353216.f;
  val2_3 = 12102203.1615614f*src[3]+1065353216.f;

#if FMATH_EXP_DISABLE_RANGE_CHECK
  val4_0 = val2_0;
  val4_1 = val2_1;
  val4_2 = val2_2;
  val4_3 = val2_3;
#else
  {
    /* Workaround a lack of optimization in gcc */
    const float exp_cst1 = 2139095040.f; /* 0x7F800000: bit pattern of +inf */
    const float exp_cst2 = 0.f;
    float val3_0, val3_1, val3_2, val3_3;

    val3_0 = val2_0 < exp_cst1 ? val2_0 : exp_cst1;
    val3_1 = val2_1 < exp_cst1 ? val2_1 : exp_cst1;
    val3_2 = val2_2 < exp_cst1 ? val2_2 : exp_cst1;
    val3_3 = val2_3 < exp_cst1 ? val2_3 : exp_cst1;

    val4_0 = val3_0 > exp_cst2 ? val3_0 : exp_cst2;
    val4_1 = val3_1 > exp_cst2 ? val3_1 : exp_cst2;
    val4_2 = val3_2 > exp_cst2 ? val3_2 : exp_cst2;
    val4_3 = val3_3 > exp_cst2 ? val3_3 : exp_cst2;
  }
#endif

  val4i_0 = (int)val4_0;
  val4i_1 = (int)val4_1;
  val4i_2 = (int)val4_2;
  val4i_3 = (int)val4_3;

  /* Exponent bits only: a power of two per lane. */
  xu_0.i = val4i_0 & 0x7F800000;
  xu_1.i = val4i_1 & 0x7F800000;
  xu_2.i = val4i_2 & 0x7F800000;
  xu_3.i = val4i_3 & 0x7F800000;

  /* Mantissa bits with the exponent of 1.0f: a float in [1, 2) per lane. */
  xu2_0.i = (val4i_0 & 0x7FFFFF) | 0x3F800000;
  xu2_1.i = (val4i_1 & 0x7FFFFF) | 0x3F800000;
  xu2_2.i = (val4i_2 & 0x7FFFFF) | 0x3F800000;
  xu2_3.i = (val4i_3 & 0x7FFFFF) | 0x3F800000;

  b0 = xu2_0.f;
  b1 = xu2_1.f;
  b2 = xu2_2.f;
  b3 = xu2_3.f;

  /* Degree-4 polynomial per lane (Horner's scheme), scaled by the power of two. */
  dst[0] = xu_0.f * (c0 + b0 * (c1 + b0 * (c2 + b0 * (c3 + b0 * c4))));
  dst[1] = xu_1.f * (c0 + b1 * (c1 + b1 * (c2 + b1 * (c3 + b1 * c4))));
  dst[2] = xu_2.f * (c0 + b2 * (c1 + b2 * (c2 + b2 * (c3 + b2 * c4))));
  dst[3] = xu_3.f * (c0 + b3 * (c1 + b3 * (c2 + b3 * (c3 + b3 * c4))));
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment