This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
The clock cycle count for "expf()" (reference) is 141645. | |
The clock cycle count for "expapprox()" is 74. | |
The clock cycle count for "expapprox4()" is 127 (/4 = 31). | |
// GCC | |
#define RESTRICT __restrict__ | |
// Disabling the range check makes the evaluation of exp() faster. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Code from http://gallium.inria.fr/blog/fast-vectorizable-math-approx/ | |
$ e-gcc compile options: -O3 -mno-soft-cmpsf -mcmove -mfp-mode=truncate | |
// 73 clocks | |
00000f40 <_expapprox>: | |
f40: 200b 0002 mov r1,0x0 | |
f44: 476b 0aa2 mov r2,0xaa3b | |
f48: 470b 14b2 movt r2,0x4b38 | |
f4c: 2fcb 14e2 movt r1,0x4e7e |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <cmath> | |
#include <emmintrin.h> | |
#define _mm_set1_pd(x) _mm_set_pd((x), (x)) | |
// Request unconditional inlining on GCC-compatible compilers.
// The attribute keyword is spelled __attribute__ and the attribute GCC
// recognizes for this purpose is always_inline (there is no force_inline).
#define FORCE_INLINE __attribute__((always_inline))
// Compute exp(x) using trigonometric function. | |
inline __m128d exp_tri(__m128d v) | |
{ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
_ZN5fmath3expEd: | |
.LLFB2: | |
.L46: | |
.LSSN28: | |
/* 167 */ add %sp,-208,%sp | |
.L47: | |
.LSSN29: |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# K frontend | |
$ FCCpx -Kfast,nounroll,noswp exp_bench.cpp fmath.cpp | |
exp_bench.cpp: | |
fmath.cpp: | |
"fmath.cpp", line 30: warning: variable "fmath::local::LOG_TABLE_SIZE" was declared but never referenced | |
const size_t LOG_TABLE_SIZE = 12; | |
^ | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
L21: | |
.LSSN2: | |
/* 12 */ frcpad %f0,%f32 | |
/* 12 */ sethi %h44(.LR0),%g1 | |
/* 12 */ or %g1,%m44(.LR0),%g1 | |
/* 12 */ sllx %g1,12,%g1 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
PRE_ALIGN(128) struct __vec8_d { | |
// Default constructor: empty body, performs no explicit initialization here.
__vec8_d() { } | |
// Broadcast constructor: builds one vector holding v0 in both double lanes
// and stores that same value into each of u.v0, u.v1, u.v2 and u.v3.
FORCEINLINE __vec8_d(const double v0) {
    const __m128d splat = _mm_set_pd(v0, v0);
    u.v0 = splat;
    u.v1 = splat;
    u.v2 = splat;
    u.v3 = splat;
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <cstdio> | |
#include <cmath> | |
#include <emmintrin.h> | |
#define _mm_set1_pd(x) _mm_set_pd((x), (x)) | |
// Probably somewhat faster. | |
inline __m128d fastexp(__m128d v) | |
{ | |
const __m128d inv_log2 = _mm_set1_pd(1.4426950408889634073599); |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Based on glsl-sse2 | |
inline __m128d mylog2(const __m128d v) { | |
int ibuf[4]; | |
__m128d o = _mm_set_pd(1.0, 1.0); | |
__m128i infVal = _mm_set_epi32(0x7FF00000, 0x00000000, 0x7FF00000, 0x00000000); | |
__m128d c = *(reinterpret_cast<__m128d*>(&infVal)); | |
__m128d f = _mm_sub_pd(_mm_or_pd(_mm_andnot_pd(c, v), | |
_mm_and_pd(c, o)), o); | |
//const __m128i iVal = *(reinterpret_cast<const __m128i*>(&v)); | |
//__m128i a = _mm_sub_epi32(_mm_srli_epi32(iVal, 20), |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <cstdio> | |
#include <cmath> | |
#include <emmintrin.h> | |
#define _mm_set1_pd(x) _mm_set_pd((x), (x)) | |
// Based on http://www.chokkan.org/blog/archives/340 | |
inline __m128d myexp(__m128d v) | |
{ |