Syoyo Fujita syoyo

## gist:ef68a9c5b46b040e88db
The clock cycle count for "expf()" (reference) is 141645.

The clock cycle count for "expapprox()" is 74.

The clock cycle count for "expapprox4()" is 127 (/4 = 31).

// GCC: RESTRICT expands to the compiler's __restrict__ keyword.
#define RESTRICT __restrict__

// Disabling the range check makes evaluation of exp() faster.

## gist:9484d27be95e3303789b
// Code from http://gallium.inria.fr/blog/fast-vectorizable-math-approx/

$ e-gcc compile options: -O3 -mno-soft-cmpsf -mcmove -mfp-mode=truncate

// 73 clocks
00000f40 <_expapprox>:
     f40:       200b 0002       mov r1,0x0
     f44:       476b 0aa2       mov r2,0xaa3b
     f48:       470b 14b2       movt r2,0x4b38
     f4c:       2fcb 14e2       movt r1,0x4e7e

## gist:1516aa3e8e5489871fdd
#include <cmath>
#include <emmintrin.h>

// Broadcast helper: expands to _mm_set_pd with the same value in both arguments.
// NOTE(review): presumably a fallback for a toolchain whose <emmintrin.h> lacks
// _mm_set1_pd -- confirm. The argument is expanded twice, so pass expressions
// without side effects.
#define _mm_set1_pd(x) _mm_set_pd((x), (x))

// Function annotation asking GCC/Clang to inline regardless of the optimizer's
// heuristics. The keyword is `__attribute__` and the supported attribute name
// is `always_inline` (`force_inline` is not a GCC/Clang attribute).
#define FORCE_INLINE    __attribute__((always_inline))

// Compute exp(x) using trigonometric function.
inline __m128d exp_tri(__m128d v)
{

## gist:861c3d78ffc6ac5aabd0
_ZN5fmath3expEd:
.LLFB2:
.L46:
.LSSN28:

/*    167 */	add	%sp,-208,%sp


.L47:
.LSSN29:

## gist:78f6ddb6f99deb49f8ae

# K frontend
$ FCCpx -Kfast,nounroll,noswp exp_bench.cpp fmath.cpp
exp_bench.cpp:
fmath.cpp:
"fmath.cpp", line 30: warning: variable "fmath::local::LOG_TABLE_SIZE" was declared but never referenced
  const size_t LOG_TABLE_SIZE = 12;
               ^


## gist:a8ba3b6fb1f2d51e4fff
L21:
.LSSN2:

/*     12 */    frcpad  %f0,%f32

/*     12 */    sethi   %h44(.LR0),%g1

/*     12 */    or      %g1,%m44(.LR0),%g1

/*     12 */    sllx    %g1,12,%g1

## gist:d9fb3092860614861efb
PRE_ALIGN(128) struct __vec8_d {
    __vec8_d() { }

    FORCEINLINE __vec8_d(const double v0) {
        u.v0 = _mm_set_pd(v0, v0);
        u.v1 = _mm_set_pd(v0, v0);
        u.v2 = _mm_set_pd(v0, v0);
        u.v3 = _mm_set_pd(v0, v0);
    }

## gist:d80cb6f9936aa5f290da
#include <cstdio>
#include <cmath>
#include <emmintrin.h>

// Broadcast helper: expands to _mm_set_pd with the same value in both arguments.
// NOTE(review): presumably a fallback for a toolchain whose <emmintrin.h> lacks
// _mm_set1_pd -- confirm. The argument is expanded twice, so pass expressions
// without side effects.
#define _mm_set1_pd(x) _mm_set_pd((x), (x))

// Probably somewhat faster.
inline __m128d fastexp(__m128d v)
{
  const __m128d inv_log2 = _mm_set1_pd(1.4426950408889634073599);

## gist:2f89e50edd74d4179d03
// Based on glsl-sse2
inline __m128d mylog2(const __m128d v) {
    int ibuf[4];
    __m128d o = _mm_set_pd(1.0, 1.0);
    __m128i infVal = _mm_set_epi32(0x7FF00000, 0x00000000, 0x7FF00000, 0x00000000);
    __m128d c = *(reinterpret_cast<__m128d*>(&infVal));
    __m128d f = _mm_sub_pd(_mm_or_pd(_mm_andnot_pd(c, v),
                                     _mm_and_pd(c, o)), o);
    //const __m128i iVal = *(reinterpret_cast<const __m128i*>(&v));
    //__m128i a = _mm_sub_epi32(_mm_srli_epi32(iVal, 20),

## gist:07dc264f4c5952a456be
#include <cstdio>
#include <cmath>
#include <emmintrin.h>

// Broadcast helper: expands to _mm_set_pd with the same value in both arguments.
// NOTE(review): presumably a fallback for a toolchain whose <emmintrin.h> lacks
// _mm_set1_pd -- confirm. The argument is expanded twice, so pass expressions
// without side effects.
#define _mm_set1_pd(x) _mm_set_pd((x), (x))


// Based on http://www.chokkan.org/blog/archives/340
inline __m128d myexp(__m128d v)
{
	The clock cycle count for "expf()" (reference) is 141645.

	The clock cycle count for "expapprox()" is 74.

	The clock cycle count for "expapprox4()" is 127 (/4 = 31).

	// GCC
	#define RESTRICT __restrict__

	// Disabling the range check makes evaluation of exp() faster.
	// Code from http://gallium.inria.fr/blog/fast-vectorizable-math-approx/

	$ e-gcc compile options: -O3 -mno-soft-cmpsf -mcmove -mfp-mode=truncate

	// 73 clocks
	00000f40 <_expapprox>:
	f40: 200b 0002 mov r1,0x0
	f44: 476b 0aa2 mov r2,0xaa3b
	f48: 470b 14b2 movt r2,0x4b38
	f4c: 2fcb 14e2 movt r1,0x4e7e
	#include <cmath>
	#include <emmintrin.h>

	#define _mm_set1_pd(x) _mm_set_pd((x), (x))

	// Function annotation asking GCC/Clang to inline regardless of the optimizer's
	// heuristics. The keyword is `__attribute__` and the supported attribute name
	// is `always_inline` (`force_inline` is not a GCC/Clang attribute).
	#define FORCE_INLINE __attribute__((always_inline))

	// Compute exp(x) using trigonometric function.
	inline __m128d exp_tri(__m128d v)
	{
	_ZN5fmath3expEd:
	.LLFB2:
	.L46:
	.LSSN28:

	/* 167 */ add %sp,-208,%sp


	.L47:
	.LSSN29:

	# K frontend
	$ FCCpx -Kfast,nounroll,noswp exp_bench.cpp fmath.cpp
	exp_bench.cpp:
	fmath.cpp:
	"fmath.cpp", line 30: warning: variable "fmath::local::LOG_TABLE_SIZE" was declared but never referenced
	const size_t LOG_TABLE_SIZE = 12;
	^
	L21:
	.LSSN2:

	/* 12 */ frcpad %f0,%f32

	/* 12 */ sethi %h44(.LR0),%g1

	/* 12 */ or %g1,%m44(.LR0),%g1

	/* 12 */ sllx %g1,12,%g1
	PRE_ALIGN(128) struct __vec8_d {
	__vec8_d() { }

	FORCEINLINE __vec8_d(const double v0) {
	u.v0 = _mm_set_pd(v0, v0);
	u.v1 = _mm_set_pd(v0, v0);
	u.v2 = _mm_set_pd(v0, v0);
	u.v3 = _mm_set_pd(v0, v0);
	}
	#include <cstdio>
	#include <cmath>
	#include <emmintrin.h>

	#define _mm_set1_pd(x) _mm_set_pd((x), (x))

	// Probably somewhat faster.
	inline __m128d fastexp(__m128d v)
	{
	const __m128d inv_log2 = _mm_set1_pd(1.4426950408889634073599);
	// Based on glsl-sse2
	inline __m128d mylog2(const __m128d v) {
	int ibuf[4];
	__m128d o = _mm_set_pd(1.0, 1.0);
	__m128i infVal = _mm_set_epi32(0x7FF00000, 0x00000000, 0x7FF00000, 0x00000000);
	__m128d c = (reinterpret_cast<__m128d>(&infVal));
	__m128d f = _mm_sub_pd(_mm_or_pd(_mm_andnot_pd(c, v),
	_mm_and_pd(c, o)), o);
	//const __m128i iVal = (reinterpret_cast<const __m128i>(&v));
	//__m128i a = _mm_sub_epi32(_mm_srli_epi32(iVal, 20),