Skip to content

Instantly share code, notes, and snippets.

View syoyo's full-sized avatar
💗
ray tracing

Syoyo Fujita syoyo

💗
ray tracing
View GitHub Profile
@syoyo
syoyo / gist:ef68a9c5b46b040e88db
Created June 12, 2015 04:24
exp() approximate function on Epiphany.
The clock cycle count for "expf()" (reference) is 141645.
The clock cycle count for "expapprox()" is 74.
The clock cycle count for "expapprox4()" is 127 (/4 = 31).
// GCC: RESTRICT expands to the __restrict__ keyword spelling, so code can
// write RESTRICT instead of the compiler-specific keyword directly.
#define RESTRICT __restrict__
// Disabling the range check makes evaluation of exp() faster.
@syoyo
syoyo / gist:9484d27be95e3303789b
Created June 8, 2015 14:15
expapprox() compiled for Parallella Epiphany
// code From http://gallium.inria.fr/blog/fast-vectorizable-math-approx/
$ e-gcc compile options: -O3 -mno-soft-cmpsf -mcmove -mfp-mode=truncate
// 73 clocks
00000f40 <_expapprox>:
f40: 200b 0002 mov r1,0x0
f44: 476b 0aa2 mov r2,0xaa3b
f48: 470b 14b2 movt r2,0x4b38
f4c: 2fcb 14e2 movt r1,0x4e7e
@syoyo
syoyo / gist:1516aa3e8e5489871fdd
Created May 8, 2015 06:59
double2 exp(double2) performance on SPARC/HPC-ACE
#include <cmath>
#include <emmintrin.h>
// Defines _mm_set1_pd(x) as _mm_set_pd with x in both argument positions.
// Note: x is expanded twice, so an argument with side effects runs twice.
#define _mm_set1_pd(x) _mm_set_pd((x), (x))
// Function attribute prefix requesting forced inlining.
// Fixed spelling: the keyword is __attribute__ (was "__attriabute__", which
// made every declaration using FORCE_INLINE a compile error).
// NOTE(review): GCC/Clang spell the forced-inlining attribute
// "always_inline"; confirm "force_inline" is recognized by the target compiler.
#define FORCE_INLINE __attribute__((force_inline))
// Compute exp(x) using trigonometric function.
inline __m128d exp_tri(__m128d v)
{
@syoyo
syoyo / gist:861c3d78ffc6ac5aabd0
Created May 7, 2015 10:03
disassemble of double fmath::exp(double) in SPARC/HPC-ACE
_ZN5fmath3expEd:
.LLFB2:
.L46:
.LSSN28:
/* 167 */ add %sp,-208,%sp
.L47:
.LSSN29:
@syoyo
syoyo / gist:78f6ddb6f99deb49f8ae
Created May 6, 2015 15:41
fmath performance(2015/05/06)
# K frontend
$ FCCpx -Kfast,nounroll,noswp exp_bench.cpp fmath.cpp
exp_bench.cpp:
fmath.cpp:
"fmath.cpp", line 30: warning: variable "fmath::local::LOG_TABLE_SIZE" was declared but never referenced
const size_t LOG_TABLE_SIZE = 12;
^
@syoyo
syoyo / gist:a8ba3b6fb1f2d51e4fff
Created May 4, 2015 16:15
1/x with -Kfast,nounroll,noswp
L21:
.LSSN2:
/* 12 */ frcpad %f0,%f32
/* 12 */ sethi %h44(.LR0),%g1
/* 12 */ or %g1,%m44(.LR0),%g1
/* 12 */ sllx %g1,12,%g1
PRE_ALIGN(128) struct __vec8_d {
__vec8_d() { }
FORCEINLINE __vec8_d(const double v0) {
u.v0 = _mm_set_pd(v0, v0);
u.v1 = _mm_set_pd(v0, v0);
u.v2 = _mm_set_pd(v0, v0);
u.v3 = _mm_set_pd(v0, v0);
}
@syoyo
syoyo / gist:d80cb6f9936aa5f290da
Created April 18, 2015 16:45
exp(x) using trigonometric function in HPC-ACE
#include <cstdio>
#include <cmath>
#include <emmintrin.h>
// Defines _mm_set1_pd(x) as _mm_set_pd with x in both argument positions.
// Note: x is expanded twice, so an argument with side effects runs twice.
#define _mm_set1_pd(x) _mm_set_pd((x), (x))
// Probably somewhat faster.
inline __m128d fastexp(__m128d v)
{
const __m128d inv_log2 = _mm_set1_pd(1.4426950408889634073599);
@syoyo
syoyo / gist:2f89e50edd74d4179d03
Created April 4, 2015 16:39
SIMD log2() approximate function in HPC-ACE(not yet optimised)
// Based on glsl-sse2
inline __m128d mylog2(const __m128d v) {
int ibuf[4];
__m128d o = _mm_set_pd(1.0, 1.0);
__m128i infVal = _mm_set_epi32(0x7FF00000, 0x00000000, 0x7FF00000, 0x00000000);
__m128d c = *(reinterpret_cast<__m128d*>(&infVal));
__m128d f = _mm_sub_pd(_mm_or_pd(_mm_andnot_pd(c, v),
_mm_and_pd(c, o)), o);
//const __m128i iVal = *(reinterpret_cast<const __m128i*>(&v));
//__m128i a = _mm_sub_epi32(_mm_srli_epi32(iVal, 20),
@syoyo
syoyo / gist:07dc264f4c5952a456be
Created April 4, 2015 15:48
exp() approximation in HPC-ACE(const value loading is not yet optimised)
#include <cstdio>
#include <cmath>
#include <emmintrin.h>
// Defines _mm_set1_pd(x) as _mm_set_pd with x in both argument positions.
// Note: x is expanded twice, so an argument with side effects runs twice.
#define _mm_set1_pd(x) _mm_set_pd((x), (x))
// Based on http://www.chokkan.org/blog/archives/340
inline __m128d myexp(__m128d v)
{