Skip to content

Instantly share code, notes, and snippets.

@syoyo
Created April 4, 2015 15:48
Show Gist options
  • Save syoyo/07dc264f4c5952a456be to your computer and use it in GitHub Desktop.
Save syoyo/07dc264f4c5952a456be to your computer and use it in GitHub Desktop.
exp() approximation in HPC-ACE(const value loading is not yet optimised)
#include <cstdio>
#include <cmath>
#include <emmintrin.h>
#define _mm_set1_pd(x) _mm_set_pd((x), (x))
// Based on http://www.chokkan.org/blog/archives/340
inline __m128d myexp(__m128d v)
{
__m128d one = _mm_set1_pd(1.0);
__m128d log2e = _mm_set1_pd(1.4426950408889634073599);
__m128d c1 = _mm_set1_pd(6.93145751953125E-1);
__m128d c2 = _mm_set1_pd(1.42860682030941723212E-6);
__m128d w5 = _mm_set1_pd(1.185268231308989403584147407056378360798378534739e-2);
__m128d w4 = _mm_set1_pd(3.87412011356070379615759057344100690905653320886699e-2);
__m128d w3 = _mm_set1_pd(0.16775408658617866431779970932853611481292418818223);
__m128d w2 = _mm_set1_pd(0.49981934577169208735732248650232562589934399402426);
__m128d w1 = _mm_set1_pd(1.00001092396453942157124178508842412412025643386873);
__m128d w0 = _mm_set1_pd(0.99999989311082729779536722205742989232069120354073);
__m128d k1d;
__m128d p1;
__m128d p1c;
__m128d p1x;
__m128d a1;
__m128d xmm0, xmm1;
__m128d x1;
x1 = v;
/* a = x / log2; */
xmm0 = log2e;
xmm1 = _mm_setzero_pd();
a1 = _mm_mul_pd(x1, xmm0);
/* k = (int)floor(a); p = (float)k; */
p1 = _mm_cmplt_pd(a1, xmm1);
xmm0 = one;
p1 = _mm_and_pd(p1, xmm0);
a1 = _mm_sub_pd(a1, p1);
k1d = _fjsp_dtox_v2r8(a1);
p1 = _fjsp_xtod_v2r8(k1d); // p1 = (double)((int)a1).
p1c = _mm_add_pd(p1, _mm_set1_pd(1023.0));
// We want to do
//
// p1x = ((int)a1 + 1023) << (20 + 32)
//
// but it looks like HPC-ACE does not support shift left amount larger than 32bit,
// thus multiply 2**32 here to mimic shit left by 32bit.
p1c = _mm_mul_pd(p1c, _mm_set1_pd(4294967296.0));
p1x = _fjsp_dtox_v2r8(p1c);
__m128i p1d = *(reinterpret_cast<__m128i*>(&p1x));
/* x -= p * log2; */
xmm0 = c1;
xmm1 = c2;
a1 = _mm_mul_pd(p1, xmm0);
x1 = _mm_sub_pd(x1, a1);
a1 = _mm_mul_pd(p1, xmm1);
x1 = _mm_sub_pd(x1, a1);
/* Compute e^x using a polynomial approximation. */
xmm0 = w5;
xmm1 = w4;
a1 = _mm_mul_pd(x1, xmm0);
a1 = _mm_add_pd(a1, xmm1);
xmm0 = w3;
xmm1 = w2;
a1 = _mm_mul_pd(a1, x1);
a1 = _mm_add_pd(a1, xmm0);
a1 = _mm_mul_pd(a1, x1);
a1 = _mm_add_pd(a1, xmm1);
xmm0 = w1;
xmm1 = w0;
a1 = _mm_mul_pd(a1, x1);
a1 = _mm_add_pd(a1, xmm0);
a1 = _mm_mul_pd(a1, x1);
a1 = _mm_add_pd(a1, xmm1);
/* p = 2^k; */
// reference in scalar code.
//double dtmp[2];
//_mm_store_pd(dtmp, p1c);
//typedef union {
// double d;
// unsigned short s[4];
//} ieee754;
//ieee754 u0, u1;
//u0.d = 0.0;
//u1.d = 0.0;
//// big endian: [0]. little endian: [3]
//u0.s[0] = (unsigned short)(((int)dtmp[0] << 4) & 0x7FF0);
//u1.s[0] = (unsigned short)(((int)dtmp[1] << 4) & 0x7FF0);
////p1 = _mm_set_pd(u0.d, u1.d);
//printf("ref: %d, %f\n", u0.s[0], u0.d);
// left shift 4 bit + 16 bit.
__m128i p1u = _mm_slli_epi64(p1d, 20);
// mas with 0x7FF0000000000000LL
__m128i mask = _mm_set_epi32(0x7FF00000, 0x00000000, 0x7FF00000, 0x00000000);
__m128i p2u = _mm_and_si128(p1u, mask);
p1 = *(reinterpret_cast<__m128d*>(&p2u));
/* a *= 2^k. */
a1 = _mm_mul_pd(a1, p1);
return a1;
}
int
main(int argc, char** argv)
{
__m128d a = _mm_set_pd((double)argc, (double)argc);
__m128d bb = myexp(a);
double ref = exp((double)argc);
double ret[2];
_mm_store_pd(ret, bb);
printf("[SIMD] double = %f, %f\n", ret[0], ret[1]);
printf("[REF] double = %f\n", ref);
printf("[DIFF] double = %f, %f\n", ref - ret[0], ref - ret[1]);
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment