Created
April 4, 2015 15:48
-
-
Save syoyo/07dc264f4c5952a456be to your computer and use it in GitHub Desktop.
exp() approximation in HPC-ACE(const value loading is not yet optimised)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <cstdio> | |
#include <cmath> | |
#include <emmintrin.h> | |
#define _mm_set1_pd(x) _mm_set_pd((x), (x)) | |
// Based on http://www.chokkan.org/blog/archives/340 | |
inline __m128d myexp(__m128d v) | |
{ | |
__m128d one = _mm_set1_pd(1.0); | |
__m128d log2e = _mm_set1_pd(1.4426950408889634073599); | |
__m128d c1 = _mm_set1_pd(6.93145751953125E-1); | |
__m128d c2 = _mm_set1_pd(1.42860682030941723212E-6); | |
__m128d w5 = _mm_set1_pd(1.185268231308989403584147407056378360798378534739e-2); | |
__m128d w4 = _mm_set1_pd(3.87412011356070379615759057344100690905653320886699e-2); | |
__m128d w3 = _mm_set1_pd(0.16775408658617866431779970932853611481292418818223); | |
__m128d w2 = _mm_set1_pd(0.49981934577169208735732248650232562589934399402426); | |
__m128d w1 = _mm_set1_pd(1.00001092396453942157124178508842412412025643386873); | |
__m128d w0 = _mm_set1_pd(0.99999989311082729779536722205742989232069120354073); | |
__m128d k1d; | |
__m128d p1; | |
__m128d p1c; | |
__m128d p1x; | |
__m128d a1; | |
__m128d xmm0, xmm1; | |
__m128d x1; | |
x1 = v; | |
/* a = x / log2; */ | |
xmm0 = log2e; | |
xmm1 = _mm_setzero_pd(); | |
a1 = _mm_mul_pd(x1, xmm0); | |
/* k = (int)floor(a); p = (float)k; */ | |
p1 = _mm_cmplt_pd(a1, xmm1); | |
xmm0 = one; | |
p1 = _mm_and_pd(p1, xmm0); | |
a1 = _mm_sub_pd(a1, p1); | |
k1d = _fjsp_dtox_v2r8(a1); | |
p1 = _fjsp_xtod_v2r8(k1d); // p1 = (double)((int)a1). | |
p1c = _mm_add_pd(p1, _mm_set1_pd(1023.0)); | |
// We want to do | |
// | |
// p1x = ((int)a1 + 1023) << (20 + 32) | |
// | |
// but it looks like HPC-ACE does not support shift left amount larger than 32bit, | |
// thus multiply 2**32 here to mimic shit left by 32bit. | |
p1c = _mm_mul_pd(p1c, _mm_set1_pd(4294967296.0)); | |
p1x = _fjsp_dtox_v2r8(p1c); | |
__m128i p1d = *(reinterpret_cast<__m128i*>(&p1x)); | |
/* x -= p * log2; */ | |
xmm0 = c1; | |
xmm1 = c2; | |
a1 = _mm_mul_pd(p1, xmm0); | |
x1 = _mm_sub_pd(x1, a1); | |
a1 = _mm_mul_pd(p1, xmm1); | |
x1 = _mm_sub_pd(x1, a1); | |
/* Compute e^x using a polynomial approximation. */ | |
xmm0 = w5; | |
xmm1 = w4; | |
a1 = _mm_mul_pd(x1, xmm0); | |
a1 = _mm_add_pd(a1, xmm1); | |
xmm0 = w3; | |
xmm1 = w2; | |
a1 = _mm_mul_pd(a1, x1); | |
a1 = _mm_add_pd(a1, xmm0); | |
a1 = _mm_mul_pd(a1, x1); | |
a1 = _mm_add_pd(a1, xmm1); | |
xmm0 = w1; | |
xmm1 = w0; | |
a1 = _mm_mul_pd(a1, x1); | |
a1 = _mm_add_pd(a1, xmm0); | |
a1 = _mm_mul_pd(a1, x1); | |
a1 = _mm_add_pd(a1, xmm1); | |
/* p = 2^k; */ | |
// reference in scalar code. | |
//double dtmp[2]; | |
//_mm_store_pd(dtmp, p1c); | |
//typedef union { | |
// double d; | |
// unsigned short s[4]; | |
//} ieee754; | |
//ieee754 u0, u1; | |
//u0.d = 0.0; | |
//u1.d = 0.0; | |
//// big endian: [0]. little endian: [3] | |
//u0.s[0] = (unsigned short)(((int)dtmp[0] << 4) & 0x7FF0); | |
//u1.s[0] = (unsigned short)(((int)dtmp[1] << 4) & 0x7FF0); | |
////p1 = _mm_set_pd(u0.d, u1.d); | |
//printf("ref: %d, %f\n", u0.s[0], u0.d); | |
// left shift 4 bit + 16 bit. | |
__m128i p1u = _mm_slli_epi64(p1d, 20); | |
// mas with 0x7FF0000000000000LL | |
__m128i mask = _mm_set_epi32(0x7FF00000, 0x00000000, 0x7FF00000, 0x00000000); | |
__m128i p2u = _mm_and_si128(p1u, mask); | |
p1 = *(reinterpret_cast<__m128d*>(&p2u)); | |
/* a *= 2^k. */ | |
a1 = _mm_mul_pd(a1, p1); | |
return a1; | |
} | |
int | |
main(int argc, char** argv) | |
{ | |
__m128d a = _mm_set_pd((double)argc, (double)argc); | |
__m128d bb = myexp(a); | |
double ref = exp((double)argc); | |
double ret[2]; | |
_mm_store_pd(ret, bb); | |
printf("[SIMD] double = %f, %f\n", ret[0], ret[1]); | |
printf("[REF] double = %f\n", ref); | |
printf("[DIFF] double = %f, %f\n", ref - ret[0], ref - ret[1]); | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment