Created
January 16, 2016 13:35
-
-
Save LebedevRI/15f76f2368c984c7eeae to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* | |
* $ rm a.out; gcc -Wall -lm -O3 -fopenmp main.c && time ./a.out | |
*/ | |
#include <stdio.h> // for printf | |
#include <string.h> | |
#include <math.h> | |
#include <xmmintrin.h> // for __m128, _mm_mul_ps, _mm_set1_ps, _mm_add_ps | |
#define CLAMP(x, low, high) (((x) > (high)) ? (high) : (((x) < (low)) ? (low) : (x))) | |
#define _DATAPOINTS 1e9 | |
#define STOPS ((size_t)(round(_DATAPOINTS))) | |
/* | |
* interpolate values from p1 and p2 into out. | |
*/ | |
static float interpolate(const float min, // the smaller tuning | |
const float max, // the larger tuning (can't be == p1) | |
const size_t i) | |
{ | |
// stupid linear interpolation. | |
// to be confirmed. | |
const double t = (((double)i) / ((double)(STOPS-1))); | |
return (1.0 - t) * min + t * max; | |
} | |
static inline float cbrtf_fast(const float x) | |
{ | |
union convert | |
{ | |
float f; | |
int i; | |
} data, dataout; | |
data.f = x; | |
// approximate cbrtf(x): | |
dataout.i = | |
( | |
( | |
(int) | |
( | |
( | |
(float) | |
( | |
data.i | |
) | |
) | |
/ | |
3.0f | |
) | |
) | |
+ | |
709921077 | |
); | |
return dataout.f; | |
} | |
static inline __m128 cbrtf_sse(const __m128 x) | |
{ | |
// approximate cbrtf(x): | |
return _mm_castsi128_ps( | |
_mm_add_epi32(_mm_cvtps_epi32(_mm_div_ps(_mm_cvtepi32_ps(_mm_castps_si128(x)), _mm_set1_ps(3.0f))), | |
_mm_set1_epi32(709921077))); | |
} | |
int main() | |
{ | |
const float min = -1.0f; | |
const float max = 2.0f; | |
printf("STOPS = %zu\n\n", STOPS); | |
// go through all x | |
#ifdef _OPENMP | |
#pragma omp parallel for simd schedule(static) | |
#endif | |
for(size_t i = 0; i < (STOPS); i++) | |
{ | |
const float x = interpolate(min, max, i); | |
const __m128 X = _mm_set1_ps(x); | |
const __m128 c_SSE = cbrtf_sse(X); | |
float c_float[4]; | |
_mm_store_ps(c_float, c_SSE); | |
const float c = cbrtf_fast(x); | |
if(c != c_float[0]) printf("%f: %f != %f, diff = %e\n", x, c, c_float[0], fmax((double)c, (double)c_float[0]) - fmin((double)c, (double)c_float[0])); | |
//printf("\tref = %f, diff(sse) = %e, diff(plain) = %e\n", cbrtf(x), fmax((double)cbrtf(x), (double)c_float[0]) - fmin((double)cbrtf(x), (double)c_float[0]), fmax((double)cbrtf(x), (double)c) - fmin((double)cbrtf(x), (double)c)); | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment