Created
January 16, 2016 19:38
-
-
Save LebedevRI/bc8740aa44d889f2f2ea to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* | |
* $ rm a.out; gcc -Wall -lm -O3 -fopenmp main.c && time ./a.out | |
*/ | |
#include <stdio.h> // for printf | |
#include <string.h> | |
#include <math.h> | |
#include <xmmintrin.h> // for __m128, _mm_mul_ps, _mm_set1_ps, _mm_add_ps | |
#define CLAMP(x, low, high) (((x) > (high)) ? (high) : (((x) < (low)) ? (low) : (x))) | |
// <= 4 sec to check (1e8 - best) | |
#define _DATAPOINTS 1e8 | |
#define STOPS ((size_t)(round(cbrtf(_DATAPOINTS)))) | |
static inline __m128 lab_f_m_SSE(const __m128 x); | |
/* | |
* interpolate values from p1 and p2 into out. | |
*/ | |
static float interpolate(const float min, // the smaller tuning | |
const float max, // the larger tuning (can't be == p1) | |
const size_t i) | |
{ | |
// stupid linear interpolation. | |
// to be confirmed. | |
const double t = (((double)i) / ((double)(STOPS-1))); | |
return (1.0 - t) * min + t * max; | |
} | |
static inline float cbrtf_fast(const float x) | |
{ | |
union convert | |
{ | |
float f; | |
int i; | |
} data, dataout; | |
data.f = x; | |
// approximate cbrtf(x): | |
dataout.i = | |
( | |
( | |
(int) | |
( | |
( | |
(float) | |
( | |
data.i | |
) | |
) | |
/ | |
3.0f | |
) | |
) | |
+ | |
709921077 | |
); | |
return dataout.f; | |
} | |
static inline float lab_f_m(const float x) | |
{ | |
const float epsilon = (216.0f / 24389.0f); | |
const float kappa = (24389.0f / 27.0f); | |
const float a = cbrtf_fast(x); | |
const float a3 = a*a*a; | |
// x > epsilon | |
const float res_big = (((a) * ((x+x)+a3)) / ((a3+a3)+x)); | |
// x <= epsilon | |
const float res_small = (((kappa*x) + (16.0f)) / (116.0f)); | |
// blend results according to whether each component is > epsilon or not | |
return ((x > epsilon) ? res_big : res_small); | |
} | |
static inline __m128 dt_XYZ_to_Lab(const __m128 XYZ) | |
{ | |
const float d50_inv[] = { 1.0f / 0.9642f, 1.0f, 1.0f / 0.8249f, 0.0f }; | |
const float coef[] = { 116.0f, 500.0f, 200.0f, 0.0f }; | |
float farr[4]; | |
_mm_store_ps(farr, XYZ); | |
for(int c = 0; c < 4; c++) | |
{ | |
farr[c] = lab_f_m(d50_inv[c]*farr[c]); | |
} | |
// because d50_inv.z is 0.0f, lab_f(0) == 16/116, so Lab[0] = 116*f[0] - 16 equal to 116*(f[0]-f[3]) | |
float sf1[4]; | |
sf1[0] = farr[1]; | |
sf1[1] = farr[0]; | |
sf1[2] = farr[1]; | |
sf1[3] = farr[3]; | |
float sf2[4]; | |
sf2[0] = farr[3]; | |
sf2[1] = farr[1]; | |
sf2[2] = farr[2]; | |
sf2[3] = farr[3]; | |
float fsub[4]; | |
for(int c = 0; c < 4; c++) | |
{ | |
fsub[c] = (sf1[c]-sf2[c])*coef[c]; | |
} | |
return _mm_set_ps(0.0f, fsub[2], fsub[1], fsub[0]); | |
} | |
//------------------------------------------------------------------------------ | |
static inline __m128 cbrtf_sse(const __m128 x) | |
{ | |
// approximate cbrtf(x): | |
return _mm_castsi128_ps( | |
_mm_add_epi32(_mm_cvtps_epi32(_mm_div_ps(_mm_cvtepi32_ps(_mm_castps_si128(x)), _mm_set1_ps(3.0f))), | |
_mm_set1_epi32(709921077))); | |
} | |
static inline __m128 lab_f_m_SSE(const __m128 x) | |
{ | |
const __m128 epsilon = _mm_set1_ps(216.0f / 24389.0f); | |
const __m128 kappa = _mm_set1_ps(24389.0f / 27.0f); | |
// calculate as if x > epsilon : result = cbrtf(x) | |
// approximate cbrtf(x): | |
const __m128 a = cbrtf_sse(x); | |
const __m128 a3 = _mm_mul_ps(_mm_mul_ps(a, a), a); | |
const __m128 res_big = _mm_div_ps(_mm_mul_ps(a,_mm_add_ps(a3,_mm_add_ps(x,x))),_mm_add_ps(_mm_add_ps(a3, a3), x)); | |
// calculate as if x <= epsilon : result = (kappa*x+16)/116 | |
const __m128 res_small | |
= _mm_div_ps( | |
_mm_add_ps(_mm_mul_ps(kappa, x), _mm_set1_ps(16.0f)), _mm_set1_ps(116.0f)); | |
// blend results according to whether each component is > epsilon or not | |
const __m128 mask = _mm_cmpgt_ps(x, epsilon); | |
return _mm_or_ps(_mm_and_ps(mask, res_big), _mm_andnot_ps(mask, res_small)); | |
} | |
static inline __m128 dt_XYZ_to_Lab_SSE(const __m128 XYZ) | |
{ | |
const __m128 d50_inv = _mm_set_ps(0.0f, 1.0f / 0.8249f, 1.0f, 1.0f / 0.9642f); | |
const __m128 coef = _mm_set_ps(0.0f, 200.0f, 500.0f, 116.0f); | |
const __m128 f = lab_f_m_SSE(_mm_mul_ps(XYZ, d50_inv)); | |
// because d50_inv.z is 0.0f, lab_f(0) == 16/116, so Lab[0] = 116*f[0] - 16 equal to 116*(f[0]-f[3]) | |
return _mm_mul_ps(coef, _mm_sub_ps(_mm_shuffle_ps(f, f, _MM_SHUFFLE(3, 1, 0, 1)), | |
_mm_shuffle_ps(f, f, _MM_SHUFFLE(3, 2, 1, 3)))); | |
} | |
//------------------------------------------------------------------------------ | |
int main() | |
{ | |
const float L_min = -1.0f; | |
const float L_max = 2.0f; | |
const float a_min = -1.0f; | |
const float a_max = 2.0f; | |
const float b_min = -1.0f; | |
const float b_max = 2.0f; | |
printf("DATA POINTS = %.0e\n", _DATAPOINTS); | |
printf("STOPS = %zu\n\n", STOPS); | |
// go through all L | |
#ifdef _OPENMP | |
#pragma omp parallel for simd schedule(static) collapse(3) | |
#endif | |
for(size_t i = 0; i < (STOPS); i++) | |
{ | |
// go through all a | |
for(size_t j = 0; j < STOPS; j++) | |
{ | |
// go through all b | |
for(size_t k = 0; k < STOPS; k++) | |
{ | |
const float L = interpolate(L_min, L_max, i); | |
const float a = interpolate(a_min, a_max, j); | |
const float b = interpolate(b_min, b_max, k); | |
const __m128 Lab = _mm_set_ps(0.0f, b, a, L); | |
const __m128 XYZ_SSE = dt_XYZ_to_Lab_SSE(Lab); | |
float _xyz_SSE[4]; | |
_mm_store_ps(_xyz_SSE, XYZ_SSE); | |
const __m128 XYZ = dt_XYZ_to_Lab(Lab); | |
float _xyz[4]; | |
_mm_store_ps(_xyz, XYZ); | |
#define DIFF(x, y) (fmax((x), (y))-fmin((x), (y))) | |
if(memcmp(&XYZ, &XYZ_SSE, sizeof(__m128)) != 0) | |
{ | |
printf("XYZ : %6.1f %6.1f %6.1f %6.1f\n", L, a, b, 0.0f); | |
printf("Lab (SSE): %6.1f %6.1f %6.1f %6.1f\n", _xyz_SSE[0], _xyz_SSE[1], _xyz_SSE[2], _xyz_SSE[3]); | |
printf("Lab : %6.1f %6.1f %6.1f %6.1f\n", _xyz[0], _xyz[1], _xyz[2], _xyz[3]); | |
printf("Lab diff : %6.e %6.e %6.e %6.e\n\n", DIFF(_xyz[0], _xyz_SSE[0]), DIFF(_xyz[1], _xyz_SSE[1]), DIFF(_xyz[2], _xyz_SSE[2]), DIFF(_xyz[3], _xyz_SSE[3])); | |
} | |
} | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment