Skip to content

Instantly share code, notes, and snippets.

@LebedevRI
Created January 16, 2016 19:38
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save LebedevRI/bc8740aa44d889f2f2ea to your computer and use it in GitHub Desktop.
Save LebedevRI/bc8740aa44d889f2f2ea to your computer and use it in GitHub Desktop.
/*
* $ rm a.out; gcc -Wall -lm -O3 -fopenmp main.c && time ./a.out
*/
#include <stdio.h> // for printf
#include <string.h>
#include <math.h>
#include <xmmintrin.h> // for __m128, _mm_mul_ps, _mm_set1_ps, _mm_add_ps
#define CLAMP(x, low, high) (((x) > (high)) ? (high) : (((x) < (low)) ? (low) : (x)))
// <= 4 sec to check (1e8 - best)
#define _DATAPOINTS 1e8
#define STOPS ((size_t)(round(cbrtf(_DATAPOINTS))))
static inline __m128 lab_f_m_SSE(const __m128 x);
/*
* interpolate values from p1 and p2 into out.
*/
static float interpolate(const float min, // the smaller tuning
const float max, // the larger tuning (can't be == p1)
const size_t i)
{
// stupid linear interpolation.
// to be confirmed.
const double t = (((double)i) / ((double)(STOPS-1)));
return (1.0 - t) * min + t * max;
}
static inline float cbrtf_fast(const float x)
{
union convert
{
float f;
int i;
} data, dataout;
data.f = x;
// approximate cbrtf(x):
dataout.i =
(
(
(int)
(
(
(float)
(
data.i
)
)
/
3.0f
)
)
+
709921077
);
return dataout.f;
}
static inline float lab_f_m(const float x)
{
const float epsilon = (216.0f / 24389.0f);
const float kappa = (24389.0f / 27.0f);
const float a = cbrtf_fast(x);
const float a3 = a*a*a;
// x > epsilon
const float res_big = (((a) * ((x+x)+a3)) / ((a3+a3)+x));
// x <= epsilon
const float res_small = (((kappa*x) + (16.0f)) / (116.0f));
// blend results according to whether each component is > epsilon or not
return ((x > epsilon) ? res_big : res_small);
}
static inline __m128 dt_XYZ_to_Lab(const __m128 XYZ)
{
const float d50_inv[] = { 1.0f / 0.9642f, 1.0f, 1.0f / 0.8249f, 0.0f };
const float coef[] = { 116.0f, 500.0f, 200.0f, 0.0f };
float farr[4];
_mm_store_ps(farr, XYZ);
for(int c = 0; c < 4; c++)
{
farr[c] = lab_f_m(d50_inv[c]*farr[c]);
}
// because d50_inv.z is 0.0f, lab_f(0) == 16/116, so Lab[0] = 116*f[0] - 16 equal to 116*(f[0]-f[3])
float sf1[4];
sf1[0] = farr[1];
sf1[1] = farr[0];
sf1[2] = farr[1];
sf1[3] = farr[3];
float sf2[4];
sf2[0] = farr[3];
sf2[1] = farr[1];
sf2[2] = farr[2];
sf2[3] = farr[3];
float fsub[4];
for(int c = 0; c < 4; c++)
{
fsub[c] = (sf1[c]-sf2[c])*coef[c];
}
return _mm_set_ps(0.0f, fsub[2], fsub[1], fsub[0]);
}
//------------------------------------------------------------------------------
static inline __m128 cbrtf_sse(const __m128 x)
{
// approximate cbrtf(x):
return _mm_castsi128_ps(
_mm_add_epi32(_mm_cvtps_epi32(_mm_div_ps(_mm_cvtepi32_ps(_mm_castps_si128(x)), _mm_set1_ps(3.0f))),
_mm_set1_epi32(709921077)));
}
static inline __m128 lab_f_m_SSE(const __m128 x)
{
const __m128 epsilon = _mm_set1_ps(216.0f / 24389.0f);
const __m128 kappa = _mm_set1_ps(24389.0f / 27.0f);
// calculate as if x > epsilon : result = cbrtf(x)
// approximate cbrtf(x):
const __m128 a = cbrtf_sse(x);
const __m128 a3 = _mm_mul_ps(_mm_mul_ps(a, a), a);
const __m128 res_big = _mm_div_ps(_mm_mul_ps(a,_mm_add_ps(a3,_mm_add_ps(x,x))),_mm_add_ps(_mm_add_ps(a3, a3), x));
// calculate as if x <= epsilon : result = (kappa*x+16)/116
const __m128 res_small
= _mm_div_ps(
_mm_add_ps(_mm_mul_ps(kappa, x), _mm_set1_ps(16.0f)), _mm_set1_ps(116.0f));
// blend results according to whether each component is > epsilon or not
const __m128 mask = _mm_cmpgt_ps(x, epsilon);
return _mm_or_ps(_mm_and_ps(mask, res_big), _mm_andnot_ps(mask, res_small));
}
static inline __m128 dt_XYZ_to_Lab_SSE(const __m128 XYZ)
{
const __m128 d50_inv = _mm_set_ps(0.0f, 1.0f / 0.8249f, 1.0f, 1.0f / 0.9642f);
const __m128 coef = _mm_set_ps(0.0f, 200.0f, 500.0f, 116.0f);
const __m128 f = lab_f_m_SSE(_mm_mul_ps(XYZ, d50_inv));
// because d50_inv.z is 0.0f, lab_f(0) == 16/116, so Lab[0] = 116*f[0] - 16 equal to 116*(f[0]-f[3])
return _mm_mul_ps(coef, _mm_sub_ps(_mm_shuffle_ps(f, f, _MM_SHUFFLE(3, 1, 0, 1)),
_mm_shuffle_ps(f, f, _MM_SHUFFLE(3, 2, 1, 3))));
}
//------------------------------------------------------------------------------
int main()
{
const float L_min = -1.0f;
const float L_max = 2.0f;
const float a_min = -1.0f;
const float a_max = 2.0f;
const float b_min = -1.0f;
const float b_max = 2.0f;
printf("DATA POINTS = %.0e\n", _DATAPOINTS);
printf("STOPS = %zu\n\n", STOPS);
// go through all L
#ifdef _OPENMP
#pragma omp parallel for simd schedule(static) collapse(3)
#endif
for(size_t i = 0; i < (STOPS); i++)
{
// go through all a
for(size_t j = 0; j < STOPS; j++)
{
// go through all b
for(size_t k = 0; k < STOPS; k++)
{
const float L = interpolate(L_min, L_max, i);
const float a = interpolate(a_min, a_max, j);
const float b = interpolate(b_min, b_max, k);
const __m128 Lab = _mm_set_ps(0.0f, b, a, L);
const __m128 XYZ_SSE = dt_XYZ_to_Lab_SSE(Lab);
float _xyz_SSE[4];
_mm_store_ps(_xyz_SSE, XYZ_SSE);
const __m128 XYZ = dt_XYZ_to_Lab(Lab);
float _xyz[4];
_mm_store_ps(_xyz, XYZ);
#define DIFF(x, y) (fmax((x), (y))-fmin((x), (y)))
if(memcmp(&XYZ, &XYZ_SSE, sizeof(__m128)) != 0)
{
printf("XYZ : %6.1f %6.1f %6.1f %6.1f\n", L, a, b, 0.0f);
printf("Lab (SSE): %6.1f %6.1f %6.1f %6.1f\n", _xyz_SSE[0], _xyz_SSE[1], _xyz_SSE[2], _xyz_SSE[3]);
printf("Lab : %6.1f %6.1f %6.1f %6.1f\n", _xyz[0], _xyz[1], _xyz[2], _xyz[3]);
printf("Lab diff : %6.e %6.e %6.e %6.e\n\n", DIFF(_xyz[0], _xyz_SSE[0]), DIFF(_xyz[1], _xyz_SSE[1]), DIFF(_xyz[2], _xyz_SSE[2]), DIFF(_xyz[3], _xyz_SSE[3]));
}
}
}
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment