Skip to content

Instantly share code, notes, and snippets.

@LebedevRI
Last active January 15, 2016 20:53
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save LebedevRI/087c6bda959e12768866 to your computer and use it in GitHub Desktop.
Save LebedevRI/087c6bda959e12768866 to your computer and use it in GitHub Desktop.
/*
* $ rm a.out; gcc -Wall -lm -O3 -fopenmp main.c && time ./a.out
*/
#include <stdio.h> // for printf
#include <string.h>
#include <math.h>
#include <xmmintrin.h> // for __m128, _mm_mul_ps, _mm_set1_ps, _mm_add_ps
#define CLAMP(x, low, high) (((x) > (high)) ? (high) : (((x) < (low)) ? (low) : (x)))
// <= 4 sec to check (1e8 - best)
#define _DATAPOINTS 1e8
#define STOPS ((size_t)(round(cbrtf(_DATAPOINTS))))
/*
* interpolate values from p1 and p2 into out.
*/
static float interpolate(const float min, // the smaller tuning
const float max, // the larger tuning (can't be == p1)
const size_t i)
{
// stupid linear interpolation.
// to be confirmed.
const double t = (((double)i) / ((double)(STOPS-1)));
return (1.0 - t) * min + t * max;
}
static inline float lab_f_inv_m(const float x)
{
const float epsilon = (0.20689655172413796f); // cbrtf(216.0f/24389.0f);
const float kappa_rcp_x16 = (16.0f * 27.0f / 24389.0f);
const float kappa_rcp_x116 = (116.0f * 27.0f / 24389.0f);
// x > epsilon
float res_big = x*x*x;
// x <= epsilon
float res_small = ((kappa_rcp_x116*x)-kappa_rcp_x16);
// blend results according to whether each component is > epsilon or not
return((x > epsilon) ? res_big : res_small);
}
static inline __m128 lab_f_inv_m_SSE(const __m128 x)
{
const __m128 epsilon = _mm_set1_ps(0.20689655172413796f); // cbrtf(216.0f/24389.0f);
const __m128 kappa_rcp_x16 = _mm_set1_ps(16.0f * 27.0f / 24389.0f);
const __m128 kappa_rcp_x116 = _mm_set1_ps(116.0f * 27.0f / 24389.0f);
// x > epsilon
const __m128 res_big = _mm_mul_ps(_mm_mul_ps(x, x), x);
// x <= epsilon
const __m128 res_small = _mm_sub_ps(_mm_mul_ps(kappa_rcp_x116, x), kappa_rcp_x16);
// blend results according to whether each component is > epsilon or not
const __m128 mask = _mm_cmpgt_ps(x, epsilon);
return _mm_or_ps(_mm_and_ps(mask, res_big), _mm_andnot_ps(mask, res_small));
}
static inline void dt_Lab_to_XYZ(const float *Lab, float *xyz)
{
const float d50[] = {0.9642f, 1.0f, 0.8249f, 0.0};
const float coef[] = {1.0f / 500.0f, 1.0f / 116.0f, -1.0f / 200.0f, 0.0f};
const float offset = (0.137931034f);
// last component ins shuffle taken from 1st component of Lab to make sure it is not nan, so it will become
// 0.0f in f
float _F[4];
_F[0] = Lab[1];
_F[1] = Lab[0];
_F[2] = Lab[2];
_F[3] = Lab[0];
for(int c = 0; c < 4; c++)
{
_F[c] *= coef[c];
}
float _F1[4];
_F1[0] = _F[1];
_F1[1] = 0.0f;
_F1[2] = _F[1];
_F1[3] = _F[1];
float _F2[4];
for(int c = 0; c < 4; c++)
{
_F2[c] = _F[c]+_F1[c]+offset;
}
for(int c = 0; c < 3; c++)
{
xyz[c] = d50[c] * lab_f_inv_m(_F2[c]);
}
}
static inline __m128 dt_Lab_to_XYZ_SSE(const __m128 Lab)
{
const __m128 d50 = _mm_set_ps(0.0f, 0.8249f, 1.0f, 0.9642f);
const __m128 coef = _mm_set_ps(0.0f, -1.0f / 200.0f, 1.0f / 116.0f, 1.0f / 500.0f);
const __m128 offset = _mm_set1_ps(0.137931034f);
// last component ins shuffle taken from 1st component of Lab to make sure it is not nan, so it will become
// 0.0f in f
const __m128 f = _mm_mul_ps(_mm_shuffle_ps(Lab, Lab, _MM_SHUFFLE(0, 2, 0, 1)), coef);
return _mm_mul_ps(
d50, lab_f_inv_m_SSE(_mm_add_ps(_mm_add_ps(f, _mm_shuffle_ps(f, f, _MM_SHUFFLE(1, 1, 3, 1))), offset)));
}
int main()
{
const float L_min = 0.0f;
const float L_max = 100.0f;
const float a_min = -128.0f;
const float a_max = 128.0f;
const float b_min = -128.0f;
const float b_max = 128.0f;
// const double mul = 1.0f;
const double mul = 2.0f;
printf("DATA POINTS = %.0e\n", _DATAPOINTS);
printf("STOPS = %zu\n\n", STOPS);
// go through all L
#ifdef _OPENMP
#pragma omp parallel for simd schedule(static) collapse(3)
#endif
for(size_t i = 0; i < (STOPS); i++)
{
// go through all a
for(size_t j = 0; j < STOPS; j++)
{
// go through all b
for(size_t k = 0; k < STOPS; k++)
{
const float L = interpolate(L_min*mul, L_max*mul, i);
const float a = interpolate(a_min*mul, a_max*mul, j);
const float b = interpolate(b_min*mul, b_max*mul, k);
const __m128 Lab = _mm_set_ps(0.0f, b, a, L);
const __m128 XYZ_SSE = dt_Lab_to_XYZ_SSE(Lab);
const float Lab_f[] = {L, a, b};
float xyz_f[3];
dt_Lab_to_XYZ(Lab_f, xyz_f);
const __m128 XYZ = _mm_set_ps(0.0f, xyz_f[2], xyz_f[1], xyz_f[0]);
if(memcmp(&XYZ, &XYZ_SSE, sizeof(__m128)) != 0) printf("%6.1f %6.1f %6.1f\n", L, a, b);
}
}
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment