LebedevRI/colorout-math.c

## colorout-math.c
/*
 * $ rm a.out; gcc -Wall -lm -O3 -fopenmp main.c && time ./a.out
 */

#include <stdio.h>      // for printf
#include <string.h>
#include <math.h>
#include <xmmintrin.h>  // for __m128, _mm_mul_ps, _mm_set1_ps, _mm_add_ps

#define CLAMP(x, low, high)  (((x) > (high)) ? (high) : (((x) < (low)) ? (low) : (x)))

// <= 4 sec to check   (1e8 - best)
#define _DATAPOINTS 1e8
#define STOPS ((size_t)(round(cbrtf(_DATAPOINTS))))

/*
 * interpolate values from p1 and p2 into out.
 */
static float interpolate(const float min, // the smaller tuning
                         const float max, // the larger tuning (can't be == p1)
                         const size_t i)
{
  // stupid linear interpolation.
  // to be confirmed.
  const double t = (((double)i) / ((double)(STOPS-1)));
	return (1.0 - t) * min + t * max;
}

static inline float lab_f_inv_m(const float x)
{
  const float epsilon = (0.20689655172413796f); // cbrtf(216.0f/24389.0f);
  const float kappa_rcp_x16 = (16.0f * 27.0f / 24389.0f);
  const float kappa_rcp_x116 = (116.0f * 27.0f / 24389.0f);

  // x > epsilon
  float res_big = x*x*x;

	// x <= epsilon
  float res_small = ((kappa_rcp_x116*x)-kappa_rcp_x16);

  // blend results according to whether each component is > epsilon or not
	return((x > epsilon) ? res_big : res_small);
}

static inline __m128 lab_f_inv_m_SSE(const __m128 x)
{
  const __m128 epsilon = _mm_set1_ps(0.20689655172413796f); // cbrtf(216.0f/24389.0f);
  const __m128 kappa_rcp_x16 = _mm_set1_ps(16.0f * 27.0f / 24389.0f);
  const __m128 kappa_rcp_x116 = _mm_set1_ps(116.0f * 27.0f / 24389.0f);

  // x > epsilon
  const __m128 res_big = _mm_mul_ps(_mm_mul_ps(x, x), x);
  // x <= epsilon
  const __m128 res_small = _mm_sub_ps(_mm_mul_ps(kappa_rcp_x116, x), kappa_rcp_x16);

  // blend results according to whether each component is > epsilon or not
  const __m128 mask = _mm_cmpgt_ps(x, epsilon);
  return _mm_or_ps(_mm_and_ps(mask, res_big), _mm_andnot_ps(mask, res_small));
}

static inline void dt_Lab_to_XYZ(const float *Lab, float *xyz)
{
  const float d50[] = {0.9642f, 1.0f, 0.8249f, 0.0};
  const float coef[] = {1.0f / 500.0f, 1.0f / 116.0f, -1.0f / 200.0f, 0.0f};
  const float offset = (0.137931034f);

  // last component ins shuffle taken from 1st component of Lab to make sure it is not nan, so it will become
  // 0.0f in f
  float _F[4];
	_F[0] = Lab[1];
	_F[1] = Lab[0];
	_F[2] = Lab[2];
	_F[3] = Lab[0];

	for(int c = 0; c < 4; c++)
	{
		_F[c] *= coef[c];
	}

	float _F1[4];
	_F1[0] = _F[1];
	_F1[1] = 0.0f;
	_F1[2] = _F[1];
	_F1[3] = _F[1];

	float _F2[4];
	for(int c = 0; c < 4; c++)
	{
		_F2[c] = _F[c]+_F1[c]+offset;
	}

	for(int c = 0; c < 3; c++)
	{
		xyz[c] = d50[c] * lab_f_inv_m(_F2[c]);
	}
}

static inline __m128 dt_Lab_to_XYZ_SSE(const __m128 Lab)
{
  const __m128 d50 = _mm_set_ps(0.0f, 0.8249f, 1.0f, 0.9642f);
  const __m128 coef = _mm_set_ps(0.0f, -1.0f / 200.0f, 1.0f / 116.0f, 1.0f / 500.0f);
  const __m128 offset = _mm_set1_ps(0.137931034f);

  // last component ins shuffle taken from 1st component of Lab to make sure it is not nan, so it will become
  // 0.0f in f
  const __m128 f = _mm_mul_ps(_mm_shuffle_ps(Lab, Lab, _MM_SHUFFLE(0, 2, 0, 1)), coef);

  return _mm_mul_ps(
      d50, lab_f_inv_m_SSE(_mm_add_ps(_mm_add_ps(f, _mm_shuffle_ps(f, f, _MM_SHUFFLE(1, 1, 3, 1))), offset)));
}

int main()
{
	const float L_min = 0.0f;
	const float L_max = 100.0f;
	const float a_min = -128.0f;
	const float a_max = 128.0f;
	const float b_min = -128.0f;
	const float b_max = 128.0f;

	// const double mul = 1.0f;
	const double mul = 2.0f;

	printf("DATA POINTS = %.0e\n", _DATAPOINTS);
	printf("STOPS = %zu\n\n", STOPS);

	// go through all L
#ifdef _OPENMP
#pragma omp parallel for simd schedule(static) collapse(3)
#endif
	for(size_t i = 0; i < (STOPS); i++)
	{
		// go through all a
		for(size_t j = 0; j < STOPS; j++)
		{
			// go through all b
			for(size_t k = 0; k < STOPS; k++)
			{
				const float L = interpolate(L_min*mul, L_max*mul, i);
				const float a = interpolate(a_min*mul, a_max*mul, j);
				const float b = interpolate(b_min*mul, b_max*mul, k);

				const __m128 Lab = _mm_set_ps(0.0f, b, a, L);
				const __m128 XYZ_SSE = dt_Lab_to_XYZ_SSE(Lab);

				const float Lab_f[] = {L, a, b};
				float xyz_f[3];
				dt_Lab_to_XYZ(Lab_f, xyz_f);
				const __m128 XYZ = _mm_set_ps(0.0f, xyz_f[2], xyz_f[1], xyz_f[0]);


				if(memcmp(&XYZ, &XYZ_SSE, sizeof(__m128)) != 0) printf("%6.1f %6.1f %6.1f\n", L, a, b);
			}
		}
	}
}
	/*
	* $ rm a.out; gcc -Wall -lm -O3 -fopenmp main.c && time ./a.out
	*/

	#include <stdio.h> // for printf
	#include <string.h>
	#include <math.h>
	#include <xmmintrin.h> // for __m128, _mm_mul_ps, _mm_set1_ps, _mm_add_ps

	#define CLAMP(x, low, high) (((x) > (high)) ? (high) : (((x) < (low)) ? (low) : (x)))

	// <= 4 sec to check (1e8 - best)
	#define _DATAPOINTS 1e8
	#define STOPS ((size_t)(round(cbrtf(_DATAPOINTS))))

	/*
	* interpolate values from p1 and p2 into out.
	*/
	static float interpolate(const float min, // the smaller tuning
	const float max, // the larger tuning (can't be == p1)
	const size_t i)
	{
	// stupid linear interpolation.
	// to be confirmed.
	const double t = (((double)i) / ((double)(STOPS-1)));
	return (1.0 - t) * min + t * max;
	}

	static inline float lab_f_inv_m(const float x)
	{
	const float epsilon = (0.20689655172413796f); // cbrtf(216.0f/24389.0f);
	const float kappa_rcp_x16 = (16.0f * 27.0f / 24389.0f);
	const float kappa_rcp_x116 = (116.0f * 27.0f / 24389.0f);

	// x > epsilon
	float res_big = xxx;

	// x <= epsilon
	float res_small = ((kappa_rcp_x116*x)-kappa_rcp_x16);

	// blend results according to whether each component is > epsilon or not
	return((x > epsilon) ? res_big : res_small);
	}

	static inline __m128 lab_f_inv_m_SSE(const __m128 x)
	{
	const __m128 epsilon = _mm_set1_ps(0.20689655172413796f); // cbrtf(216.0f/24389.0f);
	const __m128 kappa_rcp_x16 = _mm_set1_ps(16.0f * 27.0f / 24389.0f);
	const __m128 kappa_rcp_x116 = _mm_set1_ps(116.0f * 27.0f / 24389.0f);

	// x > epsilon
	const __m128 res_big = _mm_mul_ps(_mm_mul_ps(x, x), x);
	// x <= epsilon
	const __m128 res_small = _mm_sub_ps(_mm_mul_ps(kappa_rcp_x116, x), kappa_rcp_x16);

	// blend results according to whether each component is > epsilon or not
	const __m128 mask = _mm_cmpgt_ps(x, epsilon);
	return _mm_or_ps(_mm_and_ps(mask, res_big), _mm_andnot_ps(mask, res_small));
	}

	static inline void dt_Lab_to_XYZ(const float Lab, float xyz)
	{
	const float d50[] = {0.9642f, 1.0f, 0.8249f, 0.0};
	const float coef[] = {1.0f / 500.0f, 1.0f / 116.0f, -1.0f / 200.0f, 0.0f};
	const float offset = (0.137931034f);

	// last component ins shuffle taken from 1st component of Lab to make sure it is not nan, so it will become
	// 0.0f in f
	float _F[4];
	_F[0] = Lab[1];
	_F[1] = Lab[0];
	_F[2] = Lab[2];
	_F[3] = Lab[0];

	for(int c = 0; c < 4; c++)
	{
	_F[c] *= coef[c];
	}

	float _F1[4];
	_F1[0] = _F[1];
	_F1[1] = 0.0f;
	_F1[2] = _F[1];
	_F1[3] = _F[1];

	float _F2[4];
	for(int c = 0; c < 4; c++)
	{
	_F2[c] = _F[c]+_F1[c]+offset;
	}

	for(int c = 0; c < 3; c++)
	{
	xyz[c] = d50[c] * lab_f_inv_m(_F2[c]);
	}
	}

	static inline __m128 dt_Lab_to_XYZ_SSE(const __m128 Lab)
	{
	const __m128 d50 = _mm_set_ps(0.0f, 0.8249f, 1.0f, 0.9642f);
	const __m128 coef = _mm_set_ps(0.0f, -1.0f / 200.0f, 1.0f / 116.0f, 1.0f / 500.0f);
	const __m128 offset = _mm_set1_ps(0.137931034f);

	// last component ins shuffle taken from 1st component of Lab to make sure it is not nan, so it will become
	// 0.0f in f
	const __m128 f = _mm_mul_ps(_mm_shuffle_ps(Lab, Lab, _MM_SHUFFLE(0, 2, 0, 1)), coef);

	return _mm_mul_ps(
	d50, lab_f_inv_m_SSE(_mm_add_ps(_mm_add_ps(f, _mm_shuffle_ps(f, f, _MM_SHUFFLE(1, 1, 3, 1))), offset)));
	}

	int main()
	{
	const float L_min = 0.0f;
	const float L_max = 100.0f;
	const float a_min = -128.0f;
	const float a_max = 128.0f;
	const float b_min = -128.0f;
	const float b_max = 128.0f;

	// const double mul = 1.0f;
	const double mul = 2.0f;

	printf("DATA POINTS = %.0e\n", _DATAPOINTS);
	printf("STOPS = %zu\n\n", STOPS);

	// go through all L
	#ifdef _OPENMP
	#pragma omp parallel for simd schedule(static) collapse(3)
	#endif
	for(size_t i = 0; i < (STOPS); i++)
	{
	// go through all a
	for(size_t j = 0; j < STOPS; j++)
	{
	// go through all b
	for(size_t k = 0; k < STOPS; k++)
	{
	const float L = interpolate(L_minmul, L_maxmul, i);
	const float a = interpolate(a_minmul, a_maxmul, j);
	const float b = interpolate(b_minmul, b_maxmul, k);

	const __m128 Lab = _mm_set_ps(0.0f, b, a, L);
	const __m128 XYZ_SSE = dt_Lab_to_XYZ_SSE(Lab);

	const float Lab_f[] = {L, a, b};
	float xyz_f[3];
	dt_Lab_to_XYZ(Lab_f, xyz_f);
	const __m128 XYZ = _mm_set_ps(0.0f, xyz_f[2], xyz_f[1], xyz_f[0]);


	if(memcmp(&XYZ, &XYZ_SSE, sizeof(__m128)) != 0) printf("%6.1f %6.1f %6.1f\n", L, a, b);
	}
	}
	}
	}