LebedevRI/colorin-math.c

## colorin-math.c
/*
 * $ rm a.out; gcc -Wall -lm -O3 -fopenmp main.c && time ./a.out
 */

#include <stdio.h>      // for printf
#include <string.h>
#include <math.h>
#include <xmmintrin.h>  // for __m128, _mm_mul_ps, _mm_set1_ps, _mm_add_ps

#define CLAMP(x, low, high)  (((x) > (high)) ? (high) : (((x) < (low)) ? (low) : (x)))

// <= 4 sec to check   (1e8 - best)
#define _DATAPOINTS 1e8
#define STOPS ((size_t)(round(cbrtf(_DATAPOINTS))))

static inline __m128 lab_f_m_SSE(const __m128 x);

/*
 * interpolate values from p1 and p2 into out.
 */
static float interpolate(const float min, // the smaller tuning
                         const float max, // the larger tuning (can't be == p1)
                         const size_t i)
{
  // stupid linear interpolation.
  // to be confirmed.
  const double t = (((double)i) / ((double)(STOPS-1)));
	return (1.0 - t) * min + t * max;
}

static inline float cbrtf_fast(const float x)
{
	union convert
  {
    float f;
    int i;
  } data, dataout;

	data.f = x;

  // approximate cbrtf(x):
  dataout.i =
					(
						(
							(int)
							(
								(
									(float)
									(
										data.i
									)
								)
								/
								3.0f
							)
						)
						+
						709921077
					);

	return dataout.f;
}

static inline float lab_f_m(const float x)
{
  const float epsilon = (216.0f / 24389.0f);
  const float kappa = (24389.0f / 27.0f);

	const float a = cbrtf_fast(x);
	const float a3 = a*a*a;

  // x > epsilon
  const float res_big = (((a) * ((x+x)+a3)) / ((a3+a3)+x));

	// x <= epsilon
  const float res_small = (((kappa*x) + (16.0f)) / (116.0f));

  // blend results according to whether each component is > epsilon or not
	return ((x > epsilon) ? res_big : res_small);
}

static inline __m128 dt_XYZ_to_Lab(const __m128 XYZ)
{
  const float d50_inv[] = { 1.0f / 0.9642f, 1.0f, 1.0f / 0.8249f, 0.0f };
  const float coef[] = { 116.0f, 500.0f, 200.0f, 0.0f };

	float farr[4];
	_mm_store_ps(farr, XYZ);

  for(int c = 0; c < 4; c++)
  {
    farr[c] = lab_f_m(d50_inv[c]*farr[c]);
  }

  // because d50_inv.z is 0.0f, lab_f(0) == 16/116, so Lab[0] = 116*f[0] - 16 equal to 116*(f[0]-f[3])

  float sf1[4];
	sf1[0] = farr[1];
	sf1[1] = farr[0];
	sf1[2] = farr[1];
	sf1[3] = farr[3];

	float sf2[4];
	sf2[0] = farr[3];
	sf2[1] = farr[1];
	sf2[2] = farr[2];
	sf2[3] = farr[3];

	float fsub[4];
  for(int c = 0; c < 4; c++)
  {
    fsub[c] = (sf1[c]-sf2[c])*coef[c];
  }

	return _mm_set_ps(0.0f, fsub[2], fsub[1], fsub[0]);
}

//------------------------------------------------------------------------------
static inline __m128 cbrtf_sse(const __m128 x)
{
  // approximate cbrtf(x):
  return _mm_castsi128_ps(
      _mm_add_epi32(_mm_cvtps_epi32(_mm_div_ps(_mm_cvtepi32_ps(_mm_castps_si128(x)), _mm_set1_ps(3.0f))),
                    _mm_set1_epi32(709921077)));
}

static inline __m128 lab_f_m_SSE(const __m128 x)
{
  const __m128 epsilon = _mm_set1_ps(216.0f / 24389.0f);
  const __m128 kappa = _mm_set1_ps(24389.0f / 27.0f);

  // calculate as if x > epsilon : result = cbrtf(x)
  // approximate cbrtf(x):
  const __m128 a = cbrtf_sse(x);

  const __m128 a3 = _mm_mul_ps(_mm_mul_ps(a, a), a);
  const __m128 res_big = _mm_div_ps(_mm_mul_ps(a,_mm_add_ps(a3,_mm_add_ps(x,x))),_mm_add_ps(_mm_add_ps(a3, a3), x));

  // calculate as if x <= epsilon : result = (kappa*x+16)/116
  const __m128 res_small
      = _mm_div_ps(
				_mm_add_ps(_mm_mul_ps(kappa, x), _mm_set1_ps(16.0f)), _mm_set1_ps(116.0f));

  // blend results according to whether each component is > epsilon or not
  const __m128 mask = _mm_cmpgt_ps(x, epsilon);
  return _mm_or_ps(_mm_and_ps(mask, res_big), _mm_andnot_ps(mask, res_small));
}

static inline __m128 dt_XYZ_to_Lab_SSE(const __m128 XYZ)
{
  const __m128 d50_inv = _mm_set_ps(0.0f, 1.0f / 0.8249f, 1.0f, 1.0f / 0.9642f);
  const __m128 coef = _mm_set_ps(0.0f, 200.0f, 500.0f, 116.0f);
  const __m128 f = lab_f_m_SSE(_mm_mul_ps(XYZ, d50_inv));
  // because d50_inv.z is 0.0f, lab_f(0) == 16/116, so Lab[0] = 116*f[0] - 16 equal to 116*(f[0]-f[3])
  return _mm_mul_ps(coef, _mm_sub_ps(_mm_shuffle_ps(f, f, _MM_SHUFFLE(3, 1, 0, 1)),
                                     _mm_shuffle_ps(f, f, _MM_SHUFFLE(3, 2, 1, 3))));
}
//------------------------------------------------------------------------------

int main()
{
	const float L_min = -1.0f;
	const float L_max = 2.0f;
	const float a_min = -1.0f;
	const float a_max = 2.0f;
	const float b_min = -1.0f;
	const float b_max = 2.0f;

	printf("DATA POINTS = %.0e\n", _DATAPOINTS);
	printf("STOPS = %zu\n\n", STOPS);

	// go through all L
#ifdef _OPENMP
#pragma omp parallel for simd schedule(static) collapse(3)
#endif
	for(size_t i = 0; i < (STOPS); i++)
	{
		// go through all a
		for(size_t j = 0; j < STOPS; j++)
		{
			// go through all b
			for(size_t k = 0; k < STOPS; k++)
			{
				const float L = interpolate(L_min, L_max, i);
				const float a = interpolate(a_min, a_max, j);
				const float b = interpolate(b_min, b_max, k);

				const __m128 Lab = _mm_set_ps(0.0f, b, a, L);
				const __m128 XYZ_SSE = dt_XYZ_to_Lab_SSE(Lab);
				float _xyz_SSE[4];
				_mm_store_ps(_xyz_SSE, XYZ_SSE);

				const __m128 XYZ = dt_XYZ_to_Lab(Lab);
				float _xyz[4];
				_mm_store_ps(_xyz, XYZ);

#define DIFF(x, y) (fmax((x), (y))-fmin((x), (y)))

				if(memcmp(&XYZ, &XYZ_SSE, sizeof(__m128)) != 0)
				{
					printf("XYZ      : %6.1f %6.1f %6.1f %6.1f\n", L, a, b, 0.0f);
					printf("Lab (SSE): %6.1f %6.1f %6.1f %6.1f\n", _xyz_SSE[0], _xyz_SSE[1], _xyz_SSE[2], _xyz_SSE[3]);
					printf("Lab      : %6.1f %6.1f %6.1f %6.1f\n", _xyz[0], _xyz[1], _xyz[2], _xyz[3]);
					printf("Lab diff : %6.e %6.e %6.e %6.e\n\n", DIFF(_xyz[0], _xyz_SSE[0]), DIFF(_xyz[1], _xyz_SSE[1]), DIFF(_xyz[2], _xyz_SSE[2]), DIFF(_xyz[3], _xyz_SSE[3]));
				}
			}
		}
	}
}
	/*
	* $ rm a.out; gcc -Wall -lm -O3 -fopenmp main.c && time ./a.out
	*/

	#include <stdio.h> // for printf
	#include <string.h>
	#include <math.h>
	#include <xmmintrin.h> // for __m128, _mm_mul_ps, _mm_set1_ps, _mm_add_ps

	#define CLAMP(x, low, high) (((x) > (high)) ? (high) : (((x) < (low)) ? (low) : (x)))

	// <= 4 sec to check (1e8 - best)
	#define _DATAPOINTS 1e8
	#define STOPS ((size_t)(round(cbrtf(_DATAPOINTS))))

	static inline __m128 lab_f_m_SSE(const __m128 x);

	/*
	* interpolate values from p1 and p2 into out.
	*/
	static float interpolate(const float min, // the smaller tuning
	const float max, // the larger tuning (can't be == p1)
	const size_t i)
	{
	// stupid linear interpolation.
	// to be confirmed.
	const double t = (((double)i) / ((double)(STOPS-1)));
	return (1.0 - t) * min + t * max;
	}

	static inline float cbrtf_fast(const float x)
	{
	union convert
	{
	float f;
	int i;
	} data, dataout;

	data.f = x;

	// approximate cbrtf(x):
	dataout.i =
	(
	(
	(int)
	(
	(
	(float)
	(
	data.i
	)
	)
	/
	3.0f
	)
	)
	+
	709921077
	);

	return dataout.f;
	}

	static inline float lab_f_m(const float x)
	{
	const float epsilon = (216.0f / 24389.0f);
	const float kappa = (24389.0f / 27.0f);

	const float a = cbrtf_fast(x);
	const float a3 = aaa;

	// x > epsilon
	const float res_big = (((a) * ((x+x)+a3)) / ((a3+a3)+x));

	// x <= epsilon
	const float res_small = (((kappa*x) + (16.0f)) / (116.0f));

	// blend results according to whether each component is > epsilon or not
	return ((x > epsilon) ? res_big : res_small);
	}

	static inline __m128 dt_XYZ_to_Lab(const __m128 XYZ)
	{
	const float d50_inv[] = { 1.0f / 0.9642f, 1.0f, 1.0f / 0.8249f, 0.0f };
	const float coef[] = { 116.0f, 500.0f, 200.0f, 0.0f };

	float farr[4];
	_mm_store_ps(farr, XYZ);

	for(int c = 0; c < 4; c++)
	{
	farr[c] = lab_f_m(d50_inv[c]*farr[c]);
	}

	// because d50_inv.z is 0.0f, lab_f(0) == 16/116, so Lab[0] = 116f[0] - 16 equal to 116(f[0]-f[3])

	float sf1[4];
	sf1[0] = farr[1];
	sf1[1] = farr[0];
	sf1[2] = farr[1];
	sf1[3] = farr[3];

	float sf2[4];
	sf2[0] = farr[3];
	sf2[1] = farr[1];
	sf2[2] = farr[2];
	sf2[3] = farr[3];

	float fsub[4];
	for(int c = 0; c < 4; c++)
	{
	fsub[c] = (sf1[c]-sf2[c])*coef[c];
	}

	return _mm_set_ps(0.0f, fsub[2], fsub[1], fsub[0]);
	}

	//------------------------------------------------------------------------------
	static inline __m128 cbrtf_sse(const __m128 x)
	{
	// approximate cbrtf(x):
	return _mm_castsi128_ps(
	_mm_add_epi32(_mm_cvtps_epi32(_mm_div_ps(_mm_cvtepi32_ps(_mm_castps_si128(x)), _mm_set1_ps(3.0f))),
	_mm_set1_epi32(709921077)));
	}

	static inline __m128 lab_f_m_SSE(const __m128 x)
	{
	const __m128 epsilon = _mm_set1_ps(216.0f / 24389.0f);
	const __m128 kappa = _mm_set1_ps(24389.0f / 27.0f);

	// calculate as if x > epsilon : result = cbrtf(x)
	// approximate cbrtf(x):
	const __m128 a = cbrtf_sse(x);

	const __m128 a3 = _mm_mul_ps(_mm_mul_ps(a, a), a);
	const __m128 res_big = _mm_div_ps(_mm_mul_ps(a,_mm_add_ps(a3,_mm_add_ps(x,x))),_mm_add_ps(_mm_add_ps(a3, a3), x));

	// calculate as if x <= epsilon : result = (kappa*x+16)/116
	const __m128 res_small
	= _mm_div_ps(
	_mm_add_ps(_mm_mul_ps(kappa, x), _mm_set1_ps(16.0f)), _mm_set1_ps(116.0f));

	// blend results according to whether each component is > epsilon or not
	const __m128 mask = _mm_cmpgt_ps(x, epsilon);
	return _mm_or_ps(_mm_and_ps(mask, res_big), _mm_andnot_ps(mask, res_small));
	}

	static inline __m128 dt_XYZ_to_Lab_SSE(const __m128 XYZ)
	{
	const __m128 d50_inv = _mm_set_ps(0.0f, 1.0f / 0.8249f, 1.0f, 1.0f / 0.9642f);
	const __m128 coef = _mm_set_ps(0.0f, 200.0f, 500.0f, 116.0f);
	const __m128 f = lab_f_m_SSE(_mm_mul_ps(XYZ, d50_inv));
	// because d50_inv.z is 0.0f, lab_f(0) == 16/116, so Lab[0] = 116f[0] - 16 equal to 116(f[0]-f[3])
	return _mm_mul_ps(coef, _mm_sub_ps(_mm_shuffle_ps(f, f, _MM_SHUFFLE(3, 1, 0, 1)),
	_mm_shuffle_ps(f, f, _MM_SHUFFLE(3, 2, 1, 3))));
	}
	//------------------------------------------------------------------------------

	int main()
	{
	const float L_min = -1.0f;
	const float L_max = 2.0f;
	const float a_min = -1.0f;
	const float a_max = 2.0f;
	const float b_min = -1.0f;
	const float b_max = 2.0f;

	printf("DATA POINTS = %.0e\n", _DATAPOINTS);
	printf("STOPS = %zu\n\n", STOPS);

	// go through all L
	#ifdef _OPENMP
	#pragma omp parallel for simd schedule(static) collapse(3)
	#endif
	for(size_t i = 0; i < (STOPS); i++)
	{
	// go through all a
	for(size_t j = 0; j < STOPS; j++)
	{
	// go through all b
	for(size_t k = 0; k < STOPS; k++)
	{
	const float L = interpolate(L_min, L_max, i);
	const float a = interpolate(a_min, a_max, j);
	const float b = interpolate(b_min, b_max, k);

	const __m128 Lab = _mm_set_ps(0.0f, b, a, L);
	const __m128 XYZ_SSE = dt_XYZ_to_Lab_SSE(Lab);
	float _xyz_SSE[4];
	_mm_store_ps(_xyz_SSE, XYZ_SSE);

	const __m128 XYZ = dt_XYZ_to_Lab(Lab);
	float _xyz[4];
	_mm_store_ps(_xyz, XYZ);

	#define DIFF(x, y) (fmax((x), (y))-fmin((x), (y)))

	if(memcmp(&XYZ, &XYZ_SSE, sizeof(__m128)) != 0)
	{
	printf("XYZ : %6.1f %6.1f %6.1f %6.1f\n", L, a, b, 0.0f);
	printf("Lab (SSE): %6.1f %6.1f %6.1f %6.1f\n", _xyz_SSE[0], _xyz_SSE[1], _xyz_SSE[2], _xyz_SSE[3]);
	printf("Lab : %6.1f %6.1f %6.1f %6.1f\n", _xyz[0], _xyz[1], _xyz[2], _xyz[3]);
	printf("Lab diff : %6.e %6.e %6.e %6.e\n\n", DIFF(_xyz[0], _xyz_SSE[0]), DIFF(_xyz[1], _xyz_SSE[1]), DIFF(_xyz[2], _xyz_SSE[2]), DIFF(_xyz[3], _xyz_SSE[3]));
	}
	}
	}
	}
	}