Last active
January 15, 2016 20:53
-
-
Save LebedevRI/087c6bda959e12768866 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* | |
* $ rm a.out; gcc -Wall -lm -O3 -fopenmp main.c && time ./a.out | |
*/ | |
#include <stdio.h> // for printf | |
#include <string.h> | |
#include <math.h> | |
#include <xmmintrin.h> // for __m128, _mm_mul_ps, _mm_set1_ps, _mm_add_ps | |
#define CLAMP(x, low, high) (((x) > (high)) ? (high) : (((x) < (low)) ? (low) : (x))) | |
// <= 4 sec to check (1e8 - best) | |
#define _DATAPOINTS 1e8 | |
#define STOPS ((size_t)(round(cbrtf(_DATAPOINTS)))) | |
/* | |
* interpolate values from p1 and p2 into out. | |
*/ | |
static float interpolate(const float min, // the smaller tuning | |
const float max, // the larger tuning (can't be == p1) | |
const size_t i) | |
{ | |
// stupid linear interpolation. | |
// to be confirmed. | |
const double t = (((double)i) / ((double)(STOPS-1))); | |
return (1.0 - t) * min + t * max; | |
} | |
static inline float lab_f_inv_m(const float x) | |
{ | |
const float epsilon = (0.20689655172413796f); // cbrtf(216.0f/24389.0f); | |
const float kappa_rcp_x16 = (16.0f * 27.0f / 24389.0f); | |
const float kappa_rcp_x116 = (116.0f * 27.0f / 24389.0f); | |
// x > epsilon | |
float res_big = x*x*x; | |
// x <= epsilon | |
float res_small = ((kappa_rcp_x116*x)-kappa_rcp_x16); | |
// blend results according to whether each component is > epsilon or not | |
return((x > epsilon) ? res_big : res_small); | |
} | |
static inline __m128 lab_f_inv_m_SSE(const __m128 x) | |
{ | |
const __m128 epsilon = _mm_set1_ps(0.20689655172413796f); // cbrtf(216.0f/24389.0f); | |
const __m128 kappa_rcp_x16 = _mm_set1_ps(16.0f * 27.0f / 24389.0f); | |
const __m128 kappa_rcp_x116 = _mm_set1_ps(116.0f * 27.0f / 24389.0f); | |
// x > epsilon | |
const __m128 res_big = _mm_mul_ps(_mm_mul_ps(x, x), x); | |
// x <= epsilon | |
const __m128 res_small = _mm_sub_ps(_mm_mul_ps(kappa_rcp_x116, x), kappa_rcp_x16); | |
// blend results according to whether each component is > epsilon or not | |
const __m128 mask = _mm_cmpgt_ps(x, epsilon); | |
return _mm_or_ps(_mm_and_ps(mask, res_big), _mm_andnot_ps(mask, res_small)); | |
} | |
static inline void dt_Lab_to_XYZ(const float *Lab, float *xyz) | |
{ | |
const float d50[] = {0.9642f, 1.0f, 0.8249f, 0.0}; | |
const float coef[] = {1.0f / 500.0f, 1.0f / 116.0f, -1.0f / 200.0f, 0.0f}; | |
const float offset = (0.137931034f); | |
// last component ins shuffle taken from 1st component of Lab to make sure it is not nan, so it will become | |
// 0.0f in f | |
float _F[4]; | |
_F[0] = Lab[1]; | |
_F[1] = Lab[0]; | |
_F[2] = Lab[2]; | |
_F[3] = Lab[0]; | |
for(int c = 0; c < 4; c++) | |
{ | |
_F[c] *= coef[c]; | |
} | |
float _F1[4]; | |
_F1[0] = _F[1]; | |
_F1[1] = 0.0f; | |
_F1[2] = _F[1]; | |
_F1[3] = _F[1]; | |
float _F2[4]; | |
for(int c = 0; c < 4; c++) | |
{ | |
_F2[c] = _F[c]+_F1[c]+offset; | |
} | |
for(int c = 0; c < 3; c++) | |
{ | |
xyz[c] = d50[c] * lab_f_inv_m(_F2[c]); | |
} | |
} | |
static inline __m128 dt_Lab_to_XYZ_SSE(const __m128 Lab) | |
{ | |
const __m128 d50 = _mm_set_ps(0.0f, 0.8249f, 1.0f, 0.9642f); | |
const __m128 coef = _mm_set_ps(0.0f, -1.0f / 200.0f, 1.0f / 116.0f, 1.0f / 500.0f); | |
const __m128 offset = _mm_set1_ps(0.137931034f); | |
// last component ins shuffle taken from 1st component of Lab to make sure it is not nan, so it will become | |
// 0.0f in f | |
const __m128 f = _mm_mul_ps(_mm_shuffle_ps(Lab, Lab, _MM_SHUFFLE(0, 2, 0, 1)), coef); | |
return _mm_mul_ps( | |
d50, lab_f_inv_m_SSE(_mm_add_ps(_mm_add_ps(f, _mm_shuffle_ps(f, f, _MM_SHUFFLE(1, 1, 3, 1))), offset))); | |
} | |
int main() | |
{ | |
const float L_min = 0.0f; | |
const float L_max = 100.0f; | |
const float a_min = -128.0f; | |
const float a_max = 128.0f; | |
const float b_min = -128.0f; | |
const float b_max = 128.0f; | |
// const double mul = 1.0f; | |
const double mul = 2.0f; | |
printf("DATA POINTS = %.0e\n", _DATAPOINTS); | |
printf("STOPS = %zu\n\n", STOPS); | |
// go through all L | |
#ifdef _OPENMP | |
#pragma omp parallel for simd schedule(static) collapse(3) | |
#endif | |
for(size_t i = 0; i < (STOPS); i++) | |
{ | |
// go through all a | |
for(size_t j = 0; j < STOPS; j++) | |
{ | |
// go through all b | |
for(size_t k = 0; k < STOPS; k++) | |
{ | |
const float L = interpolate(L_min*mul, L_max*mul, i); | |
const float a = interpolate(a_min*mul, a_max*mul, j); | |
const float b = interpolate(b_min*mul, b_max*mul, k); | |
const __m128 Lab = _mm_set_ps(0.0f, b, a, L); | |
const __m128 XYZ_SSE = dt_Lab_to_XYZ_SSE(Lab); | |
const float Lab_f[] = {L, a, b}; | |
float xyz_f[3]; | |
dt_Lab_to_XYZ(Lab_f, xyz_f); | |
const __m128 XYZ = _mm_set_ps(0.0f, xyz_f[2], xyz_f[1], xyz_f[0]); | |
if(memcmp(&XYZ, &XYZ_SSE, sizeof(__m128)) != 0) printf("%6.1f %6.1f %6.1f\n", L, a, b); | |
} | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment