gatoatigrado/lcms.diff

## lcms.diff
diff --git a/src/cmslut.c b/src/cmslut.c
index 3c0c245..6c7edfd 100644
--- a/src/cmslut.c
+++ b/src/cmslut.c
@@ -24,6 +24,9 @@
 //---------------------------------------------------------------------------------
 //

+#include <stdio.h>
+#include <stdint.h>  // uintptr_t; replaces the GCC-private <stdint-gcc.h>
+
 #include "lcms2_internal.h"


@@ -78,28 +81,6 @@ cmsStage* CMSEXPORT cmsStageAllocIdentity(cmsContext ContextID, cmsUInt32Number
                                    NULL);
  }

-// Conversion functions. From floating point to 16 bits
-static
-void FromFloatTo16(const cmsFloat32Number In[], cmsUInt16Number Out[], cmsUInt32Number n)
-{
-    cmsUInt32Number i;
-
-    for (i=0; i < n; i++) {
-        Out[i] = _cmsQuickSaturateWord(In[i] * 65535.0);
-    }
-}
-
-// From 16 bits to floating point
-static
-void From16ToFloat(const cmsUInt16Number In[], cmsFloat32Number Out[], cmsUInt32Number n)
-{
-    cmsUInt32Number i;
-
-    for (i=0; i < n; i++) {
-        Out[i] = (cmsFloat32Number) In[i] / 65535.0F;
-    }
-}
-

 // This function is quite useful to analyze the structure of a LUT and retrieve the MPE elements
 // that conform the LUT. It should be called with the LUT, the number of expected elements and
@@ -308,7 +289,6 @@ cmsStage* CMSEXPORT _cmsStageAllocIdentityCurves(cmsContext ContextID, cmsUInt32


 // Special care should be taken here because precision loss. A temporary cmsFloat64Number buffer is being used
-static
 void EvaluateMatrix(const cmsFloat32Number In[],
                     cmsFloat32Number Out[],
                     const cmsStage *mpe)
@@ -940,7 +920,6 @@ cmsInt32Number CMSEXPORT cmsSliceSpaceFloat(cmsUInt32Number nInputs, const cmsUI
 // ********************************************************************************


-static
 void EvaluateLab2XYZ(const cmsFloat32Number In[],
                      cmsFloat32Number Out[],
                      const cmsStage *mpe)
@@ -1316,7 +1295,6 @@ cmsBool BlessLUT(cmsPipeline* lut)


 // Default to evaluate the LUT on 16 bit-basis. Precision is retained.
-static
 void _LUTeval16(register const cmsUInt16Number In[], register cmsUInt16Number Out[],  register const void* D)
 {
     cmsPipeline* lut = (cmsPipeline*) D;
@@ -1326,12 +1304,14 @@ void _LUTeval16(register const cmsUInt16Number In[], register cmsUInt16Number Ou

     From16ToFloat(In, &Storage[Phase][0], lut ->InputChannels);

-    for (mpe = lut ->Elements;
+    for (mpe = lut->Elements;
          mpe != NULL;
-         mpe = mpe ->Next) {
+         mpe = mpe->Next) {

              NextPhase = Phase ^ 1;
-             mpe ->EvalPtr(&Storage[Phase][0], &Storage[NextPhase][0], mpe);
+            uintptr_t ptr = (uintptr_t)(mpe->EvalPtr);  // debug trace: address of this stage's evaluator
+            printf("Function: %lx\n", (unsigned long) ptr);  // %lx expects unsigned long, not uintptr_t
+             mpe->EvalPtr(&Storage[Phase][0], &Storage[NextPhase][0], mpe);
              Phase = NextPhase;
     }

diff --git a/src/cmsxform.c b/src/cmsxform.c
index 6b2950e..d199365 100644
--- a/src/cmsxform.c
+++ b/src/cmsxform.c
@@ -341,7 +341,7 @@ void NullFloatXFORM(_cmsTRANSFORM* p,

 // 16 bit precision -----------------------------------------------------------------------------------------------------------

-// Null transformation, only applies formatters. No cach�
+// Null transformation, only applies formatters. No cache
 static
 void NullXFORM(_cmsTRANSFORM* p,
                const void* in,
@@ -400,20 +400,27 @@ void PrecalculatedXFORM(_cmsTRANSFORM* p,
     memset(wIn, 0, sizeof(wIn));
     memset(wOut, 0, sizeof(wOut));

-    for (i = 0; i < LineCount; i++) {
-
-        accum = (cmsUInt8Number*)in + strideIn;
-        output = (cmsUInt8Number*)out + strideOut;
-
-        for (j = 0; j < PixelsPerLine; j++) {
-
-            accum = p->FromInput(p, wIn, accum, Stride->BytesPerPlaneIn);
-            p->Lut->Eval16Fn(wIn, wOut, p->Lut->Data);
-            output = p->ToOutput(p, wOut, output, Stride->BytesPerPlaneOut);
-        }
+#define PRECALCULATED_LOOP_BODY(fcn) \
+    for (i = 0; i < LineCount; i++) { \
+        /* Each line restarts from the in/out base pointers plus the running byte offsets. */ \
+        accum = (cmsUInt8Number*)in + strideIn; \
+        output = (cmsUInt8Number*)out + strideOut; \
+        /* Per pixel: FromInput(wIn) -> fcn(wIn, wOut) -> ToOutput(wOut); accum/output take the returned pointers. */ \
+        for (j = 0; j < PixelsPerLine; j++) { \
+            /* fcn is substituted textually, so naming a function here produces a direct call. */ \
+            accum = p->FromInput(p, wIn, accum, Stride->BytesPerPlaneIn); \
+            fcn(wIn, wOut, p->Lut->Data); \
+            output = p->ToOutput(p, wOut, output, Stride->BytesPerPlaneOut); \
+        } \
+        /* Advance the offsets by BytesPerLineIn / BytesPerLineOut. */ \
+        strideIn += Stride->BytesPerLineIn; \
+        strideOut += Stride->BytesPerLineOut; \
+    }

-        strideIn += Stride->BytesPerLineIn;
-        strideOut += Stride->BytesPerLineOut;
+    if (p->Lut->Eval16Fn == &_LUTeval16) {
+        PRECALCULATED_LOOP_BODY(_LUTeval16Inline)
+    } else {
+        PRECALCULATED_LOOP_BODY(p->Lut->Eval16Fn)
     }

 }
@@ -442,7 +449,7 @@ void TransformOnePixelWithGamutCheck(_cmsTRANSFORM* p,
         p ->Lut ->Eval16Fn(wIn, wOut, p -> Lut->Data);
 }

-// Gamut check, No cach�, 16 bits.
+// Gamut check, No cache, 16 bits.
 static
 void PrecalculatedXFORMGamutCheck(_cmsTRANSFORM* p,
                                   const void* in,
@@ -481,7 +488,7 @@ void PrecalculatedXFORMGamutCheck(_cmsTRANSFORM* p,
 }


-// No gamut check, Cach�, 16 bits,
+// No gamut check, Cache, 16 bits,
 static
 void CachedXFORM(_cmsTRANSFORM* p,
                  const void* in,
@@ -839,7 +846,7 @@ _cmsTRANSFORM* AllocEmptyTransform(cmsContext ContextID, cmsPipeline* lut,
             p ->xform = NullFloatXFORM;
         }
         else {
-            // Float transforms don't use cach�, always are non-NULL
+            // Float transforms don't use cache, always are non-NULL
             p ->xform = FloatXFORM;
         }

@@ -878,16 +885,16 @@ _cmsTRANSFORM* AllocEmptyTransform(cmsContext ContextID, cmsPipeline* lut,
             if (*dwFlags & cmsFLAGS_NOCACHE) {

                 if (*dwFlags & cmsFLAGS_GAMUTCHECK)
-                    p ->xform = PrecalculatedXFORMGamutCheck;  // Gamut check, no cach�
+                    p ->xform = PrecalculatedXFORMGamutCheck;  // Gamut check, no cache
                 else
-                    p ->xform = PrecalculatedXFORM;  // No cach�, no gamut check
+                    p ->xform = PrecalculatedXFORM;  // No cache, no gamut check
             }
             else {

                 if (*dwFlags & cmsFLAGS_GAMUTCHECK)
-                    p ->xform = CachedXFORMGamutCheck;    // Gamut check, cach�
+                    p ->xform = CachedXFORMGamutCheck;    // Gamut check, cache
                 else
-                    p ->xform = CachedXFORM;  // No gamut check, cach�
+                    p ->xform = CachedXFORM;  // No gamut check, cache

             }
         }
diff --git a/src/lcms2_internal.h b/src/lcms2_internal.h
index 5f5270c..a0183b8 100644
--- a/src/lcms2_internal.h
+++ b/src/lcms2_internal.h
@@ -875,6 +875,14 @@ struct _cmsStage_struct {
     struct _cmsStage_struct* Next;
 };

+// Non-static and declared here so _LUTeval16Inline and PrecalculatedXFORM can compare function pointers against them
+void _LUTeval16(register const cmsUInt16Number In[], register cmsUInt16Number Out[],  register const void* D);
+void EvaluateLab2XYZ(const cmsFloat32Number In[],
+                     cmsFloat32Number Out[],
+                     const cmsStage *mpe);
+void EvaluateMatrix(const cmsFloat32Number In[],
+                    cmsFloat32Number Out[],
+                    const cmsStage *mpe);

 // Special Stages (cannot be saved)
 CMSCHECKPOINT cmsStage*  CMSEXPORT _cmsStageAllocLab2XYZ(cmsContext ContextID);
@@ -1106,6 +1114,145 @@ cmsBool   _cmsAdaptationMatrix(cmsMAT3* r, const cmsMAT3* ConeMatrix, const cmsC

 cmsBool   _cmsBuildRGB2XYZtransferMatrix(cmsMAT3* r, const cmsCIExyY* WhitePoint, const cmsCIExyYTRIPLE* Primaries);

+// Converts n floats to 16-bit words: each In[i] is scaled by 65535.0 and passed to _cmsQuickSaturateWord.
+static inline __attribute__((always_inline))
+void FromFloatTo16(const cmsFloat32Number In[], cmsUInt16Number Out[], cmsUInt32Number n)
+{
+    cmsUInt32Number i;
+    // static: defined in a header, so each including file gets its own internal-linkage copy
+    for (i=0; i < n; i++) {
+        Out[i] = _cmsQuickSaturateWord(In[i] * 65535.0);
+    }
+}
+
+// Converts n 16-bit words to floats: Out[i] = In[i] / 65535.0F.
+static inline __attribute__((always_inline))
+void From16ToFloat(const cmsUInt16Number In[], cmsFloat32Number Out[], cmsUInt32Number n)
+{
+    cmsUInt32Number i;
+    // static: defined in a header, so each including file gets its own internal-linkage copy
+    for (i=0; i < n; i++) {
+        Out[i] = (cmsFloat32Number) In[i] / 65535.0F;
+    }
+}
+
+static inline __attribute__((always_inline))
+cmsFloat64Number f_1Inline(cmsFloat64Number t)
+{
+    // Piecewise helper used by cmsLab2XYZInline: linear for t <= 24/116, t cubed otherwise. static: header-defined.
+    const cmsFloat64Number Limit = (24.0/116.0);
+    if (t <= Limit) {
+        return (108.0/841.0) * (t - (16.0/116.0));
+    }
+
+    return t * t * t;
+}
+
+static inline __attribute__((always_inline))
+void cmsLab2XYZInline(const cmsCIEXYZ* WhitePoint, cmsCIEXYZ* xyz,  const cmsCIELab* Lab)
+{
+    // Header-local Lab -> XYZ conversion; it has internal linkage, so it carries no CMSEXPORT decoration.
+    cmsFloat64Number x, y, z;
+    // A NULL white point selects cmsD50_XYZ().
+    if (WhitePoint == NULL)
+        WhitePoint = cmsD50_XYZ();
+
+    y = (Lab-> L + 16.0) / 116.0;
+    x = y + 0.002 * Lab -> a;
+    z = y - 0.005 * Lab -> b;
+    // Each intermediate goes through f_1Inline and is scaled by the matching white point component.
+    xyz -> X = f_1Inline(x) * WhitePoint -> X;
+    xyz -> Y = f_1Inline(y) * WhitePoint -> Y;
+    xyz -> Z = f_1Inline(z) * WhitePoint -> Z;
+}
+
+static inline __attribute__((always_inline))
+void EvaluateLab2XYZInline(const cmsFloat32Number In[],
+                           cmsFloat32Number Out[],
+                           const cmsStage *mpe)
+{
+    cmsCIELab Lab;
+    cmsCIEXYZ XYZ;
+    const cmsFloat64Number XYZadj = MAX_ENCODEABLE_XYZ;
+    // Header-local stage evaluator (static: header-defined). Reads In[0..2], writes Out[0..2] = XYZ / XYZadj.
+    // V4 rules
+    Lab.L = In[0] * 100.0;
+    Lab.a = In[1] * 255.0 - 128.0;
+    Lab.b = In[2] * 255.0 - 128.0;
+    // NULL white point: cmsLab2XYZInline substitutes cmsD50_XYZ()
+    cmsLab2XYZInline(NULL, &XYZ, &Lab);
+
+    // From XYZ, range 0..1.99997 to 0..1.0, note that 1.99997 comes from 0xffff
+    // encoded as 1.15 fixed point, so 1 + (32767.0 / 32768.0)
+
+    Out[0] = (cmsFloat32Number) ((cmsFloat64Number) XYZ.X / XYZadj);
+    Out[1] = (cmsFloat32Number) ((cmsFloat64Number) XYZ.Y / XYZadj);
+    Out[2] = (cmsFloat32Number) ((cmsFloat64Number) XYZ.Z / XYZadj);
+    return;
+    // Placed after the return, so it never executes; it only mentions the otherwise unused mpe.
+    cmsUNUSED_PARAMETER(mpe);
+}
+
+static inline __attribute__((always_inline))
+void EvaluateMatrixInline(const cmsFloat32Number In[],
+                    cmsFloat32Number Out[],
+                    const cmsStage *mpe)
+{
+    cmsUInt32Number i, j;
+    _cmsStageMatrixData* Data = (_cmsStageMatrixData*) mpe ->Data;
+    cmsFloat64Number Tmp;
+    // Header-local matrix stage (static: header-defined): Out = Double * In (+ Offset), summed in cmsFloat64Number.
+    // Input is already in 0..1.0 notation
+    for (i=0; i < mpe ->OutputChannels; i++) {
+        // Double is indexed as row i, column j, with InputChannels entries per row
+        Tmp = 0;
+        for (j=0; j < mpe->InputChannels; j++) {
+            Tmp += In[j] * Data->Double[i*mpe->InputChannels + j];
+        }
+        // Offset may be NULL, in which case nothing is added
+        if (Data ->Offset != NULL)
+            Tmp += Data->Offset[i];
+
+        Out[i] = (cmsFloat32Number) Tmp;
+    }
+
+
+    // Output in 0..1.0 domain
+}
+
+static inline __attribute__((always_inline))
+void _LUTeval16Inline(
+        register const cmsUInt16Number In[],
+        register cmsUInt16Number Out[],
+        register const void* D)
+{
+    cmsPipeline* lut = (cmsPipeline*) D;
+    cmsStage *mpe;
+    cmsFloat32Number Storage[2][MAX_STAGE_CHANNELS];
+    int Phase = 0, NextPhase;
+    // Two buffers: each stage reads Storage[Phase] and writes Storage[Phase ^ 1]. static: header-defined.
+    From16ToFloat(In, &Storage[Phase][0], lut ->InputChannels);
+
+    for (mpe = lut->Elements;
+         mpe != NULL;
+         mpe = mpe->Next) {
+        // Stages whose evaluator is EvaluateLab2XYZ / EvaluateMatrix use the header-local copies; others go through EvalPtr
+        NextPhase = Phase ^ 1;
+        if (mpe->EvalPtr == &EvaluateLab2XYZ) {
+            EvaluateLab2XYZInline(&Storage[Phase][0], &Storage[NextPhase][0], mpe);
+        } else if (mpe->EvalPtr == &EvaluateMatrix) {
+            EvaluateMatrixInline(&Storage[Phase][0], &Storage[NextPhase][0], mpe);
+        } else {
+            mpe->EvalPtr(&Storage[Phase][0], &Storage[NextPhase][0], mpe);
+        }
+        Phase = NextPhase;
+    }
+
+    // Phase indexes the buffer written by the last stage (or the converted input if there were no stages)
+    FromFloatTo16(&Storage[Phase][0], Out, lut->OutputChannels);
+}
+
+

 #define _lcms_internal_H
 #endif
	diff --git a/src/cmslut.c b/src/cmslut.c
	index 3c0c245..6c7edfd 100644
	--- a/src/cmslut.c
	+++ b/src/cmslut.c
	@@ -24,6 +24,9 @@
	//---------------------------------------------------------------------------------
	//

	+#include <stdio.h>
	+#include <stdint.h> // uintptr_t; replaces the GCC-private <stdint-gcc.h>
	+
	#include "lcms2_internal.h"


	@@ -78,28 +81,6 @@ cmsStage* CMSEXPORT cmsStageAllocIdentity(cmsContext ContextID, cmsUInt32Number
	NULL);
	}

	-// Conversion functions. From floating point to 16 bits
	-static
	-void FromFloatTo16(const cmsFloat32Number In[], cmsUInt16Number Out[], cmsUInt32Number n)
	-{
	- cmsUInt32Number i;
	-
	- for (i=0; i < n; i++) {
	- Out[i] = _cmsQuickSaturateWord(In[i] * 65535.0);
	- }
	-}
	-
	-// From 16 bits to floating point
	-static
	-void From16ToFloat(const cmsUInt16Number In[], cmsFloat32Number Out[], cmsUInt32Number n)
	-{
	- cmsUInt32Number i;
	-
	- for (i=0; i < n; i++) {
	- Out[i] = (cmsFloat32Number) In[i] / 65535.0F;
	- }
	-}
	-

	// This function is quite useful to analyze the structure of a LUT and retrieve the MPE elements
	// that conform the LUT. It should be called with the LUT, the number of expected elements and
	@@ -308,7 +289,6 @@ cmsStage* CMSEXPORT _cmsStageAllocIdentityCurves(cmsContext ContextID, cmsUInt32


	// Special care should be taken here because precision loss. A temporary cmsFloat64Number buffer is being used
	-static
	void EvaluateMatrix(const cmsFloat32Number In[],
	cmsFloat32Number Out[],
	const cmsStage *mpe)
	@@ -940,7 +920,6 @@ cmsInt32Number CMSEXPORT cmsSliceSpaceFloat(cmsUInt32Number nInputs, const cmsUI
	// ********************************************************************************


	-static
	void EvaluateLab2XYZ(const cmsFloat32Number In[],
	cmsFloat32Number Out[],
	const cmsStage *mpe)
	@@ -1316,7 +1295,6 @@ cmsBool BlessLUT(cmsPipeline* lut)


	// Default to evaluate the LUT on 16 bit-basis. Precision is retained.
	-static
	void _LUTeval16(register const cmsUInt16Number In[], register cmsUInt16Number Out[], register const void* D)
	{
	cmsPipeline* lut = (cmsPipeline*) D;
	@@ -1326,12 +1304,14 @@ void _LUTeval16(register const cmsUInt16Number In[], register cmsUInt16Number Ou

	From16ToFloat(In, &Storage[Phase][0], lut ->InputChannels);

	- for (mpe = lut ->Elements;
	+ for (mpe = lut->Elements;
	mpe != NULL;
	- mpe = mpe ->Next) {
	+ mpe = mpe->Next) {

	NextPhase = Phase ^ 1;
	- mpe ->EvalPtr(&Storage[Phase][0], &Storage[NextPhase][0], mpe);
	+ uintptr_t ptr = (uintptr_t)(mpe->EvalPtr); // debug trace: address of this stage's evaluator
	+ printf("Function: %lx\n", (unsigned long) ptr); // %lx expects unsigned long, not uintptr_t
	+ mpe->EvalPtr(&Storage[Phase][0], &Storage[NextPhase][0], mpe);
	Phase = NextPhase;
	}

	diff --git a/src/cmsxform.c b/src/cmsxform.c
	index 6b2950e..d199365 100644
	--- a/src/cmsxform.c
	+++ b/src/cmsxform.c
	@@ -341,7 +341,7 @@ void NullFloatXFORM(_cmsTRANSFORM* p,

	// 16 bit precision -----------------------------------------------------------------------------------------------------------

	-// Null transformation, only applies formatters. No cach�
	+// Null transformation, only applies formatters. No cache
	static
	void NullXFORM(_cmsTRANSFORM* p,
	const void* in,
	@@ -400,20 +400,27 @@ void PrecalculatedXFORM(_cmsTRANSFORM* p,
	memset(wIn, 0, sizeof(wIn));
	memset(wOut, 0, sizeof(wOut));

	- for (i = 0; i < LineCount; i++) {
	-
	- accum = (cmsUInt8Number*)in + strideIn;
	- output = (cmsUInt8Number*)out + strideOut;
	-
	- for (j = 0; j < PixelsPerLine; j++) {
	-
	- accum = p->FromInput(p, wIn, accum, Stride->BytesPerPlaneIn);
	- p->Lut->Eval16Fn(wIn, wOut, p->Lut->Data);
	- output = p->ToOutput(p, wOut, output, Stride->BytesPerPlaneOut);
	- }
	+#define PRECALCULATED_LOOP_BODY(fcn) \
	+ for (i = 0; i < LineCount; i++) { \
	+ /* Each line restarts from the in/out base pointers plus the running byte offsets. */ \
	+ accum = (cmsUInt8Number*)in + strideIn; \
	+ output = (cmsUInt8Number*)out + strideOut; \
	+ /* Per pixel: FromInput(wIn) -> fcn(wIn, wOut) -> ToOutput(wOut); accum/output take the returned pointers. */ \
	+ for (j = 0; j < PixelsPerLine; j++) { \
	+ /* fcn is substituted textually, so naming a function here produces a direct call. */ \
	+ accum = p->FromInput(p, wIn, accum, Stride->BytesPerPlaneIn); \
	+ fcn(wIn, wOut, p->Lut->Data); \
	+ output = p->ToOutput(p, wOut, output, Stride->BytesPerPlaneOut); \
	+ } \
	+ /* Advance the offsets by BytesPerLineIn / BytesPerLineOut. */ \
	+ strideIn += Stride->BytesPerLineIn; \
	+ strideOut += Stride->BytesPerLineOut; \
	+ }

	- strideIn += Stride->BytesPerLineIn;
	- strideOut += Stride->BytesPerLineOut;
	+ if (p->Lut->Eval16Fn == &_LUTeval16) {
	+ PRECALCULATED_LOOP_BODY(_LUTeval16Inline)
	+ } else {
	+ PRECALCULATED_LOOP_BODY(p->Lut->Eval16Fn)
	}

	}
	@@ -442,7 +449,7 @@ void TransformOnePixelWithGamutCheck(_cmsTRANSFORM* p,
	p ->Lut ->Eval16Fn(wIn, wOut, p -> Lut->Data);
	}

	-// Gamut check, No cach�, 16 bits.
	+// Gamut check, No cache, 16 bits.
	static
	void PrecalculatedXFORMGamutCheck(_cmsTRANSFORM* p,
	const void* in,
	@@ -481,7 +488,7 @@ void PrecalculatedXFORMGamutCheck(_cmsTRANSFORM* p,
	}


	-// No gamut check, Cach�, 16 bits,
	+// No gamut check, Cache, 16 bits,
	static
	void CachedXFORM(_cmsTRANSFORM* p,
	const void* in,
	@@ -839,7 +846,7 @@ _cmsTRANSFORM* AllocEmptyTransform(cmsContext ContextID, cmsPipeline* lut,
	p ->xform = NullFloatXFORM;
	}
	else {
	- // Float transforms don't use cach�, always are non-NULL
	+ // Float transforms don't use cache, always are non-NULL
	p ->xform = FloatXFORM;
	}

	@@ -878,16 +885,16 @@ _cmsTRANSFORM* AllocEmptyTransform(cmsContext ContextID, cmsPipeline* lut,
	if (*dwFlags & cmsFLAGS_NOCACHE) {

	if (*dwFlags & cmsFLAGS_GAMUTCHECK)
	- p ->xform = PrecalculatedXFORMGamutCheck; // Gamut check, no cach�
	+ p ->xform = PrecalculatedXFORMGamutCheck; // Gamut check, no cache
	else
	- p ->xform = PrecalculatedXFORM; // No cach�, no gamut check
	+ p ->xform = PrecalculatedXFORM; // No cache, no gamut check
	}
	else {

	if (*dwFlags & cmsFLAGS_GAMUTCHECK)
	- p ->xform = CachedXFORMGamutCheck; // Gamut check, cach�
	+ p ->xform = CachedXFORMGamutCheck; // Gamut check, cache
	else
	- p ->xform = CachedXFORM; // No gamut check, cach�
	+ p ->xform = CachedXFORM; // No gamut check, cache

	}
	}
	diff --git a/src/lcms2_internal.h b/src/lcms2_internal.h
	index 5f5270c..a0183b8 100644
	--- a/src/lcms2_internal.h
	+++ b/src/lcms2_internal.h
	@@ -875,6 +875,14 @@ struct _cmsStage_struct {
	struct _cmsStage_struct* Next;
	};

	+// Non-static and declared here so _LUTeval16Inline and PrecalculatedXFORM can compare function pointers against them
	+void _LUTeval16(register const cmsUInt16Number In[], register cmsUInt16Number Out[], register const void* D);
	+void EvaluateLab2XYZ(const cmsFloat32Number In[],
	+ cmsFloat32Number Out[],
	+ const cmsStage *mpe);
	+void EvaluateMatrix(const cmsFloat32Number In[],
	+ cmsFloat32Number Out[],
	+ const cmsStage *mpe);

	// Special Stages (cannot be saved)
	CMSCHECKPOINT cmsStage* CMSEXPORT _cmsStageAllocLab2XYZ(cmsContext ContextID);
	@@ -1106,6 +1114,145 @@ cmsBool _cmsAdaptationMatrix(cmsMAT3* r, const cmsMAT3* ConeMatrix, const cmsC

	cmsBool _cmsBuildRGB2XYZtransferMatrix(cmsMAT3* r, const cmsCIExyY* WhitePoint, const cmsCIExyYTRIPLE* Primaries);

	+// Converts n floats to 16-bit words: each In[i] is scaled by 65535.0 and passed to _cmsQuickSaturateWord.
	+static inline __attribute__((always_inline))
	+void FromFloatTo16(const cmsFloat32Number In[], cmsUInt16Number Out[], cmsUInt32Number n)
	+{
	+ cmsUInt32Number i;
	+ // static: defined in a header, so each including file gets its own internal-linkage copy
	+ for (i=0; i < n; i++) {
	+ Out[i] = _cmsQuickSaturateWord(In[i] * 65535.0);
	+ }
	+}
	+
	+// Converts n 16-bit words to floats: Out[i] = In[i] / 65535.0F.
	+static inline __attribute__((always_inline))
	+void From16ToFloat(const cmsUInt16Number In[], cmsFloat32Number Out[], cmsUInt32Number n)
	+{
	+ cmsUInt32Number i;
	+ // static: defined in a header, so each including file gets its own internal-linkage copy
	+ for (i=0; i < n; i++) {
	+ Out[i] = (cmsFloat32Number) In[i] / 65535.0F;
	+ }
	+}
	+
	+static inline __attribute__((always_inline))
	+cmsFloat64Number f_1Inline(cmsFloat64Number t)
	+{
	+ // Piecewise helper used by cmsLab2XYZInline: linear for t <= 24/116, t cubed otherwise. static: header-defined.
	+ const cmsFloat64Number Limit = (24.0/116.0);
	+ if (t <= Limit) {
	+ return (108.0/841.0) * (t - (16.0/116.0));
	+ }
	+
	+ return t * t * t;
	+}
	+
	+static inline __attribute__((always_inline))
	+void cmsLab2XYZInline(const cmsCIEXYZ* WhitePoint, cmsCIEXYZ* xyz, const cmsCIELab* Lab)
	+{
	+ // Header-local Lab -> XYZ conversion; it has internal linkage, so it carries no CMSEXPORT decoration.
	+ cmsFloat64Number x, y, z;
	+ // A NULL white point selects cmsD50_XYZ().
	+ if (WhitePoint == NULL)
	+ WhitePoint = cmsD50_XYZ();
	+
	+ y = (Lab-> L + 16.0) / 116.0;
	+ x = y + 0.002 * Lab -> a;
	+ z = y - 0.005 * Lab -> b;
	+ // Each intermediate goes through f_1Inline and is scaled by the matching white point component.
	+ xyz -> X = f_1Inline(x) * WhitePoint -> X;
	+ xyz -> Y = f_1Inline(y) * WhitePoint -> Y;
	+ xyz -> Z = f_1Inline(z) * WhitePoint -> Z;
	+}
	+
	+static inline __attribute__((always_inline))
	+void EvaluateLab2XYZInline(const cmsFloat32Number In[],
	+ cmsFloat32Number Out[],
	+ const cmsStage *mpe)
	+{
	+ cmsCIELab Lab;
	+ cmsCIEXYZ XYZ;
	+ const cmsFloat64Number XYZadj = MAX_ENCODEABLE_XYZ;
	+ // Header-local stage evaluator (static: header-defined). Reads In[0..2], writes Out[0..2] = XYZ / XYZadj.
	+ // V4 rules
	+ Lab.L = In[0] * 100.0;
	+ Lab.a = In[1] * 255.0 - 128.0;
	+ Lab.b = In[2] * 255.0 - 128.0;
	+ // NULL white point: cmsLab2XYZInline substitutes cmsD50_XYZ()
	+ cmsLab2XYZInline(NULL, &XYZ, &Lab);
	+
	+ // From XYZ, range 0..1.99997 to 0..1.0, note that 1.99997 comes from 0xffff
	+ // encoded as 1.15 fixed point, so 1 + (32767.0 / 32768.0)
	+
	+ Out[0] = (cmsFloat32Number) ((cmsFloat64Number) XYZ.X / XYZadj);
	+ Out[1] = (cmsFloat32Number) ((cmsFloat64Number) XYZ.Y / XYZadj);
	+ Out[2] = (cmsFloat32Number) ((cmsFloat64Number) XYZ.Z / XYZadj);
	+ return;
	+ // Placed after the return, so it never executes; it only mentions the otherwise unused mpe.
	+ cmsUNUSED_PARAMETER(mpe);
	+}
	+
	+static inline __attribute__((always_inline))
	+void EvaluateMatrixInline(const cmsFloat32Number In[],
	+ cmsFloat32Number Out[],
	+ const cmsStage *mpe)
	+{
	+ cmsUInt32Number i, j;
	+ _cmsStageMatrixData* Data = (_cmsStageMatrixData*) mpe ->Data;
	+ cmsFloat64Number Tmp;
	+ // Header-local matrix stage (static: header-defined): Out = Double * In (+ Offset), summed in cmsFloat64Number.
	+ // Input is already in 0..1.0 notation
	+ for (i=0; i < mpe ->OutputChannels; i++) {
	+ // Double is indexed as row i, column j, with InputChannels entries per row
	+ Tmp = 0;
	+ for (j=0; j < mpe->InputChannels; j++) {
	+ Tmp += In[j] * Data->Double[i*mpe->InputChannels + j];
	+ }
	+ // Offset may be NULL, in which case nothing is added
	+ if (Data ->Offset != NULL)
	+ Tmp += Data->Offset[i];
	+
	+ Out[i] = (cmsFloat32Number) Tmp;
	+ }
	+
	+
	+ // Output in 0..1.0 domain
	+}
	+
	+static inline __attribute__((always_inline))
	+void _LUTeval16Inline(
	+ register const cmsUInt16Number In[],
	+ register cmsUInt16Number Out[],
	+ register const void* D)
	+{
	+ cmsPipeline* lut = (cmsPipeline*) D;
	+ cmsStage *mpe;
	+ cmsFloat32Number Storage[2][MAX_STAGE_CHANNELS];
	+ int Phase = 0, NextPhase;
	+ // Two buffers: each stage reads Storage[Phase] and writes Storage[Phase ^ 1]. static: header-defined.
	+ From16ToFloat(In, &Storage[Phase][0], lut ->InputChannels);
	+
	+ for (mpe = lut->Elements;
	+ mpe != NULL;
	+ mpe = mpe->Next) {
	+ // Stages whose evaluator is EvaluateLab2XYZ / EvaluateMatrix use the header-local copies; others go through EvalPtr
	+ NextPhase = Phase ^ 1;
	+ if (mpe->EvalPtr == &EvaluateLab2XYZ) {
	+ EvaluateLab2XYZInline(&Storage[Phase][0], &Storage[NextPhase][0], mpe);
	+ } else if (mpe->EvalPtr == &EvaluateMatrix) {
	+ EvaluateMatrixInline(&Storage[Phase][0], &Storage[NextPhase][0], mpe);
	+ } else {
	+ mpe->EvalPtr(&Storage[Phase][0], &Storage[NextPhase][0], mpe);
	+ }
	+ Phase = NextPhase;
	+ }
	+
	+ // Phase indexes the buffer written by the last stage (or the converted input if there were no stages)
	+ FromFloatTo16(&Storage[Phase][0], Out, lut->OutputChannels);
	+}
	+
	+

	#define _lcms_internal_H
	#endif