Skip to content

Instantly share code, notes, and snippets.

@gatoatigrado
Created December 24, 2017 18:49
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save gatoatigrado/5ed4f608f06a5e68586ad3e850478b10 to your computer and use it in GitHub Desktop.
Save gatoatigrado/5ed4f608f06a5e68586ad3e850478b10 to your computer and use it in GitHub Desktop.
LCMS manual function inlining
diff --git a/src/cmslut.c b/src/cmslut.c
index 3c0c245..6c7edfd 100644
--- a/src/cmslut.c
+++ b/src/cmslut.c
@@ -24,6 +24,9 @@
//---------------------------------------------------------------------------------
//
+#include <stdio.h>
+#include <stdint-gcc.h>
+
#include "lcms2_internal.h"
@@ -78,28 +81,6 @@ cmsStage* CMSEXPORT cmsStageAllocIdentity(cmsContext ContextID, cmsUInt32Number
NULL);
}
-// Conversion functions. From floating point to 16 bits
-static
-void FromFloatTo16(const cmsFloat32Number In[], cmsUInt16Number Out[], cmsUInt32Number n)
-{
- cmsUInt32Number i;
-
- for (i=0; i < n; i++) {
- Out[i] = _cmsQuickSaturateWord(In[i] * 65535.0);
- }
-}
-
-// From 16 bits to floating point
-static
-void From16ToFloat(const cmsUInt16Number In[], cmsFloat32Number Out[], cmsUInt32Number n)
-{
- cmsUInt32Number i;
-
- for (i=0; i < n; i++) {
- Out[i] = (cmsFloat32Number) In[i] / 65535.0F;
- }
-}
-
// This function is quite useful to analyze the structure of a LUT and retrieve the MPE elements
// that conform the LUT. It should be called with the LUT, the number of expected elements and
@@ -308,7 +289,6 @@ cmsStage* CMSEXPORT _cmsStageAllocIdentityCurves(cmsContext ContextID, cmsUInt32
// Special care should be taken here because precision loss. A temporary cmsFloat64Number buffer is being used
-static
void EvaluateMatrix(const cmsFloat32Number In[],
cmsFloat32Number Out[],
const cmsStage *mpe)
@@ -940,7 +920,6 @@ cmsInt32Number CMSEXPORT cmsSliceSpaceFloat(cmsUInt32Number nInputs, const cmsUI
// ********************************************************************************
-static
void EvaluateLab2XYZ(const cmsFloat32Number In[],
cmsFloat32Number Out[],
const cmsStage *mpe)
@@ -1316,7 +1295,6 @@ cmsBool BlessLUT(cmsPipeline* lut)
// Default to evaluate the LUT on 16 bit-basis. Precision is retained.
-static
void _LUTeval16(register const cmsUInt16Number In[], register cmsUInt16Number Out[], register const void* D)
{
cmsPipeline* lut = (cmsPipeline*) D;
@@ -1326,12 +1304,14 @@ void _LUTeval16(register const cmsUInt16Number In[], register cmsUInt16Number Ou
From16ToFloat(In, &Storage[Phase][0], lut ->InputChannels);
- for (mpe = lut ->Elements;
+ for (mpe = lut->Elements;
mpe != NULL;
- mpe = mpe ->Next) {
+ mpe = mpe->Next) {
NextPhase = Phase ^ 1;
- mpe ->EvalPtr(&Storage[Phase][0], &Storage[NextPhase][0], mpe);
+ uintptr_t ptr = (uintptr_t)(mpe->EvalPtr);
+ printf("Function: %lx\n", ptr);
+ mpe->EvalPtr(&Storage[Phase][0], &Storage[NextPhase][0], mpe);
Phase = NextPhase;
}
diff --git a/src/cmsxform.c b/src/cmsxform.c
index 6b2950e..d199365 100644
--- a/src/cmsxform.c
+++ b/src/cmsxform.c
@@ -341,7 +341,7 @@ void NullFloatXFORM(_cmsTRANSFORM* p,
// 16 bit precision -----------------------------------------------------------------------------------------------------------
-// Null transformation, only applies formatters. No caché
+// Null transformation, only applies formatters. No caché
static
void NullXFORM(_cmsTRANSFORM* p,
const void* in,
@@ -400,20 +400,27 @@ void PrecalculatedXFORM(_cmsTRANSFORM* p,
memset(wIn, 0, sizeof(wIn));
memset(wOut, 0, sizeof(wOut));
- for (i = 0; i < LineCount; i++) {
-
- accum = (cmsUInt8Number*)in + strideIn;
- output = (cmsUInt8Number*)out + strideOut;
-
- for (j = 0; j < PixelsPerLine; j++) {
-
- accum = p->FromInput(p, wIn, accum, Stride->BytesPerPlaneIn);
- p->Lut->Eval16Fn(wIn, wOut, p->Lut->Data);
- output = p->ToOutput(p, wOut, output, Stride->BytesPerPlaneOut);
- }
+#define PRECALCULATED_LOOP_BODY(fcn) /* expands to the whole line/pixel loop using the enclosing function's locals; fcn is called once per pixel as fcn(wIn, wOut, p->Lut->Data) */ \
+ for (i = 0; i < LineCount; i++) { \
+ \
+ accum = (cmsUInt8Number*)in + strideIn; \
+ output = (cmsUInt8Number*)out + strideOut; \
+ \
+ for (j = 0; j < PixelsPerLine; j++) { \
+ \
+ accum = p->FromInput(p, wIn, accum, Stride->BytesPerPlaneIn); \
+ fcn(wIn, wOut, p->Lut->Data); \
+ output = p->ToOutput(p, wOut, output, Stride->BytesPerPlaneOut); \
+ } \
+ \
+ strideIn += Stride->BytesPerLineIn; \
+ strideOut += Stride->BytesPerLineOut; \
+ }
- strideIn += Stride->BytesPerLineIn;
- strideOut += Stride->BytesPerLineOut;
+ if (p->Lut->Eval16Fn == &_LUTeval16) {
+ PRECALCULATED_LOOP_BODY(_LUTeval16Inline)
+ } else {
+ PRECALCULATED_LOOP_BODY(p->Lut->Eval16Fn)
}
}
@@ -442,7 +449,7 @@ void TransformOnePixelWithGamutCheck(_cmsTRANSFORM* p,
p ->Lut ->Eval16Fn(wIn, wOut, p -> Lut->Data);
}
-// Gamut check, No caché, 16 bits.
+// Gamut check, No caché, 16 bits.
static
void PrecalculatedXFORMGamutCheck(_cmsTRANSFORM* p,
const void* in,
@@ -481,7 +488,7 @@ void PrecalculatedXFORMGamutCheck(_cmsTRANSFORM* p,
}
-// No gamut check, Caché, 16 bits,
+// No gamut check, Caché, 16 bits,
static
void CachedXFORM(_cmsTRANSFORM* p,
const void* in,
@@ -839,7 +846,7 @@ _cmsTRANSFORM* AllocEmptyTransform(cmsContext ContextID, cmsPipeline* lut,
p ->xform = NullFloatXFORM;
}
else {
- // Float transforms don't use caché, always are non-NULL
+ // Float transforms don't use caché, always are non-NULL
p ->xform = FloatXFORM;
}
@@ -878,16 +885,16 @@ _cmsTRANSFORM* AllocEmptyTransform(cmsContext ContextID, cmsPipeline* lut,
if (*dwFlags & cmsFLAGS_NOCACHE) {
if (*dwFlags & cmsFLAGS_GAMUTCHECK)
- p ->xform = PrecalculatedXFORMGamutCheck; // Gamut check, no caché
+ p ->xform = PrecalculatedXFORMGamutCheck; // Gamut check, no caché
else
- p ->xform = PrecalculatedXFORM; // No caché, no gamut check
+ p ->xform = PrecalculatedXFORM; // No caché, no gamut check
}
else {
if (*dwFlags & cmsFLAGS_GAMUTCHECK)
- p ->xform = CachedXFORMGamutCheck; // Gamut check, caché
+ p ->xform = CachedXFORMGamutCheck; // Gamut check, caché
else
- p ->xform = CachedXFORM; // No gamut check, caché
+ p ->xform = CachedXFORM; // No gamut check, caché
}
}
diff --git a/src/lcms2_internal.h b/src/lcms2_internal.h
index 5f5270c..a0183b8 100644
--- a/src/lcms2_internal.h
+++ b/src/lcms2_internal.h
@@ -875,6 +875,14 @@ struct _cmsStage_struct {
struct _cmsStage_struct* Next;
};
+/* For templating: evaluators defined in cmslut.c, declared here (no longer
+ * static) so that _LUTeval16Inline in this header and PrecalculatedXFORM in
+ * cmsxform.c can compare function pointers against their addresses. */
+void _LUTeval16(register const cmsUInt16Number In[], register cmsUInt16Number Out[], register const void* D);
+void EvaluateLab2XYZ(const cmsFloat32Number In[],
+ cmsFloat32Number Out[],
+ const cmsStage *mpe);
+void EvaluateMatrix(const cmsFloat32Number In[],
+ cmsFloat32Number Out[],
+ const cmsStage *mpe);
// Special Stages (cannot be saved)
CMSCHECKPOINT cmsStage* CMSEXPORT _cmsStageAllocLab2XYZ(cmsContext ContextID);
@@ -1106,6 +1114,145 @@ cmsBool _cmsAdaptationMatrix(cmsMAT3* r, const cmsMAT3* ConeMatrix, const cmsC
cmsBool _cmsBuildRGB2XYZtransferMatrix(cmsMAT3* r, const cmsCIExyY* WhitePoint, const cmsCIExyYTRIPLE* Primaries);
+// Conversion functions. From floating point to 16 bits.
+//
+// Multiplies each of the first n entries of In[] by 65535.0 and stores the
+// result of _cmsQuickSaturateWord() in the matching entry of Out[].
+//
+//   In  - source values, n entries are read
+//   Out - destination, n entries are written
+//   n   - number of entries to convert
+//
+// Declared "static inline" because this definition lives in a header: a plain
+// "inline" definition provides no out-of-line copy under C99 rules and emits
+// an external one in every including file under gnu89 rules, so either way
+// linking can break.
+static inline __attribute__((always_inline))
+void FromFloatTo16(const cmsFloat32Number In[], cmsUInt16Number Out[], cmsUInt32Number n)
+{
+    cmsUInt32Number i;
+
+    for (i = 0; i < n; i++) {
+        Out[i] = _cmsQuickSaturateWord(In[i] * 65535.0);
+    }
+}
+
+// From 16 bits to floating point.
+//
+// Stores In[i] / 65535.0F in Out[i] for each of the first n entries.
+//
+//   In  - source values, n entries are read
+//   Out - destination, n entries are written
+//   n   - number of entries to convert
+//
+// Declared "static inline" because this definition lives in a header: a plain
+// "inline" definition provides no out-of-line copy under C99 rules and emits
+// an external one in every including file under gnu89 rules.
+static inline __attribute__((always_inline))
+void From16ToFloat(const cmsUInt16Number In[], cmsFloat32Number Out[], cmsUInt32Number n)
+{
+    cmsUInt32Number i;
+
+    for (i = 0; i < n; i++) {
+        Out[i] = (cmsFloat32Number) In[i] / 65535.0F;
+    }
+}
+
+// Piecewise helper used by cmsLab2XYZInline.
+//
+// Returns (108/841) * (t - 16/116) when t is at or below 24/116, and t cubed
+// otherwise.
+//
+// Declared "static inline" because this definition lives in a header: a plain
+// "inline" definition provides no out-of-line copy under C99 rules and emits
+// an external one in every including file under gnu89 rules.
+static inline __attribute__((always_inline))
+cmsFloat64Number f_1Inline(cmsFloat64Number t)
+{
+    const cmsFloat64Number Limit = (24.0/116.0);
+
+    if (t <= Limit) {
+        return (108.0/841.0) * (t - (16.0/116.0));
+    }
+
+    return t * t * t;
+}
+
+// Converts *Lab into *xyz relative to *WhitePoint.
+//
+//   WhitePoint - reference white; when NULL, cmsD50_XYZ() is used instead
+//   xyz        - receives the result
+//   Lab        - value to convert
+//
+// Each output component is f_1Inline() of an intermediate term multiplied by
+// the matching white point component.
+//
+// Declared "static inline" because this definition lives in a header: a plain
+// "inline" definition provides no out-of-line copy under C99 rules and emits
+// an external one in every including file under gnu89 rules.
+//
+// NOTE(review): CMSEXPORT is the marker used on API entry points elsewhere in
+// this patch; it looks unnecessary on a header-private helper - confirm and
+// drop.
+static inline __attribute__((always_inline))
+void CMSEXPORT cmsLab2XYZInline(const cmsCIEXYZ* WhitePoint, cmsCIEXYZ* xyz, const cmsCIELab* Lab)
+{
+    cmsFloat64Number x, y, z;
+
+    if (WhitePoint == NULL) {
+        WhitePoint = cmsD50_XYZ();
+    }
+
+    y = (Lab->L + 16.0) / 116.0;
+    x = y + 0.002 * Lab->a;
+    z = y - 0.005 * Lab->b;
+
+    xyz->X = f_1Inline(x) * WhitePoint->X;
+    xyz->Y = f_1Inline(y) * WhitePoint->Y;
+    xyz->Z = f_1Inline(z) * WhitePoint->Z;
+}
+
+// Inline stage evaluator: reads three values from In[], rescales them to a
+// cmsCIELab, converts with cmsLab2XYZInline() (NULL white point) and writes
+// the three XYZ components divided by MAX_ENCODEABLE_XYZ to Out[].
+//
+//   In  - three input values are read
+//   Out - three output values are written
+//   mpe - unused; present so the signature matches the other stage evaluators
+//
+// Declared "static inline" because this definition lives in a header: a plain
+// "inline" definition provides no out-of-line copy under C99 rules and emits
+// an external one in every including file under gnu89 rules.
+static inline __attribute__((always_inline))
+void EvaluateLab2XYZInline(const cmsFloat32Number In[],
+                           cmsFloat32Number Out[],
+                           const cmsStage *mpe)
+{
+    cmsCIELab Lab;
+    cmsCIEXYZ XYZ;
+    const cmsFloat64Number XYZadj = MAX_ENCODEABLE_XYZ;
+
+    // Previously placed after a bare "return;", where it could never execute.
+    cmsUNUSED_PARAMETER(mpe);
+
+    // V4 rules
+    Lab.L = In[0] * 100.0;
+    Lab.a = In[1] * 255.0 - 128.0;
+    Lab.b = In[2] * 255.0 - 128.0;
+
+    cmsLab2XYZInline(NULL, &XYZ, &Lab);
+
+    // From XYZ, range 0..1.99997 to 0..1.0, note that 1.99997 comes from 0xffff
+    // encoded as 1.15 fixed point, so 1 + (32767.0 / 32768.0)
+
+    Out[0] = (cmsFloat32Number) ((cmsFloat64Number) XYZ.X / XYZadj);
+    Out[1] = (cmsFloat32Number) ((cmsFloat64Number) XYZ.Y / XYZadj);
+    Out[2] = (cmsFloat32Number) ((cmsFloat64Number) XYZ.Z / XYZadj);
+}
+
+// Inline stage evaluator for a matrix stage.
+//
+// For every output channel i:
+//   Out[i] = sum over j of In[j] * Data->Double[i * InputChannels + j]
+//            (+ Data->Offset[i] when Data->Offset is not NULL)
+// The sum is accumulated in a cmsFloat64Number and narrowed to
+// cmsFloat32Number only when stored.
+//
+//   In  - mpe->InputChannels values are read (already in 0..1.0 notation)
+//   Out - mpe->OutputChannels values are written (0..1.0 domain)
+//   mpe - stage whose Data member points to a _cmsStageMatrixData
+//
+// Declared "static inline" because this definition lives in a header: a plain
+// "inline" definition provides no out-of-line copy under C99 rules and emits
+// an external one in every including file under gnu89 rules.
+static inline __attribute__((always_inline))
+void EvaluateMatrixInline(const cmsFloat32Number In[],
+                          cmsFloat32Number Out[],
+                          const cmsStage *mpe)
+{
+    const _cmsStageMatrixData* Data = (const _cmsStageMatrixData*) mpe->Data;
+    const cmsUInt32Number nIn  = mpe->InputChannels;
+    const cmsUInt32Number nOut = mpe->OutputChannels;
+    cmsUInt32Number i, j;
+    cmsFloat64Number Tmp;
+
+    // Input is already in 0..1.0 notation
+    for (i = 0; i < nOut; i++) {
+
+        Tmp = 0;
+        for (j = 0; j < nIn; j++) {
+            Tmp += In[j] * Data->Double[i * nIn + j];
+        }
+
+        if (Data->Offset != NULL) {
+            Tmp += Data->Offset[i];
+        }
+
+        Out[i] = (cmsFloat32Number) Tmp;
+    }
+
+    // Output in 0..1.0 domain
+}
+
+// Inlinable counterpart of _LUTeval16: runs the pipeline passed in D on one
+// 16-bit sample.
+//
+//   In  - input sample, lut->InputChannels values are read
+//   Out - output sample, lut->OutputChannels values are written
+//   D   - the cmsPipeline to evaluate, passed as an untyped pointer
+//
+// The input is converted to float, pushed through every stage of
+// lut->Elements, and converted back to 16 bits. Two scratch rows of Storage
+// alternate as source and destination (Phase / NextPhase). Stages whose
+// EvalPtr is EvaluateLab2XYZ or EvaluateMatrix are dispatched to the inline
+// copies in this header; every other stage is called through its pointer.
+//
+// Declared "static inline" because this definition lives in a header: a plain
+// "inline" definition provides no out-of-line copy under C99 rules and emits
+// an external one in every including file under gnu89 rules.
+static inline __attribute__((always_inline))
+void _LUTeval16Inline(
+    register const cmsUInt16Number In[],
+    register cmsUInt16Number Out[],
+    register const void* D)
+{
+    // D is const void*, so keep the const instead of casting it away.
+    const cmsPipeline* lut = (const cmsPipeline*) D;
+    const cmsStage* mpe;
+    cmsFloat32Number Storage[2][MAX_STAGE_CHANNELS];
+    int Phase = 0, NextPhase;
+
+    From16ToFloat(In, &Storage[Phase][0], lut->InputChannels);
+
+    for (mpe = lut->Elements;
+         mpe != NULL;
+         mpe = mpe->Next) {
+
+        NextPhase = Phase ^ 1;
+        if (mpe->EvalPtr == &EvaluateLab2XYZ) {
+            EvaluateLab2XYZInline(&Storage[Phase][0], &Storage[NextPhase][0], mpe);
+        } else if (mpe->EvalPtr == &EvaluateMatrix) {
+            EvaluateMatrixInline(&Storage[Phase][0], &Storage[NextPhase][0], mpe);
+        } else {
+            mpe->EvalPtr(&Storage[Phase][0], &Storage[NextPhase][0], mpe);
+        }
+        Phase = NextPhase;
+    }
+
+    FromFloatTo16(&Storage[Phase][0], Out, lut->OutputChannels);
+}
+
+
#define _lcms_internal_H
#endif
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment