Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
float->sRGB8 using SSE2 (and a table)
// float->sRGB8 conversions - two variants.
// by Fabian "ryg" Giesen
//
// I hereby place this code in the public domain.
//
// Both variants come with absolute error bounds and a reversibility and monotonicity
// guarantee (see test driver code below). They should pass D3D10 conformance testing
// (not that you can verify this, but still). They are verified against a clean reference
// implementation provided below, and the test driver checks all floats exhaustively.
//
// Variant 1 uses a smaller table (256 bytes) but a bit more code; variant 2 uses
// a 416-byte table and has simpler dataflow, so in terms of raw cycle count it should
// be faster, at the cost of a few more cache lines polluting L1. Both come in scalar
// and SSE2 variants. There's no total SSE2-isms there, so it should be reasonably easy
// to port to a different architecture. The biggest single part that would have to be
// replaced is the (admittedly weird) usage of PMADDWD (_mm_madd_epi16). The scale/bias
// computation can be done in other ways, but this one happened to map quite nicely to
// my requirements, so I used it.
//
// Generators for the tables are also included, for the curious. (Nothing up my sleeve!)
#include <stdio.h>
#include <math.h>
#include <emmintrin.h>
typedef unsigned int uint;
typedef unsigned char uint8;
union FP32
{
uint u;
float f;
struct
{
uint Mantissa : 23;
uint Exponent : 8;
uint Sign : 1;
};
};
// Returns "exact" float value. Round to nearest integer for conversion.
// Done this way so that conversion error can be estimated properly.
static float float_to_srgb8_ref(float f)
{
float s;
if (!(f > 0.0f)) // also covers NaNs
s = 0.0f;
else if (f <= 0.0031308f)
s = 12.92f * f;
else if (f < 1.0f)
s = 1.055f * pow(f, 1.0f / 2.4f) - 0.055f;
else
s = 1.0f;
return s * 255.0f;
}
static uint8 float_to_srgb8_ref_int(float f)
{
return (uint8) (float_to_srgb8_ref(f) + 0.5f); // round, then float->int
}
static float srgb8_to_float(uint8 val)
{
// There's just 256 different input valus - just use a table.
static float table[256];
static bool init;
if (!init)
{
init = true;
for (int i=0; i < 256; i++)
{
// Matches the conversion mandated by D3D10 spec
float c = (float) i * (1.0f / 255.0f);
if (c <= 0.04045f)
table[i] = c / 12.92f;
else
table[i] = pow((c + 0.055f) / 1.055f, 2.4f);
}
}
return table[val];
}
// This is the version that tries to use a small table (4 cache lines at 64 bytes/line)
// at the expense of a few extra instructions. Use "var2" below for a version with
// less instructions that uses a somewhat larger table.
//
// Float is semi-logarithmic.
// Linear x->sRGB for x >= 0.0031308 is (mostly) a power function which we
// approximate with a bunch of linear segments based on exponent and 3 highest
// bits of mantissa (2 was too inaccurate).
//
// Which exponents do we care about?
// Exponent >= 0: value was >=1, so we return 255 (in fact, we min with 1.0f-eps, so this never happens anyway).
// Exponent < -9: x < 1/512 which is well into the linear part of the sRGB mapping function.
// So the interesting exponent range is [-9,-1].
//
// To get a pow2-sized range, we cheat a bit and only store anchors for linear segments in
// the exponent range [-8,-1], using linear sRGB part of the formula until 1/256.
// This means that we treat a small part of the nonlinear range (namely, the interval
// [0.0031308,0.00390625]) as linear. Our linear scale value needs to be adjusted for this.
// This is done simply by starting from the "correct" scale value (255*12.92, 0x454de99a)
// and doing a binary search for the value that gives the best results (=lowest max error
// in this case) across the range we care about.
//
// The table itself has a bias in the top 16 bits and a scale factor for the linear function
// (based on the next 8 mantissa bits after the 3 we already used). Both are scaled to make
// good use of the available bits. The format was chosen this way so the linear function
// can be evaluated with a single PMADDWD after the mantissa bits were extracted - okay, we do
// need to insert one more set bit in the high half for the bias part to work.
// These coefficients were determined simply by doing a least-squares fit of a linear function
// f(x) = a+b*x inside each "bucket" (see table-making functions below).
//
// Max error for whole function (integer-rounded result minus "exact" value, as computed in
// floats using the official formula): 0.573277 at 0x3b7a88c6
static const uint fp32_to_srgb8_tab3[64] = {
0x0b0f01cb, 0x0bf401ae, 0x0ccb0195, 0x0d950180, 0x0e56016e, 0x0f0d015e, 0x0fbc0150, 0x10630143,
0x11070264, 0x1238023e, 0x1357021d, 0x14660201, 0x156601e9, 0x165a01d3, 0x174401c0, 0x182401af,
0x18fe0331, 0x1a9602fe, 0x1c1502d2, 0x1d7e02ad, 0x1ed4028d, 0x201a0270, 0x21520256, 0x227d0240,
0x239f0443, 0x25c003fe, 0x27bf03c4, 0x29a10392, 0x2b6a0367, 0x2d1d0341, 0x2ebe031f, 0x304d0300,
0x31d105b0, 0x34a80555, 0x37520507, 0x39d504c5, 0x3c37048b, 0x3e7c0458, 0x40a8042a, 0x42bd0401,
0x44c20798, 0x488e071e, 0x4c1c06b6, 0x4f76065d, 0x52a50610, 0x55ac05cc, 0x5892058f, 0x5b590559,
0x5e0c0a23, 0x631c0980, 0x67db08f6, 0x6c55087f, 0x70940818, 0x74a007bd, 0x787d076c, 0x7c330723,
0x06970158, 0x07420142, 0x07e30130, 0x087b0120, 0x090b0112, 0x09940106, 0x0a1700fc, 0x0a9500f2,
};
static uint8 float_to_srgb8(float in)
{
static const FP32 almostone = { 0x3f7fffff }; // 1-eps
static const FP32 lutthresh = { 0x3b800000 }; // 2^(-8)
static const FP32 linearsc = { 0x454c5d00 };
static const FP32 float2int = { (127 + 23) << 23 };
FP32 f;
// Clamp to [0, 1-eps]; these two values map to 0 and 1, respectively.
// The tests are carefully written so that NaNs map to 0, same as in the reference
// implementation.
if (!(in > 0.0f)) // written this way to catch NaNs
in = 0.0f;
if (in > almostone.f)
in = almostone.f;
// Check which region this value falls into
f.f = in;
if (f.f < lutthresh.f) // linear region
{
f.f *= linearsc.f;
f.f += float2int.f; // use "magic value" to get float->int with rounding.
return (uint8) (f.u & 255);
}
else // non-linear region
{
// Unpack bias, scale from table
uint tab = fp32_to_srgb8_tab3[(f.u >> 20) & 63];
uint bias = (tab >> 16) << 9;
uint scale = tab & 0xffff;
// Grab next-highest mantissa bits and perform linear interpolation
uint t = (f.u >> 12) & 0xff;
return (uint8) ((bias + scale*t) >> 16);
}
}
static __m128i float_to_srgb8_SSE2(__m128 f)
{
#define SSE_CONST4(name, val) static const __declspec(align(16)) uint name[4] = { (val), (val), (val), (val) }
#define CONST(name) *(const __m128i *)&name
#define CONSTF(name) *(const __m128 *)&name
SSE_CONST4(c_almostone, 0x3f7fffff);
SSE_CONST4(c_lutthresh, 0x3b800000);
SSE_CONST4(c_tabmask, 63);
SSE_CONST4(c_linearsc, 0x454c5d00);
SSE_CONST4(c_mantmask, 0xff);
SSE_CONST4(c_topscale, 0x02000000);
__m128i temp; // temp value (on stack)
// Initial clamp
__m128 zero = _mm_setzero_ps();
__m128 clamp1 = _mm_max_ps(f, zero); // limit to [0,1-eps] - also nukes NaNs
__m128 clamp2 = _mm_min_ps(clamp1, CONSTF(c_almostone));
// Table index
__m128i tabidx1 = _mm_srli_epi32(_mm_castps_si128(clamp2), 20);
__m128i tabidx2 = _mm_and_si128(tabidx1, CONST(c_tabmask));
_mm_store_si128(&temp, tabidx2);
// Table lookup
temp.m128i_u32[0] = fp32_to_srgb8_tab3[temp.m128i_u32[0]];
temp.m128i_u32[1] = fp32_to_srgb8_tab3[temp.m128i_u32[1]];
temp.m128i_u32[2] = fp32_to_srgb8_tab3[temp.m128i_u32[2]];
temp.m128i_u32[3] = fp32_to_srgb8_tab3[temp.m128i_u32[3]];
// Linear part of ramp
__m128 linear1 = _mm_mul_ps(clamp2, CONSTF(c_linearsc));
__m128i linear2 = _mm_cvtps_epi32(linear1);
// Table finisher
__m128i tabval = _mm_load_si128(&temp);
__m128i tabmult1 = _mm_srli_epi32(_mm_castps_si128(clamp2), 12);
__m128i tabmult2 = _mm_and_si128(tabmult1, CONST(c_mantmask));
__m128i tabmult3 = _mm_or_si128(tabmult2, CONST(c_topscale));
__m128i tabprod = _mm_madd_epi16(tabval, tabmult3);
__m128i tabshifted = _mm_srli_epi32(tabprod, 16);
// Combine linear+table
__m128 b_uselin = _mm_cmplt_ps(clamp2, CONSTF(c_lutthresh)); // use linear results
__m128i merge1 = _mm_and_si128(linear2, _mm_castps_si128(b_uselin));
__m128i merge2 = _mm_andnot_si128(_mm_castps_si128(b_uselin), tabshifted);
__m128i result = _mm_or_si128(merge1, merge2);
return result;
#undef SSE_CONST4
#undef CONST
#undef CONSTF
}
// This version uses a larger table than the code above (104 entries at 4 bytes each,
// or 6.5 cache lines at 64b/line) but is conceptually simpler and needs less instructions.
//
// The basic ideas are still the same, only this time, we squeeze everything into the
// table, even the linear part of the range; since we are approximating the function as
// piecewise linear anyway, this is fairly easy.
//
// In the exact version of the conversion, any value that produces an output float less
// than 0.5 will be rounded to an integer of zero. Inverting the linear part of the transform,
// we get:
//
// log2(0.5 / (255 * 12.92)) =~ -12.686
//
// which in turn means that any value smaller than about 2^(-12.687) will return 0.
// What this means is that we can adapt the clamping code to just clamp to
// [2^(-13), 1-eps] and we're covered. This means our table needs to cover a range of
// 13 different exponents from -13 to -1.
//
// The table lookup, storage and interpolation works exactly the same way as in the code
// above.
//
// Max error for the whole function (integer-rounded result minus "exact" value, as computed in
// floats using the official formula): 0.544403 at 0x3e9f8000
static const uint fp32_to_srgb8_tab4[104] = {
0x0073000d, 0x007a000d, 0x0080000d, 0x0087000d, 0x008d000d, 0x0094000d, 0x009a000d, 0x00a1000d,
0x00a7001a, 0x00b4001a, 0x00c1001a, 0x00ce001a, 0x00da001a, 0x00e7001a, 0x00f4001a, 0x0101001a,
0x010e0033, 0x01280033, 0x01410033, 0x015b0033, 0x01750033, 0x018f0033, 0x01a80033, 0x01c20033,
0x01dc0067, 0x020f0067, 0x02430067, 0x02760067, 0x02aa0067, 0x02dd0067, 0x03110067, 0x03440067,
0x037800ce, 0x03df00ce, 0x044600ce, 0x04ad00ce, 0x051400ce, 0x057b00c5, 0x05dd00bc, 0x063b00b5,
0x06970158, 0x07420142, 0x07e30130, 0x087b0120, 0x090b0112, 0x09940106, 0x0a1700fc, 0x0a9500f2,
0x0b0f01cb, 0x0bf401ae, 0x0ccb0195, 0x0d950180, 0x0e56016e, 0x0f0d015e, 0x0fbc0150, 0x10630143,
0x11070264, 0x1238023e, 0x1357021d, 0x14660201, 0x156601e9, 0x165a01d3, 0x174401c0, 0x182401af,
0x18fe0331, 0x1a9602fe, 0x1c1502d2, 0x1d7e02ad, 0x1ed4028d, 0x201a0270, 0x21520256, 0x227d0240,
0x239f0443, 0x25c003fe, 0x27bf03c4, 0x29a10392, 0x2b6a0367, 0x2d1d0341, 0x2ebe031f, 0x304d0300,
0x31d105b0, 0x34a80555, 0x37520507, 0x39d504c5, 0x3c37048b, 0x3e7c0458, 0x40a8042a, 0x42bd0401,
0x44c20798, 0x488e071e, 0x4c1c06b6, 0x4f76065d, 0x52a50610, 0x55ac05cc, 0x5892058f, 0x5b590559,
0x5e0c0a23, 0x631c0980, 0x67db08f6, 0x6c55087f, 0x70940818, 0x74a007bd, 0x787d076c, 0x7c330723,
};
static uint8 float_to_srgb8_var2(float in)
{
static const FP32 almostone = { 0x3f7fffff }; // 1-eps
static const FP32 minval = { (127-13) << 23 };
FP32 f;
// Clamp to [2^(-13), 1-eps]; these two values map to 0 and 1, respectively.
// The tests are carefully written so that NaNs map to 0, same as in the reference
// implementation.
if (!(in > minval.f)) // written this way to catch NaNs
in = minval.f;
if (in > almostone.f)
in = almostone.f;
// Do the table lookup and unpack bias, scale
f.f = in;
uint tab = fp32_to_srgb8_tab4[(f.u - minval.u) >> 20];
uint bias = (tab >> 16) << 9;
uint scale = tab & 0xffff;
// Grab next-highest mantissa bits and perform linear interpolation
uint t = (f.u >> 12) & 0xff;
return (uint8) ((bias + scale*t) >> 16);
}
static __m128i float_to_srgb8_var2_SSE2(__m128 f)
{
#define SSE_CONST4(name, val) static const __declspec(align(16)) uint name[4] = { (val), (val), (val), (val) }
#define CONST(name) *(const __m128i *)&name
#define CONSTF(name) *(const __m128 *)&name
SSE_CONST4(c_clampmin, (127 - 13) << 23);
SSE_CONST4(c_almostone, 0x3f7fffff);
SSE_CONST4(c_lutthresh, 0x3b800000);
SSE_CONST4(c_mantmask, 0xff);
SSE_CONST4(c_topscale, 0x02000000);
__m128i temp; // temp value (on stack)
// Initial clamp
__m128 clamp1 = _mm_max_ps(f, CONSTF(c_clampmin)); // limit to [clampmin,1-eps] - also nuke NaNs
__m128 clamp2 = _mm_min_ps(clamp1, CONSTF(c_almostone));
// Table index
__m128i tabidx = _mm_srli_epi32(_mm_castps_si128(clamp2), 20);
_mm_store_si128(&temp, tabidx);
// Table lookup
temp.m128i_u32[0] = fp32_to_srgb8_tab4[temp.m128i_i32[0] - (127-13)*8];
temp.m128i_u32[1] = fp32_to_srgb8_tab4[temp.m128i_i32[1] - (127-13)*8];
temp.m128i_u32[2] = fp32_to_srgb8_tab4[temp.m128i_i32[2] - (127-13)*8];
temp.m128i_u32[3] = fp32_to_srgb8_tab4[temp.m128i_i32[3] - (127-13)*8];
// Finisher
__m128i tabval = _mm_load_si128(&temp);
__m128i tabmult1 = _mm_srli_epi32(_mm_castps_si128(clamp2), 12);
__m128i tabmult2 = _mm_and_si128(tabmult1, CONST(c_mantmask));
__m128i tabmult3 = _mm_or_si128(tabmult2, CONST(c_topscale));
__m128i tabprod = _mm_madd_epi16(tabval, tabmult3);
__m128i result = _mm_srli_epi32(tabprod, 16);
return result;
#undef SSE_CONST4
#undef CONST
#undef CONSTF
}
// ----
//
// Table generation functions. These are not required to run the code; they're just
// here to show how the tables were computed.
//#define GENTABLES
#ifdef GENTABLES
static void print_table(const char *filename, const char *varname, const uint *table, int nelem)
{
FILE *file = fopen(filename, "w");
fprintf(file, "static const uint %s[%d] = {\n", varname, nelem);
for (int i=0; i < nelem; i++)
{
if ((i & 7) == 0)
fprintf(file, " ");
fprintf(file, " 0x%08x,", table[i]);
if ((i & 7) == 7)
fprintf(file, "\n");
}
if ((nelem & 7) != 0)
fprintf(file, "\n");
fprintf(file, "};\n");
fclose(file);
}
// Table-generation function for variant 1 above.
static void make_tab3()
{
static const int nbuckets = 64;
static const int bucketsize = 1 << 20;
static const int mantshift = 12;
FP32 f;
uint table[nbuckets];
double sum_aa = bucketsize;
double sum_ab = 0.0, sum_bb = 0.0;
for (int i=0; i < bucketsize; i++)
{
int j = i >> mantshift;
sum_ab += j;
sum_bb += j*j;
}
double inv_det = 1.0 / (sum_aa * sum_bb - sum_ab * sum_ab);
for (int bucket=0; bucket < nbuckets; bucket++)
{
int start = 0x3b800000 + bucket*bucketsize;
double sum_a = 0.0;
double sum_b = 0.0;
// model: a+b*i
for (int i=0; i<bucketsize; i++)
{
int j = i >> mantshift;
f.u = start + i;
float val = float_to_srgb8_ref(f.f) + 0.5f;
sum_a += val;
sum_b += j*val;
}
// solve
double solved_a = inv_det * (sum_bb*sum_a - sum_ab*sum_b);
double solved_b = inv_det * (sum_aa*sum_b - sum_ab*sum_a);
double scaled_a = solved_a * 65536.0 / 512.0;
double scaled_b = solved_b * 65536.0;
int int_a = (int) (scaled_a + 0.5f);
int int_b = (int) (scaled_b + 0.5f);
table[(start / bucketsize) & (nbuckets - 1)] = (int_a << 16) + int_b;
printf("%d\n", bucket);
}
print_table("tab.txt", "fp32_to_srgb8_tab3", table, nbuckets);
}
// Table-generation function for variant 2 above.
static void make_tab4()
{
static const int numexp = 13;
static const int mantissa_msb = 3;
static const int nbuckets = numexp << mantissa_msb;
static const int bucketsize = 1 << (23 - mantissa_msb);
static const int mantshift = 12;
FP32 f;
uint table[nbuckets];
double sum_aa = bucketsize;
double sum_ab = 0.0, sum_bb = 0.0;
for (int i=0; i < bucketsize; i++)
{
int j = i >> mantshift;
sum_ab += j;
sum_bb += j*j;
}
double inv_det = 1.0 / (sum_aa * sum_bb - sum_ab * sum_ab);
for (int bucket=0; bucket < nbuckets; bucket++)
{
int start = ((127 - numexp) << 23) + bucket*bucketsize;
double sum_a = 0.0;
double sum_b = 0.0;
// model: a+b*i
for (int i=0; i<bucketsize; i++)
{
int j = i >> mantshift;
f.u = start + i;
float val = float_to_srgb8_ref(f.f) + 0.5f;
sum_a += val;
sum_b += j*val;
}
// solve
double solved_a = inv_det * (sum_bb*sum_a - sum_ab*sum_b);
double solved_b = inv_det * (sum_aa*sum_b - sum_ab*sum_a);
double scaled_a = solved_a * 65536.0 / 512.0;
double scaled_b = solved_b * 65536.0;
int int_a = (int) (scaled_a + 0.5f);
int int_b = (int) (scaled_b + 0.5f);
table[bucket] = (int_a << 16) + int_b;
printf("%d\n", bucket);
}
print_table("tab4.txt", "fp32_to_srgb8_tab4", table, nbuckets);
}
#endif
int main()
{
#ifdef GENTABLES
// generate the tables
make_tab3();
make_tab4();
#endif
// First, verify that conversion round-trip works. This is an
// obvious and important constraint.
for (int i=0; i < 256; i++)
{
float f = srgb8_to_float(i);
int ref = float_to_srgb8_ref_int(f);
int var1 = float_to_srgb8(f);
int var2 = float_to_srgb8_var2(f);
if (ref != i || var1 != i || var2 != i)
{
printf("invertability broken! i=%d ref=%d var1=%d var2=%d\n", i, ref, var1, var2);
return 1;
}
}
// Loop over whole 32-bit range, checking whether error is within allowed bounds.
// At the same time, we also test whether float->sRGB8 conversion is monotonic.
// To make the latter easy, we traverse the range starting from the first "positive"
// NaN (which maps to 0), then going over all negative values, finally looping back
// to 0 and positive values. That way, we see the whole range in increasing order of
// return values.
static const int nvar = 2; // number of variants we're testing
static const float max_abs_err = 0.6f;
float maxerr[nvar] = { 0 };
uint maxerrat[nvar] = { 0 };
int prev[nvar] = { 0 };
uint start = (255 << 23) + 1; // first NaN
uint u = start;
printf("Scalar\n");
do
{
FP32 f;
int res[nvar];
f.u = u;
float ref_val = float_to_srgb8_ref(f.f);
res[0] = float_to_srgb8(f.f);
res[1] = float_to_srgb8_var2(f.f);
for (int i=0; i < nvar; i++)
{
float err = fabs(res[i] - ref_val);
if (err >= max_abs_err)
{
printf("err=%f at u=%08x for variant %d, must be less than %f!\n", err, u, i + 1, max_abs_err);
return 1;
}
if (err >= maxerr[i])
{
maxerr[i] = err;
maxerrat[i] = u;
}
if (res[i] < prev[i])
{
printf("monotonicity violated at u=%08x for variant %d! result=%d prev=%d\n", u, i + 1, res[i], prev[i]);
return 1;
}
prev[i] = res[i];
}
u++;
if ((u & 0xffffff) == 1)
printf(" %02x\n", u >> 24);
} while (u != start);
printf("SIMD\n");
do
{
__m128 ssein;
__m128i sseout[2];
__m128i ref[2];
for (uint j=0; j < 4; j++)
{
ssein.m128_u32[j] = u + j;
FP32 f;
f.u = u + j;
ref[0].m128i_u32[j] = float_to_srgb8(f.f);
ref[1].m128i_u32[j] = float_to_srgb8_var2(f.f);
}
sseout[0] = float_to_srgb8_SSE2(ssein);
sseout[1] = float_to_srgb8_var2_SSE2(ssein);
for (int i=0; i < nvar; i++)
{
for (uint j=0; j < 4; j++)
{
uint simd = sseout[i].m128i_u32[j];
uint scalar = ref[i].m128i_u32[j];
if (simd != scalar)
{
printf("SIMD/scalar mismatch at u=%08x for variant %d: scalar=%d, SIMD=%d\n", u + j, i + 1, scalar, simd);
return 1;
}
}
}
u += 4;
if ((u & 0xffffff) == 1)
printf(" %02x\n", u >> 24);
} while (u != start);
printf("\nAll done!\n\n");
for (int i=0; i < nvar; i++)
printf("variant %d: max error %f at 0x%08x\n", i+1, maxerr[i], maxerrat[i]);
return 0;
}
@Photosounder
Copy link

Photosounder commented May 23, 2019

I have another pretty fast but high precision way of converting from linear float to sRGB. after clamping the input I add an offset of 0.0031308f to the input value, create a loopup table index based on subtracting the binary forms of the input number+the offset minus the binary form of the offset, and at that index in a lookup table I have polynomial coefficients which give me the result by computing (a*x + b)*x + c. See https://github.com/Photosounder/rouziclib/blob/master/rouziclib/fastfloat/fastfloat.c#L390

Each entry of the lookup table gives 3 coefficients which make up a quadratic polynomial (like a parabolic curve) which fits a small segment of the function to approximate, hence why I get very high precision with only a 396 byte table.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment