rygorous/gist:2246678

## gistfile1.txt
// float->sRGB8 conversions - two variants.
// by Fabian "ryg" Giesen
//
// I hereby place this code in the public domain.
//
// Both variants come with absolute error bounds and a reversibility and monotonicity
// guarantee. They should pass D3D10 conformance testing.
//
// This is an ISPC port of https://gist.github.com/2203834 - see there for a test
// driver and code that computes the tables.

static inline int select(int a, int b, int mask)
{
    return mask ? a : b;
}

// This is the version that tries to use a small table (4 cache lines at 64 bytes/line)
// at the expense of a few extra instructions. Use "var2" below for a version with
// less instructions that uses a somewhat larger table.
//
// Float is semi-logarithmic.
// Linear x->sRGB for x >= 0.0031308 is (mostly) a power function which we
// approximate with a bunch of linear segments based on exponent and 3 highest
// bits of mantissa (2 was too inaccurate).
//
// Which exponents do we care about?
// Exponent >= 0: value was >=1, so we return 255 (in fact, we min with 1.0f-eps, so this never happens anyway).
// Exponent < -9: x < 1/512 which is well into the linear part of the sRGB mapping function.
// So the interesting exponent range is [-9,-1].
//
// To get a pow2-sized range, we cheat a bit and only store anchors for linear segments in
// the exponent range [-8,-1], using linear sRGB part of the formula until 1/256.
// This means that we treat a small part of the nonlinear range (namely, the interval
// [0.0031308,0.00390625]) as linear. Our linear scale value needs to be adjusted for this.
// This is done simply by starting from the "correct" scale value (255*12.92, 0x454de99a)
// and doing a binary search for the value that gives the best results (=lowest max error
// in this case) across the range we care about.
//
// The table itself has a bias in the top 16 bits and a scale factor for the linear function
// (based on the next 8 mantissa bits after the 3 we already used). Both are scaled to make
// good use of the available bits. The format was chosen this way so the linear function
// can be evaluated with a single PMADDWD after the mantissa bits were extracted - okay, we do
// need to insert one more set bit in the high half for the bias part to work.
// These coefficients were determined simply by doing a least-squares fit of a linear function
// f(x) = a+b*x inside each "bucket" (see table-making functions below).
//
// Max error for whole function (integer-rounded result minus "exact" value, as computed in
// floats using the official formula): 0.573277 at 0x3b7a88c6
int float_to_srgb8_var1(float in)
{
    static const uniform unsigned int table[64] = {
        0x0b0f01cb, 0x0bf401ae, 0x0ccb0195, 0x0d950180, 0x0e56016e, 0x0f0d015e, 0x0fbc0150, 0x10630143,
        0x11070264, 0x1238023e, 0x1357021d, 0x14660201, 0x156601e9, 0x165a01d3, 0x174401c0, 0x182401af,
        0x18fe0331, 0x1a9602fe, 0x1c1502d2, 0x1d7e02ad, 0x1ed4028d, 0x201a0270, 0x21520256, 0x227d0240,
        0x239f0443, 0x25c003fe, 0x27bf03c4, 0x29a10392, 0x2b6a0367, 0x2d1d0341, 0x2ebe031f, 0x304d0300,
        0x31d105b0, 0x34a80555, 0x37520507, 0x39d504c5, 0x3c37048b, 0x3e7c0458, 0x40a8042a, 0x42bd0401,
        0x44c20798, 0x488e071e, 0x4c1c06b6, 0x4f76065d, 0x52a50610, 0x55ac05cc, 0x5892058f, 0x5b590559,
        0x5e0c0a23, 0x631c0980, 0x67db08f6, 0x6c55087f, 0x70940818, 0x74a007bd, 0x787d076c, 0x7c330723,
        0x06970158, 0x07420142, 0x07e30130, 0x087b0120, 0x090b0112, 0x09940106, 0x0a1700fc, 0x0a9500f2,
    };

    static const uniform unsigned int almost_one = 0x3f7fffff;
    static const uniform unsigned int linear_scale = 0x454c5d00;
    static const uniform unsigned int float2int_magic = 0x4b000000;

    // Clamp to [0, 1-eps]; these two values map to 0 and 1, respectively.
    in = max(in, 0.0f);
    in = min(in, floatbits(almost_one));

    // Linear case
    float flinear = in * linear_scale;
    int ilinear = intbits(in + floatbits(float2int_magic)) & 0xff;

    // Non-linear case
    // Unpack bias, scale from table
    unsigned int tab = table[(intbits(in) >> 20) & 63];
    unsigned int bias = (tab >> 16) << 9;
    unsigned int scale = tab & 0xffff;

    // Grab next-highest mantissa bits and perform linear interpolation
    unsigned int t = (intbits(in) >> 12) & 0xff;
    int itable = (bias + scale*t) >> 16;

    return select(itable, ilinear, in < 1.0f / 256.0f);
}

// This version uses a larger table than the code above (104 entries at 4 bytes each,
// or 6.5 cache lines at 64b/line) but is conceptually simpler and needs less instructions.
//
// The basic ideas are still the same, only this time, we squeeze everything into the
// table, even the linear part of the range; since we are approximating the function as
// piecewise linear anyway, this is fairly easy.
//
// In the exact version of the conversion, any value that produces an output float less
// than 0.5 will be rounded to an integer of zero. Inverting the linear part of the transform,
// we get:
//
//   log2(0.5 / (255 * 12.92)) =~ -12.686
//
// which in turn means that any value smaller than about 2^(-12.687) will return 0.
// What this means is that we can adapt the clamping code to just clamp to
// [2^(-13), 1-eps] and we're covered. This means our table needs to cover a range of
// 13 different exponents from -13 to -1.
//
// The table lookup, storage and interpolation works exactly the same way as in the code
// above.
//
// Max error for the whole function (integer-rounded result minus "exact" value, as computed in
// floats using the official formula): 0.544403 at 0x3e9f8000
int float_to_srgb8_var2(float in)
{
    static const uniform unsigned int table[104] = {
        0x0073000d, 0x007a000d, 0x0080000d, 0x0087000d, 0x008d000d, 0x0094000d, 0x009a000d, 0x00a1000d,
        0x00a7001a, 0x00b4001a, 0x00c1001a, 0x00ce001a, 0x00da001a, 0x00e7001a, 0x00f4001a, 0x0101001a,
        0x010e0033, 0x01280033, 0x01410033, 0x015b0033, 0x01750033, 0x018f0033, 0x01a80033, 0x01c20033,
        0x01dc0067, 0x020f0067, 0x02430067, 0x02760067, 0x02aa0067, 0x02dd0067, 0x03110067, 0x03440067,
        0x037800ce, 0x03df00ce, 0x044600ce, 0x04ad00ce, 0x051400ce, 0x057b00c5, 0x05dd00bc, 0x063b00b5,
        0x06970158, 0x07420142, 0x07e30130, 0x087b0120, 0x090b0112, 0x09940106, 0x0a1700fc, 0x0a9500f2,
        0x0b0f01cb, 0x0bf401ae, 0x0ccb0195, 0x0d950180, 0x0e56016e, 0x0f0d015e, 0x0fbc0150, 0x10630143,
        0x11070264, 0x1238023e, 0x1357021d, 0x14660201, 0x156601e9, 0x165a01d3, 0x174401c0, 0x182401af,
        0x18fe0331, 0x1a9602fe, 0x1c1502d2, 0x1d7e02ad, 0x1ed4028d, 0x201a0270, 0x21520256, 0x227d0240,
        0x239f0443, 0x25c003fe, 0x27bf03c4, 0x29a10392, 0x2b6a0367, 0x2d1d0341, 0x2ebe031f, 0x304d0300,
        0x31d105b0, 0x34a80555, 0x37520507, 0x39d504c5, 0x3c37048b, 0x3e7c0458, 0x40a8042a, 0x42bd0401,
        0x44c20798, 0x488e071e, 0x4c1c06b6, 0x4f76065d, 0x52a50610, 0x55ac05cc, 0x5892058f, 0x5b590559,
        0x5e0c0a23, 0x631c0980, 0x67db08f6, 0x6c55087f, 0x70940818, 0x74a007bd, 0x787d076c, 0x7c330723,
    };

    static const uniform unsigned int almost_one = 0x3f7fffff;

    // Clamp to [2^(-13), 1-eps]; these two values map to 0 and 1, respectively.
    in = max(in, 0.0f);
    in = min(in, floatbits(almost_one));

    // Do the table lookup and unpack bias, scale
    unsigned int tab = table[(intbits(in) - 0x39000000u) >> 20];
    unsigned int bias = (tab >> 16) << 9;
    unsigned int scale = tab & 0xffff;

    // Grab next-highest mantissa bits and perform linear interpolation
    unsigned int t = (intbits(in) >> 12) & 0xff;
    return (bias + scale*t) >> 16;
}
	// float->sRGB8 conversions - two variants.
	// by Fabian "ryg" Giesen
	//
	// I hereby place this code in the public domain.
	//
	// Both variants come with absolute error bounds and a reversibility and monotonicity
	// guarantee. They should pass D3D10 conformance testing.
	//
	// This is an ISPC port of https://gist.github.com/2203834 - see there for a test
	// driver and code that computes the tables.

	static inline int select(int a, int b, int mask)
	{
	return mask ? a : b;
	}

	// This is the version that tries to use a small table (4 cache lines at 64 bytes/line)
	// at the expense of a few extra instructions. Use "var2" below for a version with
	// less instructions that uses a somewhat larger table.
	//
	// Float is semi-logarithmic.
	// Linear x->sRGB for x >= 0.0031308 is (mostly) a power function which we
	// approximate with a bunch of linear segments based on exponent and 3 highest
	// bits of mantissa (2 was too inaccurate).
	//
	// Which exponents do we care about?
	// Exponent >= 0: value was >=1, so we return 255 (in fact, we min with 1.0f-eps, so this never happens anyway).
	// Exponent < -9: x < 1/512 which is well into the linear part of the sRGB mapping function.
	// So the interesting exponent range is [-9,-1].
	//
	// To get a pow2-sized range, we cheat a bit and only store anchors for linear segments in
	// the exponent range [-8,-1], using linear sRGB part of the formula until 1/256.
	// This means that we treat a small part of the nonlinear range (namely, the interval
	// [0.0031308,0.00390625]) as linear. Our linear scale value needs to be adjusted for this.
	// This is done simply by starting from the "correct" scale value (255*12.92, 0x454de99a)
	// and doing a binary search for the value that gives the best results (=lowest max error
	// in this case) across the range we care about.
	//
	// The table itself has a bias in the top 16 bits and a scale factor for the linear function
	// (based on the next 8 mantissa bits after the 3 we already used). Both are scaled to make
	// good use of the available bits. The format was chosen this way so the linear function
	// can be evaluated with a single PMADDWD after the mantissa bits were extracted - okay, we do
	// need to insert one more set bit in the high half for the bias part to work.
	// These coefficients were determined simply by doing a least-squares fit of a linear function
	// f(x) = a+b*x inside each "bucket" (see table-making functions below).
	//
	// Max error for whole function (integer-rounded result minus "exact" value, as computed in
	// floats using the official formula): 0.573277 at 0x3b7a88c6
	int float_to_srgb8_var1(float in)
	{
	static const uniform unsigned int table[64] = {
	0x0b0f01cb, 0x0bf401ae, 0x0ccb0195, 0x0d950180, 0x0e56016e, 0x0f0d015e, 0x0fbc0150, 0x10630143,
	0x11070264, 0x1238023e, 0x1357021d, 0x14660201, 0x156601e9, 0x165a01d3, 0x174401c0, 0x182401af,
	0x18fe0331, 0x1a9602fe, 0x1c1502d2, 0x1d7e02ad, 0x1ed4028d, 0x201a0270, 0x21520256, 0x227d0240,
	0x239f0443, 0x25c003fe, 0x27bf03c4, 0x29a10392, 0x2b6a0367, 0x2d1d0341, 0x2ebe031f, 0x304d0300,
	0x31d105b0, 0x34a80555, 0x37520507, 0x39d504c5, 0x3c37048b, 0x3e7c0458, 0x40a8042a, 0x42bd0401,
	0x44c20798, 0x488e071e, 0x4c1c06b6, 0x4f76065d, 0x52a50610, 0x55ac05cc, 0x5892058f, 0x5b590559,
	0x5e0c0a23, 0x631c0980, 0x67db08f6, 0x6c55087f, 0x70940818, 0x74a007bd, 0x787d076c, 0x7c330723,
	0x06970158, 0x07420142, 0x07e30130, 0x087b0120, 0x090b0112, 0x09940106, 0x0a1700fc, 0x0a9500f2,
	};

	static const uniform unsigned int almost_one = 0x3f7fffff;
	static const uniform unsigned int linear_scale = 0x454c5d00;
	static const uniform unsigned int float2int_magic = 0x4b000000;

	// Clamp to [0, 1-eps]; these two values map to 0 and 1, respectively.
	in = max(in, 0.0f);
	in = min(in, floatbits(almost_one));

	// Linear case
	float flinear = in * linear_scale;
	int ilinear = intbits(in + floatbits(float2int_magic)) & 0xff;

	// Non-linear case
	// Unpack bias, scale from table
	unsigned int tab = table[(intbits(in) >> 20) & 63];
	unsigned int bias = (tab >> 16) << 9;
	unsigned int scale = tab & 0xffff;

	// Grab next-highest mantissa bits and perform linear interpolation
	unsigned int t = (intbits(in) >> 12) & 0xff;
	int itable = (bias + scale*t) >> 16;

	return select(itable, ilinear, in < 1.0f / 256.0f);
	}

	// This version uses a larger table than the code above (104 entries at 4 bytes each,
	// or 6.5 cache lines at 64b/line) but is conceptually simpler and needs less instructions.
	//
	// The basic ideas are still the same, only this time, we squeeze everything into the
	// table, even the linear part of the range; since we are approximating the function as
	// piecewise linear anyway, this is fairly easy.
	//
	// In the exact version of the conversion, any value that produces an output float less
	// than 0.5 will be rounded to an integer of zero. Inverting the linear part of the transform,
	// we get:
	//
	// log2(0.5 / (255 * 12.92)) =~ -12.686
	//
	// which in turn means that any value smaller than about 2^(-12.687) will return 0.
	// What this means is that we can adapt the clamping code to just clamp to
	// [2^(-13), 1-eps] and we're covered. This means our table needs to cover a range of
	// 13 different exponents from -13 to -1.
	//
	// The table lookup, storage and interpolation works exactly the same way as in the code
	// above.
	//
	// Max error for the whole function (integer-rounded result minus "exact" value, as computed in
	// floats using the official formula): 0.544403 at 0x3e9f8000
	int float_to_srgb8_var2(float in)
	{
	static const uniform unsigned int table[104] = {
	0x0073000d, 0x007a000d, 0x0080000d, 0x0087000d, 0x008d000d, 0x0094000d, 0x009a000d, 0x00a1000d,
	0x00a7001a, 0x00b4001a, 0x00c1001a, 0x00ce001a, 0x00da001a, 0x00e7001a, 0x00f4001a, 0x0101001a,
	0x010e0033, 0x01280033, 0x01410033, 0x015b0033, 0x01750033, 0x018f0033, 0x01a80033, 0x01c20033,
	0x01dc0067, 0x020f0067, 0x02430067, 0x02760067, 0x02aa0067, 0x02dd0067, 0x03110067, 0x03440067,
	0x037800ce, 0x03df00ce, 0x044600ce, 0x04ad00ce, 0x051400ce, 0x057b00c5, 0x05dd00bc, 0x063b00b5,
	0x06970158, 0x07420142, 0x07e30130, 0x087b0120, 0x090b0112, 0x09940106, 0x0a1700fc, 0x0a9500f2,
	0x0b0f01cb, 0x0bf401ae, 0x0ccb0195, 0x0d950180, 0x0e56016e, 0x0f0d015e, 0x0fbc0150, 0x10630143,
	0x11070264, 0x1238023e, 0x1357021d, 0x14660201, 0x156601e9, 0x165a01d3, 0x174401c0, 0x182401af,
	0x18fe0331, 0x1a9602fe, 0x1c1502d2, 0x1d7e02ad, 0x1ed4028d, 0x201a0270, 0x21520256, 0x227d0240,
	0x239f0443, 0x25c003fe, 0x27bf03c4, 0x29a10392, 0x2b6a0367, 0x2d1d0341, 0x2ebe031f, 0x304d0300,
	0x31d105b0, 0x34a80555, 0x37520507, 0x39d504c5, 0x3c37048b, 0x3e7c0458, 0x40a8042a, 0x42bd0401,
	0x44c20798, 0x488e071e, 0x4c1c06b6, 0x4f76065d, 0x52a50610, 0x55ac05cc, 0x5892058f, 0x5b590559,
	0x5e0c0a23, 0x631c0980, 0x67db08f6, 0x6c55087f, 0x70940818, 0x74a007bd, 0x787d076c, 0x7c330723,
	};

	static const uniform unsigned int almost_one = 0x3f7fffff;

	// Clamp to [2^(-13), 1-eps]; these two values map to 0 and 1, respectively.
	in = max(in, 0.0f);
	in = min(in, floatbits(almost_one));

	// Do the table lookup and unpack bias, scale
	unsigned int tab = table[(intbits(in) - 0x39000000u) >> 20];
	unsigned int bias = (tab >> 16) << 9;
	unsigned int scale = tab & 0xffff;

	// Grab next-highest mantissa bits and perform linear interpolation
	unsigned int t = (intbits(in) >> 12) & 0xff;
	return (bias + scale*t) >> 16;
	}