p0nce/colorcorrection.d

## colorcorrection.d
/// Applies a per-channel look-up table to every pixel of `image`, in place.
///
/// Each pixel is read as 4 consecutive bytes (R, G, B, A). Red, green and blue
/// are replaced by their table entries; the alpha byte is written back unchanged
/// (inline-asm paths) or not written at all (portable path).
///
/// Params:
///     image    = Image whose scanlines are modified in place.
///     rgbTable = Three consecutive 256-entry tables: red at offset 0, green at
///                offset 256, blue at offset 512. The inline-asm paths load
///                4 bytes per look-up, so the memory must stay readable up to
///                3 bytes past the last blue entry (at least 771 bytes in total).
void applyColorCorrection(ImageRef!RGBA image, const(ubyte*) rgbTable) pure nothrow @nogc
{
    int w = image.w;
    int h = image.h;

    // The asm loops below are bottom-tested (`dec ECX; jnz`): entering them with
    // w <= 0 would wrap the counter and run far past the end of the scanline.
    if (w <= 0)
        return;

    for (int j = 0; j < h; ++j)
    {
        ubyte* scan = cast(ubyte*)(image.scanline(j).ptr);

        // We use the "shift-technique" from "Intel 64 and IA-32 Architectures
        // Optimization Reference Manual", Example 5-44
        // It reads 4 indices at once, because this loop is specially nasty

        version(D_InlineAsm_X86_64)
        {
            // Register roles:
            //   ECX  = pixels left in this scanline (always > 0 on entry)
            //   RSI  = pointer to the next pixel; advanced right after the load,
            //          hence the store to [RSI-4]
            //   RDI  = base of the look-up table
            //   XMM1 = all zeroes, used to zero-extend the indices
            asm pure nothrow @nogc
            {
                mov ECX, w;
                mov RSI, scan;
                mov RDI, rgbTable;

                pxor XMM1, XMM1;

                loop:
                movd XMM0, [RSI]; // read 4 indices
                add RSI, 4;
                punpcklbw XMM0, XMM1; // expand indices to 16-bit
                punpcklwd XMM0, XMM1; // expand indices to 32-bit
                movd EAX, XMM0; // get first index (red); writing EAX also clears the top of RAX
                movd XMM2, [RDI + RAX]; // trick: we get 4 excess bytes each time

                psrldq  XMM0, 4;
                movd EAX, XMM0; // get 2nd index (green)
                movd XMM3, [RDI + RAX + 256];

                psrldq  XMM0, 4;
                movd EAX, XMM0; // get 3rd index (blue)
                movd XMM4, [RDI + RAX + 512]; // here we might overshoot, so the table need 3 excess bytes

                psrldq  XMM0, 4; // alpha can stay the same, no look-up

                // We need to assemble: RGBAxxxxxxxxxxxxxxxx from lowest bytes of XMM2/XMM3/XMM4/XMM0
                punpcklbw XMM2, XMM3; // low bytes: R' G' ...
                punpcklbw XMM4, XMM0; // low bytes: B' A ...
                punpcklwd XMM2, XMM4; // low dword: R' G' B' A
                movd [RSI-4], XMM2;
                dec ECX;
                jnz loop;
            }
        }
        else version(D_InlineAsm_X86)
        {
            // Same algorithm as the 64-bit path, with 32-bit pointer registers.
            asm pure nothrow @nogc
            {
                mov ECX, w;
                mov ESI, scan;
                mov EDI, rgbTable;

                pxor XMM1, XMM1;

                loop:
                movd XMM0, [ESI]; // read 4 indices
                add ESI, 4;
                punpcklbw XMM0, XMM1; // expand indices to 16-bit
                punpcklwd XMM0, XMM1; // expand indices to 32-bit
                movd EAX, XMM0; // get first index (red)
                movd XMM2, [EDI + EAX]; // trick: we get 4 excess bytes each time

                psrldq  XMM0, 4;
                movd EAX, XMM0; // get 2nd index (green)
                movd XMM3, [EDI + EAX + 256];

                psrldq  XMM0, 4;
                movd EAX, XMM0; // get 3rd index (blue)
                movd XMM4, [EDI + EAX + 512]; // here we might overshoot, so the table need 3 excess bytes

                psrldq  XMM0, 4; // alpha can stay the same, no look-up

                // We need to assemble: RGBAxxxxxxxxxxxxxxxx from lowest bytes of XMM2/XMM3/XMM4/XMM0
                punpcklbw XMM2, XMM3; // low bytes: R' G' ...
                punpcklbw XMM4, XMM0; // low bytes: B' A ...
                punpcklwd XMM2, XMM4; // low dword: R' G' B' A
                movd [ESI-4], XMM2;
                dec ECX;
                jnz loop;
            }
        }
        else
        {
            // Portable fallback for targets without x86 inline assembly.
            // Without this branch the function would compile to an empty loop
            // and silently leave the image uncorrected.
            for (int i = 0; i < w; ++i)
            {
                ubyte r = scan[4*i];
                ubyte g = scan[4*i+1];
                ubyte b = scan[4*i+2];
                scan[4*i]   = rgbTable[r];
                scan[4*i+1] = rgbTable[g+256];
                scan[4*i+2] = rgbTable[b+512];
            }
        }
    }
}
	/// Applies a per-channel look-up table to every pixel of `image`, in place.
	///
	/// Each pixel is read as 4 consecutive bytes (R, G, B, A). Red, green and blue
	/// are replaced by their table entries; the alpha byte is written back unchanged
	/// (inline-asm paths) or not written at all (portable path).
	///
	/// Params:
	///     image    = Image whose scanlines are modified in place.
	///     rgbTable = Three consecutive 256-entry tables: red at offset 0, green at
	///                offset 256, blue at offset 512. The inline-asm paths load
	///                4 bytes per look-up, so the memory must stay readable up to
	///                3 bytes past the last blue entry (at least 771 bytes in total).
	void applyColorCorrection(ImageRef!RGBA image, const(ubyte*) rgbTable) pure nothrow @nogc
	{
	    int w = image.w;
	    int h = image.h;

	    // The asm loops below are bottom-tested (`dec ECX; jnz`): entering them with
	    // w <= 0 would wrap the counter and run far past the end of the scanline.
	    if (w <= 0)
	        return;

	    for (int j = 0; j < h; ++j)
	    {
	        ubyte* scan = cast(ubyte*)(image.scanline(j).ptr);

	        // We use the "shift-technique" from "Intel 64 and IA-32 Architectures
	        // Optimization Reference Manual", Example 5-44
	        // It reads 4 indices at once, because this loop is specially nasty

	        version(D_InlineAsm_X86_64)
	        {
	            // Register roles:
	            //   ECX  = pixels left in this scanline (always > 0 on entry)
	            //   RSI  = pointer to the next pixel; advanced right after the load,
	            //          hence the store to [RSI-4]
	            //   RDI  = base of the look-up table
	            //   XMM1 = all zeroes, used to zero-extend the indices
	            asm pure nothrow @nogc
	            {
	                mov ECX, w;
	                mov RSI, scan;
	                mov RDI, rgbTable;

	                pxor XMM1, XMM1;

	                loop:
	                movd XMM0, [RSI]; // read 4 indices
	                add RSI, 4;
	                punpcklbw XMM0, XMM1; // expand indices to 16-bit
	                punpcklwd XMM0, XMM1; // expand indices to 32-bit
	                movd EAX, XMM0; // get first index (red); writing EAX also clears the top of RAX
	                movd XMM2, [RDI + RAX]; // trick: we get 4 excess bytes each time

	                psrldq XMM0, 4;
	                movd EAX, XMM0; // get 2nd index (green)
	                movd XMM3, [RDI + RAX + 256];

	                psrldq XMM0, 4;
	                movd EAX, XMM0; // get 3rd index (blue)
	                movd XMM4, [RDI + RAX + 512]; // here we might overshoot, so the table need 3 excess bytes

	                psrldq XMM0, 4; // alpha can stay the same, no look-up

	                // We need to assemble: RGBAxxxxxxxxxxxxxxxx from lowest bytes of XMM2/XMM3/XMM4/XMM0
	                punpcklbw XMM2, XMM3; // low bytes: R' G' ...
	                punpcklbw XMM4, XMM0; // low bytes: B' A ...
	                punpcklwd XMM2, XMM4; // low dword: R' G' B' A
	                movd [RSI-4], XMM2;
	                dec ECX;
	                jnz loop;
	            }
	        }
	        else version(D_InlineAsm_X86)
	        {
	            // Same algorithm as the 64-bit path, with 32-bit pointer registers.
	            asm pure nothrow @nogc
	            {
	                mov ECX, w;
	                mov ESI, scan;
	                mov EDI, rgbTable;

	                pxor XMM1, XMM1;

	                loop:
	                movd XMM0, [ESI]; // read 4 indices
	                add ESI, 4;
	                punpcklbw XMM0, XMM1; // expand indices to 16-bit
	                punpcklwd XMM0, XMM1; // expand indices to 32-bit
	                movd EAX, XMM0; // get first index (red)
	                movd XMM2, [EDI + EAX]; // trick: we get 4 excess bytes each time

	                psrldq XMM0, 4;
	                movd EAX, XMM0; // get 2nd index (green)
	                movd XMM3, [EDI + EAX + 256];

	                psrldq XMM0, 4;
	                movd EAX, XMM0; // get 3rd index (blue)
	                movd XMM4, [EDI + EAX + 512]; // here we might overshoot, so the table need 3 excess bytes

	                psrldq XMM0, 4; // alpha can stay the same, no look-up

	                // We need to assemble: RGBAxxxxxxxxxxxxxxxx from lowest bytes of XMM2/XMM3/XMM4/XMM0
	                punpcklbw XMM2, XMM3; // low bytes: R' G' ...
	                punpcklbw XMM4, XMM0; // low bytes: B' A ...
	                punpcklwd XMM2, XMM4; // low dword: R' G' B' A
	                movd [ESI-4], XMM2;
	                dec ECX;
	                jnz loop;
	            }
	        }
	        else
	        {
	            // Portable fallback for targets without x86 inline assembly.
	            // Without this branch the function would compile to an empty loop
	            // and silently leave the image uncorrected.
	            for (int i = 0; i < w; ++i)
	            {
	                ubyte r = scan[4*i];
	                ubyte g = scan[4*i+1];
	                ubyte b = scan[4*i+2];
	                scan[4*i] = rgbTable[r];
	                scan[4*i+1] = rgbTable[g+256];
	                scan[4*i+2] = rgbTable[b+512];
	            }
	        }
	    }
	}