Skip to content

Instantly share code, notes, and snippets.

@p0nce
Created February 16, 2019 22:13
Show Gist options
  • Save p0nce/0c75a0b93c95bd034a0485330f857024 to your computer and use it in GitHub Desktop.
Save p0nce/0c75a0b93c95bd034a0485330f857024 to your computer and use it in GitHub Desktop.
Failure to speed up color correction in Dplug
/// Applies a per-channel look-up table to every pixel of `image`, in place.
///
/// Each pixel is 4 bytes in R, G, B, A order. Red, green and blue are replaced
/// by their table entries; alpha is left untouched.
///
/// Params:
///     image    = image whose scanlines are rewritten in place.
///     rgbTable = three consecutive 256-entry tables: red at offset 0, green at
///                offset 256, blue at offset 512. The x86 code paths load
///                4 bytes per look-up, so up to 3 bytes past offset 767 are
///                read (never used): the table needs 3 readable excess bytes.
void applyColorCorrection(ImageRef!RGBA image, const(ubyte*) rgbTable) pure nothrow @nogc
{
    int w = image.w;
    int h = image.h;

    // The assembly loops below test their counter at the bottom (dec/jnz).
    // Entering them with w == 0 would wrap ECX around and process about 2^32
    // pixels, so empty (or nonsensical negative) widths are rejected up front.
    if (w <= 0)
        return;

    for (int j = 0; j < h; ++j)
    {
        ubyte* scan = cast(ubyte*)(image.scanline(j).ptr);

        // We use the "shift-technique" from "Intel 64 and IA-32 Architectures
        // Optimization Reference Manual", Example 5-44.
        // It reads the 4 indices of a pixel (R, G, B, A) at once, because this
        // loop is specially nasty.
        version(D_InlineAsm_X86_64)
        {
            asm pure nothrow @nogc
            {
                mov ECX, w;                     // ECX = pixels left in this scanline (> 0 here)
                mov RSI, scan;                  // RSI = current pixel
                mov RDI, rgbTable;              // RDI = base of the three tables
                pxor XMM1, XMM1;                // XMM1 = 0, used to zero-extend

            nextPixel:
                movd XMM0, [RSI];               // read 4 indices
                add RSI, 4;
                punpcklbw XMM0, XMM1;           // expand indices to 16-bit
                punpcklwd XMM0, XMM1;           // expand indices to 32-bit
                movd EAX, XMM0;                 // get first index (red)
                movd XMM2, [RDI + RAX];         // trick: we get 4 excess bytes each time
                psrldq XMM0, 4;
                movd EAX, XMM0;                 // get 2nd index (green)
                movd XMM3, [RDI + RAX + 256];
                psrldq XMM0, 4;
                movd EAX, XMM0;                 // get 3rd index (blue)
                movd XMM4, [RDI + RAX + 512];   // here we might overshoot, so the table needs 3 excess bytes
                psrldq XMM0, 4;                 // alpha can stay the same, no look-up

                // We need to assemble: RGBAxxxxxxxxxxxx from the lowest bytes of XMM2/XMM3/XMM4/XMM0
                punpcklbw XMM2, XMM3;           // XMM2 = R G ...
                punpcklbw XMM4, XMM0;           // XMM4 = B A ...
                punpcklwd XMM2, XMM4;           // XMM2 = R G B A ...
                movd [RSI-4], XMM2;             // write the pixel back (RSI was already advanced)
                dec ECX;
                jnz nextPixel;
            }
        }
        else version(D_InlineAsm_X86)
        {
            asm pure nothrow @nogc
            {
                mov ECX, w;                     // ECX = pixels left in this scanline (> 0 here)
                mov ESI, scan;                  // ESI = current pixel
                mov EDI, rgbTable;              // EDI = base of the three tables
                pxor XMM1, XMM1;                // XMM1 = 0, used to zero-extend

            nextPixel:
                movd XMM0, [ESI];               // read 4 indices
                add ESI, 4;
                punpcklbw XMM0, XMM1;           // expand indices to 16-bit
                punpcklwd XMM0, XMM1;           // expand indices to 32-bit
                movd EAX, XMM0;                 // get first index (red)
                movd XMM2, [EDI + EAX];         // trick: we get 4 excess bytes each time
                psrldq XMM0, 4;
                movd EAX, XMM0;                 // get 2nd index (green)
                movd XMM3, [EDI + EAX + 256];
                psrldq XMM0, 4;
                movd EAX, XMM0;                 // get 3rd index (blue)
                movd XMM4, [EDI + EAX + 512];   // here we might overshoot, so the table needs 3 excess bytes
                psrldq XMM0, 4;                 // alpha can stay the same, no look-up

                // We need to assemble: RGBAxxxxxxxxxxxx from the lowest bytes of XMM2/XMM3/XMM4/XMM0
                punpcklbw XMM2, XMM3;           // XMM2 = R G ...
                punpcklbw XMM4, XMM0;           // XMM4 = B A ...
                punpcklwd XMM2, XMM4;           // XMM2 = R G B A ...
                movd [ESI-4], XMM2;             // write the pixel back (ESI was already advanced)
                dec ECX;
                jnz nextPixel;
            }
        }
        else
        {
            // Portable fallback for targets without DMD-style x86 inline
            // assembly: one byte-sized look-up per color channel.
            for (int i = 0; i < w; ++i)
            {
                ubyte r = scan[4*i];
                ubyte g = scan[4*i+1];
                ubyte b = scan[4*i+2];
                scan[4*i]   = rgbTable[r];
                scan[4*i+1] = rgbTable[g+256];
                scan[4*i+2] = rgbTable[b+512];
            }
        }
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment