Created
February 16, 2019 22:13
-
-
Save p0nce/0c75a0b93c95bd034a0485330f857024 to your computer and use it in GitHub Desktop.
Failed attempt to speed up color correction in Dplug
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/// Applies a per-channel look-up table to every pixel of `image`, in place.
///
/// Each pixel is read as 4 consecutive bytes. Bytes 0, 1 and 2 (commented as
/// red, green and blue below) are replaced by `rgbTable[r]`,
/// `rgbTable[g + 256]` and `rgbTable[b + 512]` respectively; byte 3 (alpha)
/// is left unchanged.
///
/// Params:
///     image    = image whose scanlines are rewritten in place.
///     rgbTable = three consecutive 256-entry tables (offsets 0, 256 and 512).
///                The inline-assembly paths fetch table entries with 4-byte
///                loads, so the last possible load (offset 512 + 255) touches
///                3 bytes past the 768th entry: the buffer must be readable
///                for at least 771 bytes.
///
/// An image with `w <= 0` is left untouched.
void applyColorCorrection(ImageRef!RGBA image, const(ubyte*) rgbTable) pure nothrow @nogc
{
    int w = image.w;
    int h = image.h;

    // The assembly loops below are do-while shaped (`dec ECX; jnz`): entering
    // them with a zero or negative width would make the counter wrap around
    // and run ~2^32 iterations past the end of the scanline.
    if (w <= 0)
        return;

    for (int j = 0; j < h; ++j)
    {
        ubyte* scan = cast(ubyte*)(image.scanline(j).ptr);

        /+
        // Disabled experiment using intel-intrinsics, kept for reference.
        // NOTE(review): it indexes `scan[i]` where the scalar code uses
        // `scan[4*i]` -- to be fixed before this is ever re-enabled.
        static if (false)//(LDC)
        {
            import inteli.emmintrin;
            __m128i zero = _mm_setzero_si128();
            for (int i = 0; i < w; ++i)
            {
                __m128i indices = _mm_loadu_si32(&scan[i]);
                indices = _mm_unpacklo_epi8(indices, zero);
                indices = _mm_unpacklo_epi16(indices, zero);
                __m128i red = _mm_loadu_si32( rgbTable + indices[0] );
                __m128i green = _mm_loadu_si32( rgbTable + indices[1] + 256);
                __m128i blue = _mm_loadu_si32( rgbTable + indices[2] + 512);
                __m128i alpha = _mm_srli_si128!12(indices);
                // We need to assemble: RGBAxxxxxxxxxxxxxxxx from lowest bytes of XMM2/XMM3/XMM4/XMM0
                __m128i res = _mm_unpacklo_epi16(_mm_unpacklo_epi8(red, green), _mm_unpacklo_epi8(blue, alpha));
                *cast(int*)(&scan[i]) = res[0];
            }
        }
        +/

        // We use the "shift-technique" from "Intel 64 and IA-32 Architectures
        // Optimization Reference Manual", Example 5-44
        // It reads the 4 indices of a pixel at once, because this loop is specially nasty
        version(D_InlineAsm_X86_64)
        {
            // Register roles:
            //   ECX  = pixels remaining in this scanline (> 0 on entry)
            //   RSI  = pointer to the next pixel to process
            //   RDI  = base of rgbTable
            //   XMM1 = all zeroes, used to zero-extend with punpck*
            asm pure nothrow @nogc
            {
                mov ECX, w;
                mov RSI, scan;
                mov RDI, rgbTable;
                pxor XMM1, XMM1;

            pixelLoop:
                movd XMM0, [RSI];               // read 4 indices (one pixel)
                add RSI, 4;
                punpcklbw XMM0, XMM1;           // expand indices to 16-bit
                punpcklwd XMM0, XMM1;           // expand indices to 32-bit

                movd EAX, XMM0;                 // get first index (red)
                movd XMM2, [RDI + RAX];         // trick: we get 4 excess bytes each time
                psrldq XMM0, 4;

                movd EAX, XMM0;                 // get 2nd index (green)
                movd XMM3, [RDI + RAX + 256];
                psrldq XMM0, 4;

                movd EAX, XMM0;                 // get 3rd index (blue)
                movd XMM4, [RDI + RAX + 512];   // here we might overshoot, so the table needs 3 excess bytes
                psrldq XMM0, 4;                 // alpha can stay the same, no look-up

                // We need to assemble: RGBAxxxxxxxxxxxxxxxx from lowest bytes of XMM2/XMM3/XMM4/XMM0
                punpcklbw XMM2, XMM3;           // low word = R, G
                punpcklbw XMM4, XMM0;           // low word = B, A
                punpcklwd XMM2, XMM4;           // low dword = R, G, B, A
                movd [RSI-4], XMM2;             // RSI was already advanced: write back the same pixel

                dec ECX;
                jnz pixelLoop;
            }
        }
        else version(D_InlineAsm_X86)
        {
            // Same algorithm and register roles as the 64-bit path, with
            // 32-bit pointers in ESI (pixels) and EDI (table).
            asm pure nothrow @nogc
            {
                mov ECX, w;
                mov ESI, scan;
                mov EDI, rgbTable;
                pxor XMM1, XMM1;

            pixelLoop:
                movd XMM0, [ESI];               // read 4 indices (one pixel)
                add ESI, 4;
                punpcklbw XMM0, XMM1;           // expand indices to 16-bit
                punpcklwd XMM0, XMM1;           // expand indices to 32-bit

                movd EAX, XMM0;                 // get first index (red)
                movd XMM2, [EDI + EAX];         // trick: we get 4 excess bytes each time
                psrldq XMM0, 4;

                movd EAX, XMM0;                 // get 2nd index (green)
                movd XMM3, [EDI + EAX + 256];
                psrldq XMM0, 4;

                movd EAX, XMM0;                 // get 3rd index (blue)
                movd XMM4, [EDI + EAX + 512];   // here we might overshoot, so the table needs 3 excess bytes
                psrldq XMM0, 4;                 // alpha can stay the same, no look-up

                // We need to assemble: RGBAxxxxxxxxxxxxxxxx from lowest bytes of XMM2/XMM3/XMM4/XMM0
                punpcklbw XMM2, XMM3;           // low word = R, G
                punpcklbw XMM4, XMM0;           // low word = B, A
                punpcklwd XMM2, XMM4;           // low dword = R, G, B, A
                movd [ESI-4], XMM2;             // ESI was already advanced: write back the same pixel

                dec ECX;
                jnz pixelLoop;
            }
        }
        else
        {
            // Portable reference implementation for targets without D inline
            // x86 assembly; without it the function would silently do nothing.
            for (int i = 0; i < w; ++i)
            {
                ubyte r = scan[4*i];
                ubyte g = scan[4*i+1];
                ubyte b = scan[4*i+2];
                scan[4*i]   = rgbTable[r];
                scan[4*i+1] = rgbTable[g+256];
                scan[4*i+2] = rgbTable[b+512];
            }
        }
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment