Skip to content

Instantly share code, notes, and snippets.

@mntone
Created April 20, 2017 01:11
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mntone/c3a1d921f137fef8dd292121ec0c3c59 to your computer and use it in GitHub Desktop.
Save mntone/c3a1d921f137fef8dd292121ec0c3c59 to your computer and use it in GitHub Desktop.
; Function compile flags: /Ogtpy
; File
; COMDAT decompress_r10l
;
; Compiler-generated x64 listing (MASM syntax) for decompress_r10l.
; The "; NNN :" comments are the C source lines the instructions came from.
;
; Arguments as they are consumed below (consistent with the Microsoft x64
; convention: four register arguments, the rest on the stack):
;   rcx = input, edx = in_linesize, r8d = start_y, r9d = end_y,
;   output$[rsp] = output, out_linesize$[rsp] = out_linesize
;
; Register roles once set up:
;   r14 = input base          rbp = output base
;   edi = in_linesize         esi = out_linesize
;   r11d = width_d4           r10 = inner-loop counter
;   r9d = y * in_linesize     r8d = y * out_linesize   (32-bit running offsets)
;   rbx = rows remaining      rdx / rax = current input / output pointer
;   xmm3 = base, xmm4 = mask_r, xmm5 = mask_g, xmm6 = mask_b
;
; Non-volatile registers rsi, rdi, rbx, rbp are parked in the four argument
; home slots above the return address; r14 is pushed; xmm6 is saved in the
; 16 bytes allocated by "sub rsp, 16".
_TEXT SEGMENT
; Offsets below are relative to rsp after the prologue (push r14 + sub rsp, 16
; = 24 bytes), so input$ = 32 is the first slot above the return address.
input$ = 32
in_linesize$ = 40
start_y$ = 48
end_y$ = 56
output$ = 64
out_linesize$ = 72
decompress_r10l PROC ; COMDAT
; 549 : {
$LN20:
; Save rsi/rdi before rsp moves: these addresses become [rsp+48] / [rsp+56]
; (the start_y$ / end_y$ slots) after the 24-byte frame adjustment, which is
; where the epilogue reloads them from.
mov QWORD PTR [rsp+24], rsi
mov QWORD PTR [rsp+32], rdi
push r14
sub rsp, 16
; 94 : return a < b ? a : b;
; esi = out_linesize (stack argument)
mov esi, DWORD PTR out_linesize$[rsp]
; 549 : {
; eax = end_y (turned into the row count further down)
mov eax, r9d
; 551 : uint32_t y;
; 552 :
; 553 : const __m128i base = _mm_set1_epi32(0xC0000000);
movdqa xmm3, XMMWORD PTR __xmm@c0000000c0000000c0000000c0000000
; 94 : return a < b ? a : b;
; Flags from this compare are consumed by the cmovb below; the mov/movdqa
; instructions in between do not modify flags.
cmp edx, esi
; 554 : const __m128i mask_r = _mm_set1_epi32(0xFFC00000);
movdqa xmm4, XMMWORD PTR __xmm@ffc00000ffc00000ffc00000ffc00000
; 94 : return a < b ? a : b;
; r11d = out_linesize, replaced by in_linesize below if that is smaller
mov r11d, esi
; 555 : const __m128i mask_g = _mm_set1_epi32(0x003FF000);
movdqa xmm5, XMMWORD PTR __xmm@003ff000003ff000003ff000003ff000
; 94 : return a < b ? a : b;
; r11d = min(in_linesize, out_linesize), unsigned
cmovb r11d, edx
; 550 : uint32_t width_d4 = min_uint32(in_linesize, out_linesize) / 16;
shr r11d, 4
; r10d = start_y
mov r10d, r8d
; Preserve xmm6 (reloaded in the epilogue) before it is loaded with mask_b.
movaps XMMWORD PTR [rsp], xmm6
; edi = in_linesize (per-row input stride)
mov edi, edx
; 556 : const __m128i mask_b = _mm_set1_epi32(0x00000FFC);
movdqa xmm6, XMMWORD PTR __xmm@00000ffc00000ffc00000ffc00000ffc
; r14 = input base pointer
mov r14, rcx
; 557 :
; 558 : for (y = start_y; y < end_y; y++) {
; Unsigned test: skip everything when start_y >= end_y.
cmp r8d, r9d
jae $LN3@decompress
; Save rbx in the input$ home slot (only on the path that uses it).
mov QWORD PTR [rsp+32], rbx
; 94 : return a < b ? a : b;
; r9d = start_y * in_linesize, r8d = start_y * out_linesize.
; NOTE: both products are 32-bit and wrap on overflow.
mov r9d, edx
imul r9d, r10d
imul r8d, esi
; eax = end_y - start_y = number of rows to process
sub eax, r10d
; Save rbp in the in_linesize$ home slot, then rbp = output base pointer.
mov QWORD PTR [rsp+40], rbp
mov rbp, QWORD PTR output$[rsp]
; rbx = rows remaining (32-bit write zero-extends into rbx)
mov ebx, eax
; Padding emitted by the compiler ahead of the outer-loop label.
npad 8
$LL4@decompress:
; 559 : const __m128i *input0;
; 560 : register __m128i *output0;
; 561 : uint32_t x;
; 562 :
; 563 : input0 = (const __m128i*)(input + y * in_linesize);
; rdx = input + zero-extended 32-bit row offset
mov edx, r9d
; 564 : output0 = (__m128i*)(output + y * out_linesize);
; rax = output + zero-extended 32-bit row offset
mov eax, r8d
add rdx, r14
add rax, rbp
; 565 :
; 566 : for (x = 0; x < width_d4; x++) {
; Skip the inner loop entirely when width_d4 == 0.
test r11d, r11d
je SHORT $LN2@decompress
; 559 : const __m128i *input0;
; 560 : register __m128i *output0;
; 561 : uint32_t x;
; 562 :
; 563 : input0 = (const __m128i*)(input + y * in_linesize);
; r10 = width_d4, counted down to zero by the inner loop
mov r10d, r11d
; Padding emitted by the compiler ahead of the inner-loop label.
npad 12
$LL7@decompress:
; 567 : const __m128i tmp = _mm_load_si128(input0++);
; rcx = current input pointer, rdx advances by 16 bytes; movdqa is an
; aligned 16-byte load, so the address must be 16-byte aligned.
mov rcx, rdx
add rdx, 16
movdqa xmm2, XMMWORD PTR [rcx]
; 568 : const __m128i r = _MM_AND_SRLI_EPI32(tmp, mask_r, 22);
; 569 : const __m128i g = _MM_AND_SRLI_EPI32(tmp, mask_g, 2);
; 570 : const __m128i b = _MM_AND_SLLI_EPI32(tmp, mask_b, 18);
; 571 :
; 572 : _mm_stream_si128(output0++,
; rcx = current output pointer (rax is advanced further down).
; xmm1 accumulates the result: (tmp & mask_r) >> 22, OR base,
; OR (tmp & mask_g) >> 2, OR (tmp & mask_b) << 18.
mov rcx, rax
movdqa xmm1, xmm2
movdqa xmm0, xmm2
pand xmm1, xmm4
pand xmm0, xmm5
psrld xmm1, 22
pand xmm2, xmm6
por xmm1, xmm3
psrld xmm0, 2
por xmm1, xmm0
pslld xmm2, 18
add rax, 16
por xmm1, xmm2
; Non-temporal 16-byte store of the repacked pixels.
movntdq XMMWORD PTR [rcx], xmm1
sub r10, 1
jne SHORT $LL7@decompress
$LN2@decompress:
; 557 :
; 558 : for (y = start_y; y < end_y; y++) {
; Advance both 32-bit row offsets by one line and count the row down.
add r9d, edi
add r8d, esi
sub rbx, 1
jne SHORT $LL4@decompress
; Restore rbp / rbx from the home slots they were saved in.
mov rbp, QWORD PTR [rsp+40]
mov rbx, QWORD PTR [rsp+32]
$LN3@decompress:
; 573 : _MM_OR_SI128_OP4(base, r, g, b));
; 574 : }
; 575 : }
; 576 : }
; Epilogue: restore rsi, rdi, xmm6, release the 16-byte save area, pop r14.
mov rsi, QWORD PTR [rsp+48]
mov rdi, QWORD PTR [rsp+56]
movaps xmm6, XMMWORD PTR [rsp]
add rsp, 16
pop r14
ret 0
decompress_r10l ENDP
_TEXT ENDS
#include <stddef.h>
#include <stdint.h>
#include <xmmintrin.h>
#include <emmintrin.h>
/* Bitwise OR of three / four 128-bit integer vectors. */
#define _MM_OR_SI128_OP3(x, y, z) \
    _mm_or_si128(_mm_or_si128((x), (y)), (z))
#define _MM_OR_SI128_OP4(x, y, z, w) \
    _mm_or_si128(_mm_or_si128(_mm_or_si128((x), (y)), (z)), (w))

/* Mask each 32-bit lane of x with m, then shift the lane left / right
 * (logical) by i bits. */
#define _MM_AND_SLLI_EPI32(x, m, i) \
    _mm_slli_epi32(_mm_and_si128((x), (m)), (i))
#define _MM_AND_SRLI_EPI32(x, m, i) \
    _mm_srli_epi32(_mm_and_si128((x), (m)), (i))
/*
 * Repack rows [start_y, end_y) of 32-bit pixels from input to output,
 * four pixels (16 bytes) per step.
 *
 * For every 32-bit lane the bits are rearranged as follows:
 *   input bits 31..22  -> output bits  9..0   ("r")
 *   input bits 21..12  -> output bits 19..10  ("g")
 *   input bits 11..2   -> output bits 29..20  ("b")
 *   output bits 31..30 are always set (0xC0000000); input bits 1..0 are dropped.
 *
 * input        source buffer; row y starts at input + y * in_linesize
 * in_linesize  bytes per input row
 * start_y      first row to process
 * end_y        one past the last row to process (nothing is done if
 *              start_y >= end_y)
 * output       destination buffer; row y starts at output + y * out_linesize
 * out_linesize bytes per output row
 *
 * Only whole 16-byte groups within min(in_linesize, out_linesize) are
 * converted; any remainder of a row is left untouched.
 *
 * Every processed row address must be 16-byte aligned in both buffers:
 * _mm_load_si128 and _mm_stream_si128 require aligned addresses.
 */
void decompress_r10l(
const uint8_t *input, const uint32_t in_linesize,
uint32_t start_y, uint32_t end_y,
uint8_t *output, uint32_t out_linesize)
{
    const uint32_t width_d4 = min_uint32(in_linesize, out_linesize) / 16;
    uint32_t y;

    /* _mm_set1_epi32 takes an int; make the unsigned -> int conversion
     * explicit (same bit patterns as before). */
    const __m128i base   = _mm_set1_epi32((int)0xC0000000u);
    const __m128i mask_r = _mm_set1_epi32((int)0xFFC00000u);
    const __m128i mask_g = _mm_set1_epi32((int)0x003FF000u);
    const __m128i mask_b = _mm_set1_epi32((int)0x00000FFCu);

    for (y = start_y; y < end_y; y++) {
        /* Widen before multiplying: y * linesize in 32 bits would wrap for
         * row offsets of 4 GiB or more and address the wrong memory. */
        const __m128i *input0 =
            (const __m128i*)(input + (size_t)y * in_linesize);
        __m128i *output0 =
            (__m128i*)(output + (size_t)y * out_linesize);
        uint32_t x;

        for (x = 0; x < width_d4; x++) {
            const __m128i tmp = _mm_load_si128(input0++);
            const __m128i r = _MM_AND_SRLI_EPI32(tmp, mask_r, 22);
            const __m128i g = _MM_AND_SRLI_EPI32(tmp, mask_g, 2);
            const __m128i b = _MM_AND_SLLI_EPI32(tmp, mask_b, 18);

            /* Non-temporal store: the output is written without being
             * read back here. */
            _mm_stream_si128(output0++,
                _MM_OR_SI128_OP4(base, r, g, b));
        }
    }

    /* Streaming stores are weakly ordered; fence so they are globally
     * visible before any later store the caller uses to publish the
     * buffer (e.g. to another thread). */
    _mm_sfence();
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment