mntone/decompress_r10l.asm

## decompress_r10l.asm
; Function compile flags: /Ogtpy
; File
;	COMDAT decompress_r10l
_TEXT	SEGMENT
; Argument offsets relative to rsp AFTER the prologue (push r14 + sub rsp, 16
; moves rsp 24 bytes below its value at entry). Only output$ (entry rsp+40)
; and out_linesize$ (entry rsp+48) are actually read from the stack below;
; the first four arguments are consumed from rcx, edx, r8d and r9d.
input$ = 32
in_linesize$ = 40
start_y$ = 48
end_y$ = 56
output$ = 64
out_linesize$ = 72
;-----------------------------------------------------------------------
; void decompress_r10l(const uint8_t *input, uint32_t in_linesize,
;                      uint32_t start_y, uint32_t end_y,
;                      uint8_t *output, uint32_t out_linesize)
;
; Compiler-generated listing (MASM syntax). Register and stack usage is
; consistent with the Microsoft x64 calling convention.
;
; In:    rcx = input, edx = in_linesize, r8d = start_y, r9d = end_y,
;        output$[rsp] = output, out_linesize$[rsp] = out_linesize
; Out:   nothing (no value is placed in rax for the caller)
;
; For every row y in [start_y, end_y) and every 16-byte group in the row
; (min(in_linesize, out_linesize) / 16 groups), each 32-bit lane w becomes
;     0xC0000000 | ((w & 0xFFC00000) >> 22)
;                | ((w & 0x003FF000) >>  2)
;                | ((w & 0x00000FFC) << 18)
; and is written with a non-temporal store.
;
; Register roles once the loops are running:
;     r14 = input base          rbp = output base
;     edi = in_linesize         esi = out_linesize
;     r9d = y * in_linesize     r8d = y * out_linesize   (both 32-bit)
;     rbx = rows remaining      r11d = 16-byte groups per row
;     r10 = groups remaining    rdx / rax = current source / destination
;     xmm3 = base, xmm4 = mask_r, xmm5 = mask_g, xmm6 = mask_b
;
; Saved and restored: rsi, rdi, r14, xmm6 always; rbx, rbp only when the
; row loop is entered.
; Modified without being restored: rax, rcx, rdx, r8, r9, r10, r11,
; xmm0-xmm5, flags.
; Memory access: movdqa faults on a source address that is not 16-byte
; aligned; the destination is written with movntdq.
;-----------------------------------------------------------------------
decompress_r10l PROC					; COMDAT

; 549  : {

$LN20:
; rsi and rdi are parked in stack slots above the return address instead of
; being pushed. After the 24-byte frame below is set up, these same slots are
; addressed as [rsp+48] and [rsp+56], which is where the epilogue reloads them.
	mov	QWORD PTR [rsp+24], rsi
	mov	QWORD PTR [rsp+32], rdi
	push	r14
	sub	rsp, 16

; 94   : 	return a < b ? a : b;

; esi = out_linesize (sixth argument, read from the stack)
	mov	esi, DWORD PTR out_linesize$[rsp]

; 549  : {

; eax = end_y; turned into the row count further down
	mov	eax, r9d

; 551  : 	uint32_t y;
; 552  :
; 553  : 	const __m128i base   = _mm_set1_epi32(0xC0000000);

	movdqa	xmm3, XMMWORD PTR __xmm@c0000000c0000000c0000000c0000000

; 94   : 	return a < b ? a : b;

; Unsigned compare in_linesize : out_linesize; the flags are consumed by the
; cmovb below (the moves in between do not alter flags).
	cmp	edx, esi

; 554  : 	const __m128i mask_r = _mm_set1_epi32(0xFFC00000);

	movdqa	xmm4, XMMWORD PTR __xmm@ffc00000ffc00000ffc00000ffc00000

; 94   : 	return a < b ? a : b;

	mov	r11d, esi

; 555  : 	const __m128i mask_g = _mm_set1_epi32(0x003FF000);

	movdqa	xmm5, XMMWORD PTR __xmm@003ff000003ff000003ff000003ff000

; 94   : 	return a < b ? a : b;

; r11d = min(in_linesize, out_linesize), unsigned
	cmovb	r11d, edx

; 550  : 	uint32_t width_d4 = min_uint32(in_linesize, out_linesize) / 16;

; r11d = width_d4 (division by 16 done as a logical right shift)
	shr	r11d, 4
; r10d = start_y
	mov	r10d, r8d
; Preserve the caller's xmm6 in the 16 bytes reserved by "sub rsp, 16" before
; it is overwritten with mask_b. rsp is 16-byte aligned here provided the
; caller kept rsp 16-byte aligned at the call.
	movaps	XMMWORD PTR [rsp], xmm6
; edi = in_linesize (kept as the per-row source stride)
	mov	edi, edx

; 556  : 	const __m128i mask_b = _mm_set1_epi32(0x00000FFC);

	movdqa	xmm6, XMMWORD PTR __xmm@00000ffc00000ffc00000ffc00000ffc
; r14 = input base pointer
	mov	r14, rcx

; 557  :
; 558  : 	for (y = start_y; y < end_y; y++) {

; Unsigned test: nothing to do when start_y >= end_y. On that path rbx and
; rbp have not been touched, so the exit label sits after their restores.
	cmp	r8d, r9d
	jae	$LN3@decompress
; Loop path only: rbx is saved at [rsp+32] (entry rsp+8).
	mov	QWORD PTR [rsp+32], rbx

; 94   : 	return a < b ? a : b;

; r9d = in_linesize * start_y, r8d = start_y * out_linesize. Both products
; are 32-bit and wrap modulo 2^32.
	mov	r9d, edx
	imul	r9d, r10d
	imul	r8d, esi
; eax = end_y - start_y = number of rows (non-zero because of the jae above)
	sub	eax, r10d
; Loop path only: rbp is saved at [rsp+40] (entry rsp+16).
	mov	QWORD PTR [rsp+40], rbp
; rbp = output base pointer (fifth argument, read from the stack)
	mov	rbp, QWORD PTR output$[rsp]
; rbx = rows remaining; the 32-bit write zero-extends, so the 64-bit
; "sub rbx, 1" at the bottom of the loop counts correctly.
	mov	ebx, eax
; Padding emitted by the compiler ahead of the loop label.
	npad	8
$LL4@decompress:

; 559  : 		const __m128i *input0;
; 560  : 		register __m128i *output0;
; 561  : 		uint32_t x;
; 562  :
; 563  : 		input0 = (const __m128i*)(input + y * in_linesize);

; rdx = zero-extended 32-bit source row offset (base added two lines below)
	mov	edx, r9d

; 564  : 		output0 = (__m128i*)(output + y * out_linesize);

; rax = zero-extended 32-bit destination row offset
	mov	eax, r8d
; rdx = input0, rax = output0
	add	rdx, r14
	add	rax, rbp

; 565  :
; 566  : 		for (x = 0; x < width_d4; x++) {

; Skip the inner loop entirely when a row holds no complete 16-byte group.
	test	r11d, r11d
	je	SHORT $LN2@decompress

; 559  : 		const __m128i *input0;
; 560  : 		register __m128i *output0;
; 561  : 		uint32_t x;
; 562  :
; 563  : 		input0 = (const __m128i*)(input + y * in_linesize);

; r10 = groups remaining in this row (zero-extended copy of width_d4)
	mov	r10d, r11d
; Padding emitted by the compiler ahead of the loop label.
	npad	12
$LL7@decompress:

; 567  : 			const __m128i tmp = _mm_load_si128(input0++);

; xmm2 = tmp, loaded through the pre-increment pointer kept in rcx
	mov	rcx, rdx
	add	rdx, 16
	movdqa	xmm2, XMMWORD PTR [rcx]

; 568  : 			const __m128i r = _MM_AND_SRLI_EPI32(tmp, mask_r, 22);
; 569  : 			const __m128i g = _MM_AND_SRLI_EPI32(tmp, mask_g,  2);
; 570  : 			const __m128i b = _MM_AND_SLLI_EPI32(tmp, mask_b, 18);
; 571  :
; 572  : 			_mm_stream_si128(output0++,

; The three field computations are interleaved by the compiler:
;     xmm1 = ((tmp & mask_r) >> 22) | base     -> then accumulates g and b
;     xmm0 = (tmp & mask_g) >> 2               (g)
;     xmm2 = (tmp & mask_b) << 18              (b, computed in place)
; rcx holds the pre-increment destination pointer for the final store.
	mov	rcx, rax
	movdqa	xmm1, xmm2
	movdqa	xmm0, xmm2
	pand	xmm1, xmm4
	pand	xmm0, xmm5
	psrld	xmm1, 22
	pand	xmm2, xmm6
	por	xmm1, xmm3
	psrld	xmm0, 2
	por	xmm1, xmm0
	pslld	xmm2, 18
	add	rax, 16
	por	xmm1, xmm2
; Non-temporal 16-byte store of base | r | g | b
	movntdq	XMMWORD PTR [rcx], xmm1
	sub	r10, 1
	jne	SHORT $LL7@decompress
$LN2@decompress:

; 557  :
; 558  : 	for (y = start_y; y < end_y; y++) {

; Advance both 32-bit row offsets by their strides (additions wrap modulo
; 2^32) and count the finished row.
	add	r9d, edi
	add	r8d, esi
	sub	rbx, 1
	jne	SHORT $LL4@decompress
; Restore the two registers that were saved only on the loop path.
	mov	rbp, QWORD PTR [rsp+40]
	mov	rbx, QWORD PTR [rsp+32]
$LN3@decompress:

; 573  : 					_MM_OR_SI128_OP4(base, r, g, b));
; 574  : 		}
; 575  : 	}
; 576  : }

; Common epilogue: reload rsi / rdi from the slots written at entry, restore
; xmm6, release the 16-byte save area and pop r14.
	mov	rsi, QWORD PTR [rsp+48]
	mov	rdi, QWORD PTR [rsp+56]
	movaps	xmm6, XMMWORD PTR [rsp]
	add	rsp, 16
	pop	r14
	ret	0
decompress_r10l ENDP
_TEXT	ENDS

## decompress_r10l.c
#include <stddef.h>
#include <stdint.h>

#include <xmmintrin.h>
#include <emmintrin.h>

/* (a | b) | c on 128-bit integer vectors. */
#define _MM_OR_SI128_OP3(a, b, c) \
	_mm_or_si128(_mm_or_si128((a), (b)), (c))
/* ((a | b) | c) | d on 128-bit integer vectors. */
#define _MM_OR_SI128_OP4(a, b, c, d) \
	_mm_or_si128(_MM_OR_SI128_OP3((a), (b), (c)), (d))
/* Mask `v` with `mask`, then shift every 32-bit lane left by `count` bits. */
#define _MM_AND_SLLI_EPI32(v, mask, count) \
	_mm_slli_epi32(_mm_and_si128((v), (mask)), (count))
/* Mask `v` with `mask`, then logically shift every 32-bit lane right by
 * `count` bits (zeros are shifted in). */
#define _MM_AND_SRLI_EPI32(v, mask, count) \
	_mm_srli_epi32(_mm_and_si128((v), (mask)), (count))

/*
 * Convert rows [start_y, end_y) of `input` into `output`, 16 bytes
 * (four 32-bit words) at a time using SSE2.
 *
 * Every 32-bit word w is rewritten as
 *     0xC0000000 | ((w & 0xFFC00000) >> 22)    bits 31..22 -> bits  9..0
 *                | ((w & 0x003FF000) >>  2)    bits 21..12 -> bits 19..10
 *                | ((w & 0x00000FFC) << 18)    bits 11..2  -> bits 29..20
 * so the two lowest input bits are dropped and the two highest output
 * bits are always set.
 *
 * input         source buffer; row y starts at input + y * in_linesize
 * in_linesize   source row stride in bytes
 * start_y       first row to process
 * end_y         one past the last row; nothing happens if start_y >= end_y
 * output        destination buffer; row y starts at output + y * out_linesize
 * out_linesize  destination row stride in bytes
 *
 * Only min(in_linesize, out_linesize) / 16 complete 16-byte groups are
 * processed per row; a trailing remainder shorter than 16 bytes is neither
 * read nor written.
 *
 * _mm_load_si128 and _mm_stream_si128 both require 16-byte-aligned
 * addresses, so every row start in both buffers must be 16-byte aligned.
 *
 * NOTE(review): results are written with non-temporal stores and no
 * _mm_sfence() is issued here; if another thread consumes `output`,
 * confirm that the caller provides the fence.
 */
void decompress_r10l(
		const uint8_t *input, const uint32_t in_linesize,
		uint32_t start_y, uint32_t end_y,
		uint8_t *output, uint32_t out_linesize)
{
	/* Complete 16-byte groups that fit in both a source and a
	 * destination row. */
	uint32_t width_d4 = min_uint32(in_linesize, out_linesize) / 16;
	uint32_t y;

	const __m128i base   = _mm_set1_epi32(0xC0000000);
	const __m128i mask_r = _mm_set1_epi32(0xFFC00000);
	const __m128i mask_g = _mm_set1_epi32(0x003FF000);
	const __m128i mask_b = _mm_set1_epi32(0x00000FFC);

	for (y = start_y; y < end_y; y++) {
		const __m128i *input0;
		register __m128i *output0;
		uint32_t x;

		/* Widen before multiplying: y * linesize evaluated as
		 * uint32_t wraps modulo 2^32 and would address the wrong
		 * row once the byte offset reaches 4 GiB. */
		input0 = (const __m128i*)(input + (size_t)y * in_linesize);
		output0 = (__m128i*)(output + (size_t)y * out_linesize);

		for (x = 0; x < width_d4; x++) {
			const __m128i tmp = _mm_load_si128(input0++);
			const __m128i r = _MM_AND_SRLI_EPI32(tmp, mask_r, 22);
			const __m128i g = _MM_AND_SRLI_EPI32(tmp, mask_g,  2);
			const __m128i b = _MM_AND_SLLI_EPI32(tmp, mask_b, 18);

			_mm_stream_si128(output0++,
					_MM_OR_SI128_OP4(base, r, g, b));
		}
	}
}
	; Function compile flags: /Ogtpy
	; File
	; COMDAT decompress_r10l
	_TEXT SEGMENT
	; Argument offsets relative to rsp AFTER the prologue (push r14 +
	; sub rsp, 16 = 24 bytes below the entry rsp). Only output$ and
	; out_linesize$ are read from the stack; the first four arguments
	; are consumed from rcx, edx, r8d and r9d.
	input$ = 32
	in_linesize$ = 40
	start_y$ = 48
	end_y$ = 56
	output$ = 64
	out_linesize$ = 72
	;---------------------------------------------------------------
	; void decompress_r10l(const uint8_t *input, uint32_t in_linesize,
	;                      uint32_t start_y, uint32_t end_y,
	;                      uint8_t *output, uint32_t out_linesize)
	;
	; Compiler-generated listing; register and stack usage is
	; consistent with the Microsoft x64 calling convention.
	;
	; Each 32-bit lane w of every 16-byte group in rows
	; [start_y, end_y) is rewritten as
	;     0xC0000000 | ((w & 0xFFC00000) >> 22)
	;                | ((w & 0x003FF000) >>  2)
	;                | ((w & 0x00000FFC) << 18)
	; and written with a non-temporal store.
	;
	; Loop registers:
	;     r14 = input base        rbp = output base
	;     edi = in_linesize       esi = out_linesize
	;     r9d / r8d = 32-bit source / destination row offsets
	;     rbx = rows remaining    r11d = 16-byte groups per row
	;     r10 = groups remaining  rdx / rax = current src / dst
	;     xmm3 = base, xmm4 = mask_r, xmm5 = mask_g, xmm6 = mask_b
	;
	; Saved and restored: rsi, rdi, r14, xmm6 always; rbx, rbp only
	; when the row loop is entered.
	; Modified without being restored: rax, rcx, rdx, r8-r11,
	; xmm0-xmm5, flags.
	;---------------------------------------------------------------
	decompress_r10l PROC ; COMDAT

	; 549 : {

	$LN20:
	; rsi / rdi are parked in stack slots above the return address;
	; after the 24-byte frame is set up the same slots are [rsp+48]
	; and [rsp+56], which is where the epilogue reloads them.
	mov QWORD PTR [rsp+24], rsi
	mov QWORD PTR [rsp+32], rdi
	push r14
	sub rsp, 16

	; 94 : return a < b ? a : b;

	; esi = out_linesize (sixth argument, read from the stack)
	mov esi, DWORD PTR out_linesize$[rsp]

	; 549 : {

	; eax = end_y; becomes the row count further down
	mov eax, r9d

	; 551 : uint32_t y;
	; 552 :
	; 553 : const __m128i base = _mm_set1_epi32(0xC0000000);

	movdqa xmm3, XMMWORD PTR __xmm@c0000000c0000000c0000000c0000000

	; 94 : return a < b ? a : b;

	; Unsigned compare in_linesize : out_linesize; consumed by the
	; cmovb below (the intervening moves leave flags untouched).
	cmp edx, esi

	; 554 : const __m128i mask_r = _mm_set1_epi32(0xFFC00000);

	movdqa xmm4, XMMWORD PTR __xmm@ffc00000ffc00000ffc00000ffc00000

	; 94 : return a < b ? a : b;

	mov r11d, esi

	; 555 : const __m128i mask_g = _mm_set1_epi32(0x003FF000);

	movdqa xmm5, XMMWORD PTR __xmm@003ff000003ff000003ff000003ff000

	; 94 : return a < b ? a : b;

	; r11d = min(in_linesize, out_linesize), unsigned
	cmovb r11d, edx

	; 550 : uint32_t width_d4 = min_uint32(in_linesize, out_linesize) / 16;

	; r11d = width_d4 (divide by 16 as a logical right shift)
	shr r11d, 4
	; r10d = start_y
	mov r10d, r8d
	; Preserve the caller's xmm6 in the area reserved by "sub rsp, 16"
	; before it is overwritten with mask_b.
	movaps XMMWORD PTR [rsp], xmm6
	; edi = in_linesize (per-row source stride)
	mov edi, edx

	; 556 : const __m128i mask_b = _mm_set1_epi32(0x00000FFC);

	movdqa xmm6, XMMWORD PTR __xmm@00000ffc00000ffc00000ffc00000ffc
	; r14 = input base pointer
	mov r14, rcx

	; 557 :
	; 558 : for (y = start_y; y < end_y; y++) {

	; Unsigned test: skip everything when start_y >= end_y. rbx / rbp
	; are untouched on that path, so the exit label follows their
	; restores.
	cmp r8d, r9d
	jae $LN3@decompress
	; Loop path only: save rbx at [rsp+32] (entry rsp+8).
	mov QWORD PTR [rsp+32], rbx

	; 94 : return a < b ? a : b;

	; r9d = in_linesize * start_y, r8d = start_y * out_linesize;
	; both products are 32-bit and wrap modulo 2^32.
	mov r9d, edx
	imul r9d, r10d
	imul r8d, esi
	; eax = end_y - start_y = number of rows (non-zero here)
	sub eax, r10d
	; Loop path only: save rbp at [rsp+40] (entry rsp+16).
	mov QWORD PTR [rsp+40], rbp
	; rbp = output base pointer (fifth argument, read from the stack)
	mov rbp, QWORD PTR output$[rsp]
	; rbx = rows remaining (32-bit write zero-extends)
	mov ebx, eax
	; Padding emitted by the compiler ahead of the loop label.
	npad 8
	$LL4@decompress:

	; 559 : const __m128i *input0;
	; 560 : register __m128i *output0;
	; 561 : uint32_t x;
	; 562 :
	; 563 : input0 = (const __m128i*)(input + y * in_linesize);

	; rdx = zero-extended 32-bit source row offset
	mov edx, r9d

	; 564 : output0 = (__m128i*)(output + y * out_linesize);

	; rax = zero-extended 32-bit destination row offset
	mov eax, r8d
	; rdx = input0, rax = output0
	add rdx, r14
	add rax, rbp

	; 565 :
	; 566 : for (x = 0; x < width_d4; x++) {

	; Skip the inner loop when a row holds no complete 16-byte group.
	test r11d, r11d
	je SHORT $LN2@decompress

	; 559 : const __m128i *input0;
	; 560 : register __m128i *output0;
	; 561 : uint32_t x;
	; 562 :
	; 563 : input0 = (const __m128i*)(input + y * in_linesize);

	; r10 = groups remaining in this row
	mov r10d, r11d
	; Padding emitted by the compiler ahead of the loop label.
	npad 12
	$LL7@decompress:

	; 567 : const __m128i tmp = _mm_load_si128(input0++);

	; xmm2 = tmp, loaded through the pre-increment pointer in rcx
	mov rcx, rdx
	add rdx, 16
	movdqa xmm2, XMMWORD PTR [rcx]

	; 568 : const __m128i r = _MM_AND_SRLI_EPI32(tmp, mask_r, 22);
	; 569 : const __m128i g = _MM_AND_SRLI_EPI32(tmp, mask_g, 2);
	; 570 : const __m128i b = _MM_AND_SLLI_EPI32(tmp, mask_b, 18);
	; 571 :
	; 572 : _mm_stream_si128(output0++,

	; Interleaved by the compiler:
	;     xmm1 = ((tmp & mask_r) >> 22) | base, then accumulates g, b
	;     xmm0 = (tmp & mask_g) >> 2
	;     xmm2 = (tmp & mask_b) << 18
	; rcx keeps the pre-increment destination pointer for the store.
	mov rcx, rax
	movdqa xmm1, xmm2
	movdqa xmm0, xmm2
	pand xmm1, xmm4
	pand xmm0, xmm5
	psrld xmm1, 22
	pand xmm2, xmm6
	por xmm1, xmm3
	psrld xmm0, 2
	por xmm1, xmm0
	pslld xmm2, 18
	add rax, 16
	por xmm1, xmm2
	; Non-temporal 16-byte store of base | r | g | b
	movntdq XMMWORD PTR [rcx], xmm1
	sub r10, 1
	jne SHORT $LL7@decompress
	$LN2@decompress:

	; 557 :
	; 558 : for (y = start_y; y < end_y; y++) {

	; Advance both 32-bit row offsets by their strides and count the
	; finished row.
	add r9d, edi
	add r8d, esi
	sub rbx, 1
	jne SHORT $LL4@decompress
	; Restore the registers saved only on the loop path.
	mov rbp, QWORD PTR [rsp+40]
	mov rbx, QWORD PTR [rsp+32]
	$LN3@decompress:

	; 573 : _MM_OR_SI128_OP4(base, r, g, b));
	; 574 : }
	; 575 : }
	; 576 : }

	; Common epilogue: reload rsi / rdi, restore xmm6, release the
	; 16-byte save area and pop r14.
	mov rsi, QWORD PTR [rsp+48]
	mov rdi, QWORD PTR [rsp+56]
	movaps xmm6, XMMWORD PTR [rsp]
	add rsp, 16
	pop r14
	ret 0
	decompress_r10l ENDP
	_TEXT ENDS
	#include <xmmintrin.h>
	#include <emmintrin.h>

	/* (a | b) | c on 128-bit integer vectors. */
	#define _MM_OR_SI128_OP3(a, b, c) \
		_mm_or_si128(_mm_or_si128((a), (b)), (c))
	/* ((a | b) | c) | d on 128-bit integer vectors. */
	#define _MM_OR_SI128_OP4(a, b, c, d) \
		_mm_or_si128(_MM_OR_SI128_OP3((a), (b), (c)), (d))
	/* Mask `v` with `mask`, then shift every 32-bit lane left by `count` bits. */
	#define _MM_AND_SLLI_EPI32(v, mask, count) \
		_mm_slli_epi32(_mm_and_si128((v), (mask)), (count))
	/* Mask `v` with `mask`, then logically shift every 32-bit lane right by
	 * `count` bits (zeros are shifted in). */
	#define _MM_AND_SRLI_EPI32(v, mask, count) \
		_mm_srli_epi32(_mm_and_si128((v), (mask)), (count))

	/*
	 * Convert rows [start_y, end_y) of `input` into `output`, 16 bytes
	 * (four 32-bit words) at a time using SSE2.
	 *
	 * Every 32-bit word w is rewritten as
	 *     0xC0000000 | ((w & 0xFFC00000) >> 22)    bits 31..22 -> bits  9..0
	 *                | ((w & 0x003FF000) >>  2)    bits 21..12 -> bits 19..10
	 *                | ((w & 0x00000FFC) << 18)    bits 11..2  -> bits 29..20
	 * so the two lowest input bits are dropped and the two highest output
	 * bits are always set.
	 *
	 * input         source buffer; row y starts at input + y * in_linesize
	 * in_linesize   source row stride in bytes
	 * start_y       first row to process
	 * end_y         one past the last row; nothing happens if start_y >= end_y
	 * output        destination buffer; row y starts at output + y * out_linesize
	 * out_linesize  destination row stride in bytes
	 *
	 * Only min(in_linesize, out_linesize) / 16 complete 16-byte groups are
	 * processed per row; a trailing remainder shorter than 16 bytes is
	 * neither read nor written.
	 *
	 * _mm_load_si128 and _mm_stream_si128 both require 16-byte-aligned
	 * addresses, so every row start in both buffers must be 16-byte aligned.
	 *
	 * NOTE(review): results are written with non-temporal stores and no
	 * _mm_sfence() is issued here; if another thread consumes `output`,
	 * confirm that the caller provides the fence.
	 */
	void decompress_r10l(
			const uint8_t *input, const uint32_t in_linesize,
			uint32_t start_y, uint32_t end_y,
			uint8_t *output, uint32_t out_linesize)
	{
		/* Complete 16-byte groups that fit in both a source and a
		 * destination row. */
		uint32_t width_d4 = min_uint32(in_linesize, out_linesize) / 16;
		uint32_t y;

		const __m128i base = _mm_set1_epi32(0xC0000000);
		const __m128i mask_r = _mm_set1_epi32(0xFFC00000);
		const __m128i mask_g = _mm_set1_epi32(0x003FF000);
		const __m128i mask_b = _mm_set1_epi32(0x00000FFC);

		for (y = start_y; y < end_y; y++) {
			const __m128i *input0;
			register __m128i *output0;
			uint32_t x;

			/* Pointer casts and the row-offset multiplication are
			 * spelled out with their `*` tokens. The offset is
			 * widened before multiplying: y * linesize evaluated
			 * as uint32_t wraps modulo 2^32 and would address the
			 * wrong row once the byte offset reaches 4 GiB. */
			input0 = (const __m128i*)(input + (size_t)y * in_linesize);
			output0 = (__m128i*)(output + (size_t)y * out_linesize);

			for (x = 0; x < width_d4; x++) {
				const __m128i tmp = _mm_load_si128(input0++);
				const __m128i r = _MM_AND_SRLI_EPI32(tmp, mask_r, 22);
				const __m128i g = _MM_AND_SRLI_EPI32(tmp, mask_g, 2);
				const __m128i b = _MM_AND_SLLI_EPI32(tmp, mask_b, 18);

				_mm_stream_si128(output0++,
						_MM_OR_SI128_OP4(base, r, g, b));
			}
		}
	}