Skip to content

Instantly share code, notes, and snippets.

@KindDragon
Created August 5, 2012 22:32
Show Gist options
  • Save KindDragon/3267532 to your computer and use it in GitHub Desktop.
Save KindDragon/3267532 to your computer and use it in GitHub Desktop.
Test Asm
; -----------------------------------------------------------------------------
; Mixed source/disassembly listing for Test(kernel, src, dst, n).
; Each C++ line is followed by the 32-bit x86 SSE instructions (Intel operand
; order: op dst,src) that the listing attributes to it. Because the compiler
; scheduled instructions across statements, an instruction does not always
; belong to the source line printed directly above it.
;
; Register / stack roles as they can be read off the instructions below:
;   ecx                  kernel pointer at entry; then the 14-iteration prime
;                        counter; then the dst pointer in the pipeline loop
;   edx                  src pointer (advanced by 4 bytes per sample)
;   esi                  countdown copy of n in the pipeline loop
;   eax                  copy of dst taken before dst is advanced
;   xmm0                 zero, also reused as scratch for products
;   xmm1                 scratch for f0/f1 during setup, then broadcast sample s
;   xmm5,xmm2,xmm3,xmm4  accumulators x0,x1,x2,x3 (in that order)
;   xmm7,xmm6            f2,f3
;   [esp+10h],[esp+20h]  f0,f1 (offsets as seen after `push esi`)
; NOTE(review): kernel in ecx and src in edx looks like a register-based
; calling convention, while dst and n are read via symbolic stack names --
; confirm against the build settings.
; -----------------------------------------------------------------------------
void Test( float* kernel, float* src, float* dst, int n )
{
; NOTE(review): `in al,dx` is almost certainly a decoding artifact, not real
; code. Its opcode (ECh) is the final byte of `mov ebp,esp` (8B EC), and the
; epilogue at the bottom does `mov esp,ebp` / `pop ebp`, which implies a
; `push ebp` / `mov ebp,esp` prologue here. The listing probably started in
; the middle of that instruction -- confirm against the raw bytes.
in al,dx
; Round esp down to a 16-byte boundary, then reserve 2Ch bytes of locals.
and esp,0FFFFFFF0h
sub esp,2Ch
__m128 zero = _mm_setzero_ps();
__m128 x0 = zero;
__m128 x1 = zero;
__m128 x2 = zero;
__m128 x3 = zero;
__m128 f0;
__m128 f1;
__m128 f2;
__m128 f3;
// init filter
__m128 k0 = _mm_loadu_ps(kernel + 0);
__m128 k1 = _mm_loadu_ps(kernel + 4);
; Unaligned loads: xmm6 = k1 (kernel[4..7]), xmm7 = k0 (kernel[0..3]).
movups xmm6,xmmword ptr [ecx+10h]
movups xmm7,xmmword ptr [ecx]
f0 = _mm_shuffle_ps(k1, k1, _MM_SHUFFLE(0, 1, 2, 3));
; xmm1 = k1 with its four lanes reversed (imm 1Bh == _MM_SHUFFLE(0,1,2,3)).
movaps xmm1,xmm6
shufps xmm1,xmm6,1Bh
; xmm0 = 0.0 in all lanes; this is `zero`.
xorps xmm0,xmm0
; Spill f0. This slot is addressed as [esp+10h] once `push esi` below has
; lowered esp by 4.
movaps xmmword ptr [esp+0Ch],xmm1
f1 = _mm_shuffle_ps(k0, k0, _MM_SHUFFLE(0, 1, 2, 3));
; xmm1 = k0 with its lanes reversed; spilled as f1 further down.
movaps xmm1,xmm7
shufps xmm1,xmm7,1Bh
f2 = _mm_move_ss(k0, k1);
; Replace the low lane of k0 (xmm7) with the low lane of k1 (xmm6).
movss xmm7,xmm6
f2 = _mm_shuffle_ps(f2, f2, _MM_SHUFFLE(0, 3, 2, 1));
f3 = _mm_move_ss(k1, zero);
; Replace the low lane of k1 (xmm6) with 0.0.
movss xmm6,xmm0
; Save esi; it is restored by `pop esi` in the epilogue.
push esi
; Zero the four accumulators: xmm5 = x0, xmm2 = x1, xmm3 = x2, xmm4 = x3.
movaps xmm5,xmm0
movaps xmm2,xmm0
movaps xmm3,xmm0
movaps xmm4,xmm0
; Spill f1.
movaps xmmword ptr [esp+20h],xmm1
; Rotate the lanes of f2 down by one (imm 39h == _MM_SHUFFLE(0,3,2,1)).
shufps xmm7,xmm7,39h
f3 = _mm_shuffle_ps(f3, f3, _MM_SHUFFLE(0, 3, 2, 1));
; Same one-lane rotation for f3.
shufps xmm6,xmm6,39h
; ecx = 14, the trip count of the prime loop (`for(int i=0; i<14; ++i)`).
mov ecx,0Eh
; Leaves ecx unchanged. Presumably padding so the loop entry is aligned; the
; branch target shown below, 010B1050h, is a multiple of 16 -- TODO confirm.
lea ecx,[ecx]
; NOTE(review): the next nine source lines are printed twice in this listing.
; The instructions contain only one prime-loop body (a single `dec ecx` /
; `jne` pair), so the repetition looks like a listing artifact, not unrolling.
x0 = _mm_move_ss(x0, x1);
x0 = _mm_shuffle_ps(x0, x0, _MM_SHUFFLE(0, 3, 2, 1));
x1 = _mm_move_ss(x1, x2);
x1 = _mm_shuffle_ps(x1, x1, _MM_SHUFFLE(0, 3, 2, 1));
x2 = _mm_move_ss(x2, x3);
x2 = _mm_shuffle_ps(x2, x2, _MM_SHUFFLE(0, 3, 2, 1));
x3 = _mm_move_ss(x3, zero);
x3 = _mm_shuffle_ps(x3, x3, _MM_SHUFFLE(0, 3, 2, 1));
__m128 s = _mm_load1_ps(src++);
; ---- prime loop body ----
; Load one float from src into the low lane of xmm1.
movss xmm1,dword ptr [edx]
; Low-lane hand-off: x0 takes x1's low lane, x1 takes x2's, x2 takes x3's.
movss xmm5,xmm2
movss xmm2,xmm3
movss xmm3,xmm4
x0 = _mm_move_ss(x0, x1);
x0 = _mm_shuffle_ps(x0, x0, _MM_SHUFFLE(0, 3, 2, 1));
x1 = _mm_move_ss(x1, x2);
x1 = _mm_shuffle_ps(x1, x1, _MM_SHUFFLE(0, 3, 2, 1));
x2 = _mm_move_ss(x2, x3);
x2 = _mm_shuffle_ps(x2, x2, _MM_SHUFFLE(0, 3, 2, 1));
x3 = _mm_move_ss(x3, zero);
x3 = _mm_shuffle_ps(x3, x3, _MM_SHUFFLE(0, 3, 2, 1));
__m128 s = _mm_load1_ps(src++);
; x3 takes 0.0 in its low lane, then the sample is broadcast to all four
; lanes of xmm1 (this is s).
movss xmm4,xmm0
shufps xmm1,xmm1,0
x0 = _mm_add_ps(x0, _mm_mul_ps(f0, s));
; xmm0 = s * f0 (f0 reloaded from its spill slot).
movaps xmm0,xmm1
mulps xmm0,xmmword ptr [esp+10h]
; One-lane rotations of x0, x1, x2, completing the move_ss/shuffle pairs.
shufps xmm5,xmm5,39h
shufps xmm2,xmm2,39h
shufps xmm3,xmm3,39h
; x0 += s * f0
addps xmm5,xmm0
x1 = _mm_add_ps(x1, _mm_mul_ps(f1, s));
; xmm0 = s * f1 (f1 reloaded from its spill slot).
movaps xmm0,xmm1
mulps xmm0,xmmword ptr [esp+20h]
; src++ (4 bytes = one float).
add edx,4
; One-lane rotation of x3.
shufps xmm4,xmm4,39h
; x1 += s * f1
addps xmm2,xmm0
x2 = _mm_add_ps(x2, _mm_mul_ps(f2, s));
; xmm0 = s * f2
movaps xmm0,xmm1
mulps xmm0,xmm7
x3 = _mm_add_ps(x3, _mm_mul_ps(f3, s));
; xmm1 = s * f3 (s is no longer needed this iteration).
mulps xmm1,xmm6
; x2 += s * f2
addps xmm3,xmm0
; xmm0 was used as scratch above; reset it to zero for the next iteration.
xorps xmm0,xmm0
; x3 += s * f3
addps xmm4,xmm1
; Repeat until the 14-iteration counter reaches zero.
dec ecx
jne Test+50h (010B1050h)
// prime
#pragma loop( no_vector )
for(int i=0; i<14; ++i) {
; The listing prints the `for` header after the loop body. The instructions
; attributed to it here set up the pipeline loop: esi = n, ecx = dst.
mov esi,dword ptr [n]
mov ecx,dword ptr [dst]
; Leaves esp unchanged. Presumably alignment padding; the branch target
; shown below, 010B10B0h, is a multiple of 16 -- TODO confirm.
lea esp,[esp]
}
// pipeline
do {
x0 = _mm_move_ss(x0, x1);
x0 = _mm_shuffle_ps(x0, x0, _MM_SHUFFLE(0, 3, 2, 1));
x1 = _mm_move_ss(x1, x2);
x1 = _mm_shuffle_ps(x1, x1, _MM_SHUFFLE(0, 3, 2, 1));
x2 = _mm_move_ss(x2, x3);
x2 = _mm_shuffle_ps(x2, x2, _MM_SHUFFLE(0, 3, 2, 1));
x3 = _mm_move_ss(x3, zero);
x3 = _mm_shuffle_ps(x3, x3, _MM_SHUFFLE(0, 3, 2, 1));
__m128 s = _mm_load1_ps(src++);
; ---- pipeline loop body: same update as the prime loop, plus one store ----
; Load the next sample and re-zero xmm0.
movss xmm1,dword ptr [edx]
xorps xmm0,xmm0
; Low-lane hand-off x1->x0, x2->x1, x3->x2, 0.0->x3.
movss xmm5,xmm2
movss xmm2,xmm3
movss xmm3,xmm4
movss xmm4,xmm0
; Broadcast the sample to all lanes of xmm1 (this is s).
shufps xmm1,xmm1,0
x0 = _mm_add_ps(x0, _mm_mul_ps(f0, s));
; xmm0 = s * f0
movaps xmm0,xmm1
mulps xmm0,xmmword ptr [esp+10h]
; One-lane rotations of x0 and x1.
shufps xmm5,xmm5,39h
shufps xmm2,xmm2,39h
x1 = _mm_add_ps(x1, _mm_mul_ps(f1, s));
x2 = _mm_add_ps(x2, _mm_mul_ps(f2, s));
x3 = _mm_add_ps(x3, _mm_mul_ps(f3, s));
_mm_store_ss(dst++, x0);
; eax = current dst, captured before ecx is advanced below.
mov eax,ecx
; x0 += s * f0
addps xmm5,xmm0
; xmm0 = s * f1
movaps xmm0,xmm1
mulps xmm0,xmmword ptr [esp+20h]
; One-lane rotations of x2 and x3.
shufps xmm3,xmm3,39h
shufps xmm4,xmm4,39h
; dst++
add ecx,4
; x1 += s * f1
addps xmm2,xmm0
; xmm0 = s * f2, xmm1 = s * f3
movaps xmm0,xmm1
mulps xmm0,xmm7
mulps xmm1,xmm6
; src++ (lea leaves the flags untouched).
lea edx,[edx+4]
; x2 += s * f2
addps xmm3,xmm0
; Store the low lane of x0 to the dst address saved in eax.
movss dword ptr [eax],xmm5
; x3 += s * f3
addps xmm4,xmm1
} while(--n);
; NOTE(review): the count is decremented before it is tested, so n == 0 on
; entry would wrap around instead of skipping the loop -- confirm callers
; always pass n >= 1.
dec esi
jne Test+0B0h (010B10B0h)
}
; Epilogue: restore esi, then unwind the frame through ebp.
pop esi
mov esp,ebp
pop ebp
ret
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment