public
Created

MMX vs. SSE2 Performance Comparison

  • Download Gist
gistfile1.asm
Assembly
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121
; Syntax: MASM (ml.exe), 32-bit x86, flat memory model, C (cdecl) linkage.
.686                                    ; processor level; must precede ".model flat"
.mmx                                    ; accept MMX mnemonics and mm0-mm7
.xmm                                    ; accept SSE/SSE2 mnemonics and xmm0-xmm7
.model flat, C                          ; memory model first, then language type
 
.data
 
.code
 
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; MMX ASSEMBLY
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
;----------------------------------------------------------------
; euclidean_MMX(x_ptr, y_ptr, count)
;   C-equivalent (assumed from usage; confirm against the caller):
;     void euclidean_MMX(unsigned char *x, const unsigned char *y,
;                        unsigned count);
;
; For every full 8-byte block, computes per byte
;     x[i] = pack(x[i]^2) + pack(y[i]^2)
; and writes the result back over x.
;
; ABI:    32-bit cdecl; arguments are read from the stack.
; In:     [esp+4]  = x_ptr  (read and overwritten)
;         [esp+8]  = y_ptr  (read only)
;         [esp+12] = count in bytes; only count/8 whole blocks are
;                    processed, trailing bytes are left untouched.
; Out:    none (results stored through x_ptr)
; Clobb:  eax, ecx, edx, mm0, mm1, mm2, mm7, flags
;
; NOTE(review): packuswb interprets its source words as SIGNED, so a
; square >= 8000h (input byte >= 182) packs to 0, while inputs 16..181
; saturate to 255. The final paddb wraps modulo 256 instead of
; saturating. The arithmetic is kept exactly as written; confirm
; whether this is the intended result.
;----------------------------------------------------------------
euclidean_MMX proc
        mov     eax, [esp+1*4]          ; eax = x_ptr
        mov     edx, [esp+2*4]          ; edx = y_ptr (volatile reg; ebx is callee-saved)
        mov     ecx, [esp+3*4]          ; ecx = byte count
        shr     ecx, 3                  ; ecx = number of 8-byte blocks
        jz      MmxDone                 ; no full block: avoid dec wrapping to 0FFFFFFFFh

        pxor    mm7, mm7                ; mm7 = 0, zero source for byte -> word unpack

MmxLoop:                                ; invariant: ecx = blocks remaining, > 0
        ;
        ; X^2
        ;
        movq    mm0, qword ptr [eax]    ; 8 bytes of x
        movq    mm1, mm0                ; second copy for the upper half

        punpckhbw mm1, mm7              ; upper 4 bytes -> 4 zero-extended words
        pmullw  mm1, mm1                ; square, keep low 16 bits of each product

        punpcklbw mm0, mm7              ; lower 4 bytes -> 4 zero-extended words
        pmullw  mm0, mm0                ; square, keep low 16 bits of each product
        packuswb mm0, mm1               ; words -> bytes (low half from mm0, high from mm1)
                                        ; mm0 = packed x^2

        ;
        ; Y^2
        ;
        movq    mm1, qword ptr [edx]    ; 8 bytes of y
        movq    mm2, mm1                ; second copy for the upper half

        punpckhbw mm2, mm7              ; upper 4 bytes -> 4 zero-extended words
        pmullw  mm2, mm2                ; square, keep low 16 bits of each product

        punpcklbw mm1, mm7              ; lower 4 bytes -> 4 zero-extended words
        pmullw  mm1, mm1                ; square, keep low 16 bits of each product
        packuswb mm1, mm2               ; words -> bytes
                                        ; mm1 = packed y^2

        ; Sum and store
        paddb   mm0, mm1                ; z2 = x2 + y2 (wrapping byte add)
        movq    qword ptr [eax], mm0    ; overwrite the x block with the result
        add     eax, 8                  ; next x block
        add     edx, 8                  ; next y block
        dec     ecx
        jnz     MmxLoop

        emms                            ; leave MMX state so x87 code can run afterwards
MmxDone:
        ret
euclidean_MMX endp
 
 
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; SSE2 ASSEMBLY
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
;----------------------------------------------------------------
; euclidean_SSE(x_ptr, y_ptr, count)
;   C-equivalent (assumed from usage; confirm against the caller):
;     void euclidean_SSE(unsigned char *x, const unsigned char *y,
;                        unsigned count);
;
; SSE2 variant: for every full 16-byte block, computes per byte
;     x[i] = pack(x[i]^2) + pack(y[i]^2)
; and writes the result back over x.
;
; ABI:    32-bit cdecl; arguments are read from the stack.
; In:     [esp+4]  = x_ptr  (read and overwritten; must be 16-byte
;                    aligned because movdqa faults otherwise)
;         [esp+8]  = y_ptr  (read only; must be 16-byte aligned)
;         [esp+12] = count in bytes; only count/16 whole blocks are
;                    processed, trailing bytes are left untouched.
; Out:    none (results stored through x_ptr)
; Clobb:  eax, ecx, edx, xmm0, xmm1, xmm2, xmm7, flags
;
; NOTE(review): packuswb interprets its source words as SIGNED, so a
; square >= 8000h (input byte >= 182) packs to 0, while inputs 16..181
; saturate to 255. The final paddb wraps modulo 256 instead of
; saturating. The arithmetic is kept exactly as written; confirm
; whether this is the intended result.
;----------------------------------------------------------------
euclidean_SSE proc
        mov     eax, [esp+1*4]          ; eax = x_ptr
        mov     edx, [esp+2*4]          ; edx = y_ptr (volatile reg; ebx is callee-saved)
        mov     ecx, [esp+3*4]          ; ecx = byte count
        shr     ecx, 4                  ; ecx = number of 16-byte blocks
        jz      SseDone                 ; no full block: avoid dec wrapping to 0FFFFFFFFh

        pxor    xmm7, xmm7              ; xmm7 = 0, zero source for byte -> word unpack

SseLoop:                                ; invariant: ecx = blocks remaining, > 0
        ;
        ; X^2
        ;
        movdqa  xmm0, xmmword ptr [eax] ; 16 bytes of x (aligned load)
        movdqa  xmm1, xmm0              ; second copy for the upper half

        punpckhbw xmm1, xmm7            ; upper 8 bytes -> 8 zero-extended words
        pmullw  xmm1, xmm1              ; square, keep low 16 bits of each product

        punpcklbw xmm0, xmm7            ; lower 8 bytes -> 8 zero-extended words
        pmullw  xmm0, xmm0              ; square, keep low 16 bits of each product
        packuswb xmm0, xmm1             ; words -> bytes (low half from xmm0, high from xmm1)
                                        ; xmm0 = packed x^2

        ;
        ; Y^2
        ;
        movdqa  xmm1, xmmword ptr [edx] ; 16 bytes of y (aligned load)
        movdqa  xmm2, xmm1              ; second copy for the upper half

        punpckhbw xmm2, xmm7            ; upper 8 bytes -> 8 zero-extended words
        pmullw  xmm2, xmm2              ; square, keep low 16 bits of each product

        punpcklbw xmm1, xmm7            ; lower 8 bytes -> 8 zero-extended words
        pmullw  xmm1, xmm1              ; square, keep low 16 bits of each product
        packuswb xmm1, xmm2             ; words -> bytes
                                        ; xmm1 = packed y^2

        ; Sum and store
        paddb   xmm0, xmm1              ; z2 = x2 + y2 (wrapping byte add)
        movdqa  xmmword ptr [eax], xmm0 ; overwrite the x block with the result
        add     eax, 16                 ; next x block
        add     edx, 16                 ; next y block
        dec     ecx
        jnz     SseLoop

        ; No emms here: this routine only uses xmm registers, so there is
        ; no MMX state to clear.
SseDone:
        ret
euclidean_SSE endp
end                                     ; end of assembly source (no entry point named)

Please sign in to comment on this gist.

Something went wrong with that request. Please try again.