Created
March 31, 2013 18:08
-
-
Save abidrahmank/5281486 to your computer and use it in GitHub Desktop.
MMX v/s SSE2 Performance Comparison
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
; Syntax: MASM (ml.exe). Target: 32-bit x86, flat memory model.
; ABI:    C (cdecl) - arguments on the stack, caller cleans up.
.686                            ; flat model needs a 32-bit CPU mode; must precede .model
.mmx                            ; accept MMX mnemonics and mm0-mm7
.xmm                            ; accept SSE/SSE2 mnemonics and xmm0-xmm7
.model flat, C                  ; memory model first, then language type
.data
.code
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
; | |
; MMX ASSEMBLY | |
; | |
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
;----------------------------------------------------------------
; euclidean_MMX - per-byte "x^2 + y^2" over two byte arrays (MMX).
;
; Presumed C prototype (not visible in this file - confirm with caller):
;   void euclidean_MMX(unsigned char *x, const unsigned char *y,
;                      unsigned count);
;
; ABI:   32-bit cdecl, arguments read directly from the stack.
; In:    [esp+4]  = x_ptr  (read, then overwritten with the result)
;        [esp+8]  = y_ptr  (read only)
;        [esp+12] = count in bytes; only count/8 whole 8-byte groups
;                   are processed, a remainder of 1..7 bytes is left
;                   untouched, and count < 8 does nothing.
; Out:   for every processed byte i:
;            x[i] = (pack(x[i]^2) + pack(y[i]^2)) mod 256
;        where pack() is PACKUSWB: the 16-bit square is read as a
;        SIGNED word and saturated to 0..255, so squares 256..32767
;        become 255 and squares >= 32768 (inputs >= 182) become 0.
; Clobb: eax, ecx, edx, mm0, mm1, mm2, mm7, flags.
;        MMX state is released with EMMS before returning.
;----------------------------------------------------------------
euclidean_MMX proc
        mov     eax, [esp+1*4]          ; eax = x_ptr
        mov     edx, [esp+2*4]          ; edx = y_ptr (caller-saved; ebx is callee-saved in cdecl)
        mov     ecx, [esp+3*4]          ; ecx = byte count
        shr     ecx, 3                  ; ecx = number of 8-byte groups
        jz      MmxDone                 ; no full group: dec/jnz would wrap to ~2^32 iterations
        pxor    mm7, mm7                ; mm7 = 0, used to zero-extend bytes to words
MmxLoop:
        ;
        ; Square the 8 bytes of X
        ;
        movq    mm0, qword ptr [eax]    ; mm0 = x[0..7]
        movq    mm1, mm0                ; second copy for the upper half
        punpckhbw mm1, mm7              ; mm1 = x[4..7] widened to words
        pmullw  mm1, mm1                ; low 16 bits of each square
        punpcklbw mm0, mm7              ; mm0 = x[0..3] widened to words
        pmullw  mm0, mm0                ; low 16 bits of each square
        packuswb mm0, mm1               ; back to 8 bytes, original order, saturated
                                        ; mm0 = packed X^2
        ;
        ; Square the 8 bytes of Y
        ;
        movq    mm1, qword ptr [edx]    ; mm1 = y[0..7]
        movq    mm2, mm1                ; second copy for the upper half
        punpckhbw mm2, mm7              ; mm2 = y[4..7] widened to words
        pmullw  mm2, mm2                ; low 16 bits of each square
        punpcklbw mm1, mm7              ; mm1 = y[0..3] widened to words
        pmullw  mm1, mm1                ; low 16 bits of each square
        packuswb mm1, mm2               ; back to 8 bytes, original order, saturated
                                        ; mm1 = packed Y^2
        ;
        ; Combine and store
        ;
        paddb   mm0, mm1                ; wrapping byte add: X^2 + Y^2
        movq    [eax], mm0              ; result overwrites x[0..7]
        add     eax, 8                  ; next group of X
        add     edx, 8                  ; next group of Y
        dec     ecx                     ; ecx = groups remaining
        jnz     MmxLoop
        emms                            ; leave MMX state so x87 code can run again
MmxDone:
        ret
euclidean_MMX endp
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
; | |
; SSE2 ASSEMBLY | |
; | |
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
;----------------------------------------------------------------
; euclidean_SSE - per-byte "x^2 + y^2" over two byte arrays (SSE2).
;
; Presumed C prototype (not visible in this file - confirm with caller):
;   void euclidean_SSE(unsigned char *x, const unsigned char *y,
;                      unsigned count);
;
; ABI:   32-bit cdecl, arguments read directly from the stack.
; In:    [esp+4]  = x_ptr  (read, then overwritten with the result)
;        [esp+8]  = y_ptr  (read only)
;        [esp+12] = count in bytes; only count/16 whole 16-byte groups
;                   are processed, a remainder of 1..15 bytes is left
;                   untouched, and count < 16 does nothing.
;        Both pointers must be 16-byte aligned: MOVDQA faults on an
;        unaligned address.
; Out:   for every processed byte i:
;            x[i] = (pack(x[i]^2) + pack(y[i]^2)) mod 256
;        where pack() is PACKUSWB: the 16-bit square is read as a
;        SIGNED word and saturated to 0..255, so squares 256..32767
;        become 255 and squares >= 32768 (inputs >= 182) become 0.
; Clobb: eax, ecx, edx, xmm0, xmm1, xmm2, xmm7, flags.
;        No MMX register is used, so no EMMS is issued.
;----------------------------------------------------------------
euclidean_SSE proc
        mov     eax, [esp+1*4]          ; eax = x_ptr
        mov     edx, [esp+2*4]          ; edx = y_ptr (caller-saved; ebx is callee-saved in cdecl)
        mov     ecx, [esp+3*4]          ; ecx = byte count
        shr     ecx, 4                  ; ecx = number of 16-byte groups
        jz      SseDone                 ; no full group: dec/jnz would wrap to ~2^32 iterations
        pxor    xmm7, xmm7              ; xmm7 = 0, used to zero-extend bytes to words
SseLoop:
        ;
        ; Square the 16 bytes of X
        ;
        movdqa  xmm0, xmmword ptr [eax] ; xmm0 = x[0..15]
        movdqa  xmm1, xmm0              ; second copy for the upper half
        punpckhbw xmm1, xmm7            ; xmm1 = x[8..15] widened to words
        pmullw  xmm1, xmm1              ; low 16 bits of each square
        punpcklbw xmm0, xmm7            ; xmm0 = x[0..7] widened to words
        pmullw  xmm0, xmm0              ; low 16 bits of each square
        packuswb xmm0, xmm1             ; back to 16 bytes, original order, saturated
                                        ; xmm0 = packed X^2
        ;
        ; Square the 16 bytes of Y
        ;
        movdqa  xmm1, xmmword ptr [edx] ; xmm1 = y[0..15]
        movdqa  xmm2, xmm1              ; second copy for the upper half
        punpckhbw xmm2, xmm7            ; xmm2 = y[8..15] widened to words
        pmullw  xmm2, xmm2              ; low 16 bits of each square
        punpcklbw xmm1, xmm7            ; xmm1 = y[0..7] widened to words
        pmullw  xmm1, xmm1              ; low 16 bits of each square
        packuswb xmm1, xmm2             ; back to 16 bytes, original order, saturated
                                        ; xmm1 = packed Y^2
        ;
        ; Combine and store
        ;
        paddb   xmm0, xmm1              ; wrapping byte add: X^2 + Y^2
        movdqa  [eax], xmm0             ; result overwrites x[0..15]
        add     eax, 16                 ; next group of X
        add     edx, 16                 ; next group of Y
        dec     ecx                     ; ecx = groups remaining
        jnz     SseLoop
SseDone:
        ret
euclidean_SSE endp
end |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment