Created
January 19, 2017 02:19
-
-
Save lazear/c43b476d641266f4f89328dc2a58c438 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
;;; Learning how to use Streaming SIMD Extensions 4.2 (SSE4.2)
;;; Syntax: NASM (Intel operand order); all code below is 64-bit.
bits 64
section .text
;=============================== | |
; IMM8 Control Byte Operation | |
; | |
;***** Source Data Format | |
; Both 128 bit sources treated as... | |
; [1:0] = 00b, packed unsigned bytes | |
; [1:0] = 01b, packed unsigned words | |
; [1:0] = 10b, packed signed bytes | |
; [1:0] = 11b, packed signed words | |
;***** Aggregation Operation | |
; Mode of per-element comparison | |
; [3:2] = 00b, "equal any" | |
; [3:2] = 01b, "ranges" | |
; [3:2] = 10b, "equal each" | |
; [3:2] = 11b, "equal ordered" | |
; ** Equal any: find characters from a set | |
; ** First operand is character set, second is string | |
; ** Ranges: find characters that fall inside any of a set of ranges
; ** First operand holds (lower, upper) bound pairs, second is string
; IntRes1[i] |= (second[i] >= first[j]) & (second[i] <= first[j+1]), j = 0, 2, 4, ...
; ** Equal each: compare two strings, byte by byte | |
; ** Equal ordered: substring search | |
;***** Polarity | |
; Specifies intermediate processing to be performed | |
; [5:4] = 00b, Positive polarity, IntRes2 = IntRes1 | |
; [5:4] = 01b, Negative polarity, IntRes2 = -1 XOR IntRes1 | |
; [5:4] = 10b, Mask (+), IntRes2 = IntRes1 | |
; [5:4] = 11b, Mask (-), IntRes2[i] = IntRes1[i] if src[i] invalid, | |
; else IntRes2[i] = ~IntRes1[i] | |
;***** Output selection | |
; Specifies final operation to produce the output | |
; [6] = 0/1 Least or Most significant index for STRI | |
; [6] = 0/1 Bit mask or byte/word mask for STRM | |
;-----------------------------------------------------------------------
; int sse42_enabled(void)
; Reports whether the CPU advertises SSE4.2 (CPUID leaf 1, ECX bit 20).
; Out:   rax = 1 if the bit is set, 0 otherwise
; Clobb: rcx, rdx, flags
; Note:  CPUID overwrites eax, ebx, ecx and edx. rbx is callee-saved,
;        so it is preserved on the stack around the instruction.
;-----------------------------------------------------------------------
global sse42_enabled
sse42_enabled:
        push    rbx                     ; CPUID destroys ebx; caller expects rbx intact
        mov     eax, 1                  ; leaf 1 = processor feature information
        cpuid
        xor     eax, eax                ; clear the whole return register before setne
        test    ecx, (1 << 20)          ; ECX[20] = SSE4.2
        setne   al                      ; al = 1 when the feature bit is set
        pop     rbx                     ; pop does not touch flags or rax
        ret
;-----------------------------------------------------------------------
; size_t sse42_strlen(const char *s)
; In:    rdi = s, NUL-terminated
; Out:   rax = number of bytes before the terminator
; Clobb: rcx, xmm0, flags
;
; The string is scanned byte-by-byte until the cursor is 16-byte aligned,
; then in aligned 16-byte blocks with PCMPISTRI. An aligned 16-byte load
; never straddles a page boundary, so the block containing the terminator
; is the last memory touched; an unaligned block loop could read into the
; following (possibly unmapped) page.
;-----------------------------------------------------------------------
global sse42_strlen
sse42_strlen:
        pxor    xmm0, xmm0              ; first operand: empty string (all bytes invalid)
        mov     rax, rdi                ; rax = cursor into the string
.head:
        test    al, 15                  ; cursor on a 16-byte boundary yet?
        jz      .aligned
        cmp     byte [rax], 0
        je      .done                   ; terminator found in the unaligned head
        inc     rax
        jmp     .head
.aligned:
        sub     rax, 16                 ; pre-bias so the loop can add first
.loop:
        add     rax, 16
        ; imm8[1:0] = 00  unsigned bytes
        ; imm8[3:2] = 10  "equal each"
        ; imm8[5:4] = 00  positive polarity
        ; imm8[6]   = 0   ECX = least significant set index
        ; With an empty first operand, "equal each" sets exactly the
        ; positions where the memory operand is past its terminator, so
        ; ECX = index of the first NUL (16 if the block has none).
        ; ZF is set when the memory operand contains a NUL byte.
        pcmpistri xmm0, [rax], 001000b
        jnz     .loop                   ; no terminator in this block
        add     rax, rcx                ; rax -> terminator
.done:
        sub     rax, rdi                ; length = terminator address - start
        ret
;-----------------------------------------------------------------------
; long sse42_strcmp(const char *a, const char *b)
; In:    rdi = a, rsi = b, both NUL-terminated
; Out:   rax = 0 if the strings are equal, otherwise
;              (unsigned char)a[i] - (unsigned char)b[i] at the first
;              differing index i
; Clobb: rcx, rdx, xmm1, flags
; Note:  both strings are read in unaligned 16-byte blocks, so up to 15
;        bytes past either terminator may be loaded; those bytes must be
;        readable.
;-----------------------------------------------------------------------
global sse42_strcmp
sse42_strcmp:
        mov     rax, -16                ; block offset, pre-biased for the loop
.loop:
        add     rax, 16
        movdqu  xmm1, [rdi + rax]       ; current block of a
        ; imm8[1:0] = 00  unsigned bytes
        ; imm8[3:2] = 10  "equal each"
        ; imm8[5:4] = 01  negative polarity: differing positions become 1
        ; imm8[6]   = 0   ECX = least significant set index
        ; CF is set when IntRes2 is non-zero (a difference exists);
        ; ZF is set when the memory operand (b) contains a NUL byte.
        pcmpistri xmm1, [rsi + rax], 0011000b
        ja      .loop                   ; CF=0 and ZF=0: block equal, no terminator yet
        jc      .diff                   ; a difference was found in this block
        xor     eax, eax                ; terminator reached with no difference: equal
        ret
.diff:
        add     rcx, rax                ; rcx = block offset + index within the block
        movzx   eax, byte [rdi + rcx]
        movzx   edx, byte [rsi + rcx]
        sub     rax, rdx                ; signed byte difference, 64-bit
        ret
;-----------------------------------------------------------------------
; int sse42_strcmp_mask(const char *a, const char *b)
; In:    rdi = a, rsi = b, both NUL-terminated
; Out:   eax = 16-bit mask for the first 16-byte block in which either
;              string ends; bit i is set when byte i of that block
;              differs between a and b (or only one string is still
;              valid at i)
; Clobb: xmm0, xmm1, flags
; Note:  earlier blocks are not reported. Strings are read in unaligned
;        16-byte blocks, so up to 15 bytes past a terminator may be
;        loaded and must be readable.
;-----------------------------------------------------------------------
global sse42_strcmp_mask
sse42_strcmp_mask:
        mov     rax, -16                ; block offset, pre-biased for the loop
.loop:
        add     rax, 16
        movdqu  xmm1, [rdi + rax]       ; current block of a
        ; imm8[1:0] = 00  unsigned bytes
        ; imm8[3:2] = 10  "equal each"
        ; imm8[5:4] = 01  negative polarity: differing positions become 1
        ; imm8[6]   = 1   XMM0 receives a byte mask (0xFF per set position)
        ; ZF is set when the memory operand (b) contains a NUL byte;
        ; SF is set when xmm1 (a) contains a NUL byte.
        pcmpistrm xmm1, [rsi + rax], 1011000b
        jz      .done                   ; b ends in this block
        jns     .loop                   ; a has not ended either: keep going
.done:
        pmovmskb eax, xmm0              ; collapse the byte mask to one bit per byte
        ret
;-----------------------------------------------------------------------
; int sse42_strstr_mask(const char *haystack, const char *needle)
; In:    rdi = haystack, NUL-terminated
;        rsi = needle; 16 bytes are loaded from it, so it is treated as
;              an implicit-length string of at most 16 bytes
; Out:   eax = number of positions flagged by "equal ordered" in the
;              first 16-byte haystack block that contains a match, or 0
;              if the haystack ends without any match
; Clobb: xmm0, xmm1, flags
; Note:  "equal ordered" also flags a needle prefix that matches up to
;        the end of the 16-byte block, so the count can include a
;        partial match at the block tail. Blocks are read unaligned; up
;        to 15 bytes past the haystack terminator may be loaded.
;-----------------------------------------------------------------------
global sse42_strstr_mask
sse42_strstr_mask:
        movdqu  xmm1, [rsi]             ; needle is loop-invariant: load it once
        mov     rax, -16                ; block offset, pre-biased for the loop
.loop:
        add     rax, 16
        ; imm8[1:0] = 00  unsigned bytes
        ; imm8[3:2] = 11  "equal ordered" (substring search)
        ; imm8[5:4] = 00  positive polarity
        ; imm8[6]   = 1   XMM0 receives a byte mask (0xFF per set position)
        ; CF is set when IntRes2 is non-zero (at least one match);
        ; ZF is set when the memory operand (haystack) contains a NUL byte.
        pcmpistrm xmm1, [rdi + rax], 1001100b
        ja      .loop                   ; CF=0 and ZF=0: no match, haystack continues
        ; Either a match was found or the haystack ended; in the latter
        ; case the mask is all zero and the count below is 0.
        pmovmskb eax, xmm0              ; one bit per flagged byte position
        popcnt  eax, eax                ; number of flagged positions
        ret
;-----------------------------------------------------------------------
; void *sse42_test(uint64_t value, void *out)
; PSHUFB experiment: shuffles the bytes of `value` and stores the 16-byte
; result at `out` (unaligned store).
; In:    rdi = value, rsi = out
; Out:   rax = out
;        out[0]     = byte 7 of value (control byte 0x77 selects index 7)
;        out[1..15] = byte 0 of value (control bytes are 0)
; Clobb: rdx, xmm0, xmm1
;-----------------------------------------------------------------------
global sse42_test
sse42_test:
        ; Shuffle control: only the low byte is non-zero. MOVQ from a
        ; general register zero-extends into the full 128-bit register,
        ; so no separate clearing of xmm0/xmm1 is required.
        mov     edx, 0x77               ; 01110111b; 32-bit write clears the upper half of rdx
        movq    xmm1, rdx               ; xmm1 = shuffle control
        ;mov rdi, 0xcbaed
        movq    xmm0, rdi               ; xmm0 = value in the low qword, high qword zero
        pshufb  xmm0, xmm1              ; per byte: dst[i] = src[ctrl[i] & 15] (ctrl bit 7 clear)
        movdqu  [rsi], xmm0             ; store all 128 bits to the caller's buffer
        ; Earlier experiment, kept disabled:
        ; movlhps xmm1, xmm0            ; low qword of xmm0 -> high qword of xmm1
        ; por xmm1, xmm0                ; xmm1 |= xmm0
        ; movdqa xmm2, xmm0             ; xmm2 = xmm0
        ; punpcklwd xmm2, xmm1          ; interleave low words of xmm2 and xmm1 into xmm2
        ; movdqu [rsi], xmm1            ; store 128 bits from xmm1 to unaligned [rsi]
        mov     rax, rsi                ; return the output pointer
        ret
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
global sse42_memcpy_aligned:
sse42_memcpy_aligned:
pxor xmm0, xmm0
mov r8, rdx
xor rdx, rdx
mov rcx, 16
mov rax, r8
div rcx
; result in rdx:rax
mov rcx, rdx