Skip to content

Instantly share code, notes, and snippets.

@lazear
Created January 19, 2017 02:19
Show Gist options
  • Save lazear/c43b476d641266f4f89328dc2a58c438 to your computer and use it in GitHub Desktop.
Save lazear/c43b476d641266f4f89328dc2a58c438 to your computer and use it in GitHub Desktop.
;;; Learning how to use Streaming SIMD Extensions 4.2
[bits 64]
section .text
;===============================
; IMM8 Control Byte Operation
;
;***** Source Data Format
; Both 128 bit sources treated as...
; [1:0] = 00b, packed unsigned bytes
; [1:0] = 01b, packed unsigned words
; [1:0] = 10b, packed signed bytes
; [1:0] = 11b, packed signed words
;***** Aggregation Operation
; Mode of per-element comparison
; [3:2] = 00b, "equal any"
; [3:2] = 01b, "ranges"
; [3:2] = 10b, "equal each"
; [3:2] = 11b, "equal ordered"
; ** Equal any: find characters from a set
; ** First operand is character set, second is string
; ** Ranges: find characters that fall inside any of a set of ranges
; ** First operand is a list of (low, high) bound pairs, second is string
; IntRes1[i] |= (second[i] >= first[2j]) & (second[i] <= first[2j+1])
; ** Equal each: compare two strings, byte by byte
; ** Equal ordered: substring search
;***** Polarity
; Specifies intermediate processing to be performed
; [5:4] = 00b, Positive polarity, IntRes2 = IntRes1
; [5:4] = 01b, Negative polarity, IntRes2 = -1 XOR IntRes1
; [5:4] = 10b, Mask (+), IntRes2 = IntRes1
; [5:4] = 11b, Mask (-), IntRes2[i] = IntRes1[i] if src[i] invalid,
; else IntRes2[i] = ~IntRes1[i]
;***** Output selection
; Specifies final operation to produce the output
; [6] = 0/1 Least or Most significant index for STRI
; [6] = 0/1 Bit mask or byte/word mask for STRM
;-----------------------------------------------------------------------
; int sse42_enabled(void)
; ABI:   SysV AMD64
; Out:   rax = 1 if CPUID leaf 1 reports SSE4.2 (ECX bit 20), else 0
; Clobb: rcx, rdx, flags   (rbx is saved and restored)
;-----------------------------------------------------------------------
global sse42_enabled
sse42_enabled:
        push    rbx                     ; cpuid overwrites ebx, which is callee-saved
        mov     eax, 1                  ; leaf 1: processor info and feature bits
        cpuid
        xor     eax, eax                ; clear the whole return register before setne
        test    ecx, (1<<20)            ; ECX bit 20 = SSE4.2
        setne   al
        pop     rbx
        ret
;-----------------------------------------------------------------------
; size_t sse42_strlen(const char *s)
; ABI:   SysV AMD64
; In:    rdi = s (NUL-terminated)
; Out:   rax = number of bytes before the terminator
; Clobb: rcx, rdx, xmm0, flags
;
; Bytes are checked one at a time until rdi+rax is 16-byte aligned, so
; the 16-byte block reads below never straddle a page boundary and
; cannot fault on memory that lies past the terminator.
;-----------------------------------------------------------------------
global sse42_strlen
sse42_strlen:
        pxor    xmm0, xmm0              ; all-zero register = empty implicit-length string
        xor     eax, eax                ; rax = offset scanned so far
.head:
        lea     rdx, [rdi + rax]
        test    dl, 15
        jz      .blocks                 ; address is 16-byte aligned
        cmp     byte [rdx], 0
        je      .done
        inc     rax
        jmp     .head
.blocks:
        ; imm[1:0] = 0, source data is unsigned bytes
        ; imm[3:2] = 2, equal each aggregation
        ; imm[5:4] = 0, positive polarity, res2 = res1
        ; imm[6]   = 0, ECX receives the least significant set index
        ; xmm0 is empty, so a position only "matches" once the memory
        ; operand is past its terminator too: ECX = index of the NUL,
        ; or 16 when the block holds none. ZF is set when the memory
        ; operand contains a zero byte.
        pcmpistri xmm0, [rdi + rax], 001000b
        lea     rax, [rax + 16]         ; advance without touching flags
        jnz     .blocks
        lea     rax, [rax + rcx - 16]   ; undo the last advance, add NUL index
.done:
        ret
;-----------------------------------------------------------------------
; long sse42_strcmp(const char *a, const char *b)
; ABI:   SysV AMD64
; In:    rdi = a, rsi = b (both NUL-terminated)
; Out:   rax = 0 if equal, otherwise (unsigned char)a[i] - (unsigned char)b[i]
;        at the first differing index i
; Clobb: rcx, rdx, xmm1, flags
; Note:  works on whole 16-byte blocks, so up to 15 bytes past a
;        terminator may be read.
;-----------------------------------------------------------------------
global sse42_strcmp
sse42_strcmp:
        mov     rax, -16                ; block offset; first add makes it 0
.loop:
        add     rax, 16
        movdqu  xmm1, [rdi + rax]
        ; Packed Compare Implicit Length Strings, return index (in ECX)
        ; imm[3:2] = 2 equal each, imm[5:4] = 1 negative polarity:
        ; differing positions are 1, and ECX is the first of them.
        ; CF = 1 when any position differs (IntRes2 != 0)
        ; ZF = 1 when the memory operand (b) contains a NUL
        pcmpistri xmm1, [rsi + rax], 0011000b
        ja      .loop                   ; CF=0 and ZF=0: block equal, no terminator yet
        jc      .diff                   ; a mismatch takes priority over the terminator
        xor     eax, eax                ; equal up to and including the terminator
        ret
.diff:
        add     rax, rcx                ; absolute index of the first mismatch
        movzx   edx, byte [rsi + rax]
        movzx   eax, byte [rdi + rax]   ; load last: this overwrites the index
        sub     rax, rdx
        ret
;-----------------------------------------------------------------------
; int sse42_strcmp_mask(const char *a, const char *b)
; ABI:   SysV AMD64
; In:    rdi = a, rsi = b
; Out:   eax = 16-bit mask, one bit per byte position that differs in
;        the 16-byte block where b's terminator was found
; Clobb: xmm0, xmm1, flags
; Note:  blocks before the one holding b's terminator are stepped over
;        without their differences being reported.
;-----------------------------------------------------------------------
global sse42_strcmp_mask
sse42_strcmp_mask:
        xor     eax, eax                ; rax = offset of the current block
.next_block:
        movdqu  xmm1, [rdi + rax]
        ; Implicit-length compare, result mask goes to XMM0
        ; imm = equal each, negative polarity, expanded to a byte mask
        pcmpistrm xmm1, [rsi + rax], 1011000b
        lea     rax, [rax + 16]         ; step to the next block, flags untouched
        jnz     .next_block             ; ZF clear: no NUL in b's block yet
        ; ZF set: b ended inside the block just compared
        pmovmskb eax, xmm0              ; collapse byte mask to one bit per byte
        ret
;-----------------------------------------------------------------------
; int sse42_strstr_mask(const char *haystack, const char *needle)
; ABI:   SysV AMD64
; In:    rdi = haystack (NUL-terminated)
;        rsi = needle; a single 16-byte load is taken from it, so only
;              its first 16 bytes (up to a NUL) take part in the search
; Out:   eax = number of match positions flagged in the first 16-byte
;        haystack block that contains a match, or 0 if the haystack
;        ends without one
; Clobb: xmm0, xmm1, flags
; Note:  NOTE(review): "equal ordered" also flags a needle prefix that
;        runs off the end of a 16-byte block - confirm against the SDM
;        before relying on the count as an exact occurrence total.
;-----------------------------------------------------------------------
global sse42_strstr_mask
sse42_strstr_mask:
        movdqu  xmm1, [rsi]             ; needle is loop-invariant: load it once
        mov     rax, -16                ; block offset; first add makes it 0
.loop:
        add     rax, 16
        ; Packed Compare Implicit Length Strings, return Mask (in XMM0)
        ; Positive polarity, byte mask, equal ordered
        ; CF = 1 when the block has at least one match
        ; ZF = 1 when the haystack block contains a NUL
        pcmpistrm xmm1, [rdi + rax], 1001100b
        ja      .loop                   ; CF=0 and ZF=0: no match yet, haystack continues
        ; Either a match was found or the haystack ended (mask is then 0).
        ; Collapse the byte mask to one bit per byte and count the bits.
        pmovmskb eax, xmm0
        popcnt  eax, eax
        ret
;-----------------------------------------------------------------------
; void *sse42_test(uint64_t value, void *out)
; ABI:   SysV AMD64
; In:    rdi = value to shuffle, rsi = 16-byte output buffer (unaligned ok)
; Out:   rax = rsi; [rsi] receives the shuffled 128-bit result
; Clobb: rdx, xmm0, xmm1
;
; PSHUFB experiment. The control vector is 0x77 in byte 0 and zero in
; every other byte, so output byte 0 is value byte 7 and output bytes
; 1..15 are all value byte 0.
; An earlier movlhps/por/punpcklwd interleave experiment lived here as
; commented-out code.
;-----------------------------------------------------------------------
global sse42_test
sse42_test:
        mov     edx, 0x77               ; shuffle control for byte 0
        movd    xmm1, edx               ; remaining control bytes are zero
        movq    xmm0, rdi               ; value in low qword, high qword zeroed
        pshufb  xmm0, xmm1
        movdqu  [rsi], xmm0             ; store all 128 bits, no alignment needed
        mov     rax, rsi                ; hand the buffer back to the caller
        ret
@lazear
Copy link
Author

lazear commented Jan 19, 2017

;-----------------------------------------------------------------------
; void *sse42_memcpy_aligned(void *dst, const void *src, size_t n)
; ABI:   SysV AMD64
; In:    rdi = dst, rsi = src - both must be 16-byte aligned when
;              n >= 16 (movdqa faults otherwise)
;        rdx = n, byte count
; Out:   rax = dst
; Clobb: rcx, rdx, rsi, rdi, xmm0, flags
;
; Copies n / 16 whole blocks with aligned 128-bit moves, then the
; remaining n % 16 bytes with rep movsb. Both pointers are advanced
; past each block so the tail copy starts where the blocks ended.
;-----------------------------------------------------------------------
global sse42_memcpy_aligned
sse42_memcpy_aligned:
        mov     rax, rdi                ; return value
        mov     rcx, rdx
        and     ecx, 15                 ; rcx = tail bytes (n % 16)
        shr     rdx, 4                  ; rdx = whole blocks (n / 16)
        jz      .tail
.block:
        movdqa  xmm0, [rsi]
        movdqa  [rdi], xmm0
        add     rsi, 16
        add     rdi, 16
        dec     rdx
        jnz     .block
.tail:
        rep movsb                       ; copies rcx bytes; no-op when rcx = 0
        ret

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment