Last active
September 16, 2023 16:56
-
-
Save december1981/227301e2916f46b126e90511e8941eed to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
; demo adding an array of u16s and accumulating them into a s32
; we load 8 bytes at a time (4 words) into a simd register and use simd
; instructions (SSE4.1 pmovzxwd + phaddd) to accumulate.
; compile with nasm for elf64 target to executable "accum"
;; nasm -felf64 -o accum.o accum.s
;; ld --dynamic-linker /lib64/ld-linux-x86-64.so.2 accum.o -o accum -lc
;;
global _start
; standard libc printf
extern printf
section .data
message: db "Accumulated result: %#x", 10, 0
align 8
zero: dq 0x0
array: dw 0x1111, 0x2222, 0x3333, 0x4444, 0x1111, 0x8888, 0x9999, 0x9999, 0x1111, 0x2222, 0x7777, 0x4444, 0x1111, 0x3214, 0x9999, 0x4444, 0x1111
; total number of u16 elements in the array
els: equ ($ - array)/2
; number of loops for the vectored calculation (process 4 words at a time)
vels: equ els / 4
; remainder items where we just process 1 word at a time (max 3 iterations)
rels: equ els - vels * 4
section .text
; Entry point. Register roles:
;   rsi = read cursor into array
;   rdi = end address of the vectorized region (then printf arg1)
;   ebx = scalar running total (survives nothing - no calls before printf)
;   xmm1 = vector accumulator; lane 0 holds the running total
; NOTE: at ELF entry rsp is 16-byte aligned, so the single `call printf`
; below is made with a correctly aligned stack.
_start:
        mov     rsi, array
        movq    xmm1, [zero]            ; zero the vector accumulator
        ; writing ecx clears the top 32 bits of rcx in the process
        mov     ecx, vels
        test    ecx, ecx                ; idiomatic zero test (was: cmp ecx, 0)
        jz      _add_remainder
        ; set our end offset to compare for loop (4 words = 8 bytes per iter)
        lea     rdi, [rcx * 8 + array]
_vadd_loop:
        movq    xmm0, [rsi]             ; load 4 words (8 bytes)
        ; this will do the zero extension from words to doubles
        pmovzxwd xmm0, xmm0
        ; sum our words together in two of these horizontal add instructions
        phaddd  xmm0, xmm0
        phaddd  xmm0, xmm0
        ; loop accumulate lower s32 in xmm1
        paddd   xmm1, xmm0
        add     rsi, 8
        cmp     rsi, rdi
        jb      _vadd_loop              ; addresses are unsigned: jb, not jl
_add_remainder:
        ; extract our accumulated s32 (will be ebx)
        movd    ebx, xmm1
        ; add remainder elements using standard loop every 2 bytes
        ; this loop iteration will be at most 3 (from the specific data above, 1 iteration)
        mov     ecx, rels
        test    ecx, ecx
        jz      _show_message
_add_loop:
        movzx   eax, word [rsi]         ; zero-extend one u16 into eax
        add     ebx, eax
        add     rsi, 2
        dec     ecx                     ; avoid legacy `loop` (microcoded/slow)
        jnz     _add_loop
_show_message:
        lea     rdi, [rel message]      ; arg1 = format string (RIP-relative)
        xor     eax, eax                ; variadic call: al = 0 vector regs used
        mov     esi, ebx                ; arg2 = accumulated result
        call    printf wrt ..plt
        ;; exit (sys call 60) with 0 return code.
        mov     eax, 60                 ; 32-bit write zero-extends into rax
        xor     edi, edi                ; exit code 0
        syscall
Fixed the original to use pmovzxwd - however, this requires grabbing only 4 words at a time from memory instead of 8.
We can process 8 words at a time with the following 256 bit vex instructions.
vels: equ els / 8
rels: equ els - vels * 8
...
; lea rdi, [rcx * 16 + array]
; but * 16 is not allowed with lea, so double ecx and do it * 8...
add ecx, ecx
lea rdi, [rcx * 8 + array]
_vadd_loop:
vmovdqu xmm0, [rsi]
; this will do the zero extension from 8 words to doubles, extending the full 256 bit destination
vpmovzxwd ymm0, xmm0
; sum our words together in two of these horizontal add instructions
vphaddd ymm0, ymm0
vphaddd ymm0, ymm0
; this will put the top 128 bits of ymm0 into lower 128 bits of ymm2. Is there a faster way?
vperm2i128 ymm2, ymm0, ymm0, 0b1
; loop accumulate ymm0
vpaddd ymm1, ymm0
; loop accumulate ymm2
vpaddd ymm1, ymm2
add rsi, 16
cmp rsi, rdi
jl _vadd_loop
; Good practice (I believe?) after vex instruction mode (vpaddd ymm/zmm, etc) to execute this instruction
; otherwise there can be unpleasant penalties for sse instructions (ie paddd xmm... etc) going forward
vzeroupper
...
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
This gist's original revision had a flaw: the unsigned mask (no longer used in the latest gist) would clobber valid accumulated bits in the top 16 bits of the sign-extended results. Ideally there would have been a zero-extended version of the multiply-add instruction.