Last active
September 16, 2023 16:56
-
-
Save december1981/227301e2916f46b126e90511e8941eed to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
; Demo: add up an array of u16s, accumulating them into an s32.
; We load 8 bytes at a time (4 words) into a SIMD register and use SIMD instructions to accumulate.
; Assemble with nasm for the elf64 target and link into the executable "accum":
;; nasm -felf64 -o accum.o accum.s
;; ld --dynamic-linker /lib64/ld-linux-x86-64.so.2 accum.o -o accum -lc
;;
; ---------------------------------------------------------------------
; Linkage: export the entry point, import printf from libc.
; ---------------------------------------------------------------------
        global  _start
        extern  printf

section .data

; printf format string: prints the sum in hex, then newline; NUL-terminated.
message:
        db      "Accumulated result: %#x", 10, 0

; Pad to an 8-byte boundary so the qword below (and the word array that
; directly follows it) start on an 8-byte aligned address.
        align   8

; Eight zero bytes.
zero:
        dq      0

; Input data: unsigned 16-bit words, laid out four per row (one row is
; one 8-byte SIMD load). The final row holds the single leftover word.
array:
        dw      0x1111, 0x2222, 0x3333, 0x4444
        dw      0x1111, 0x8888, 0x9999, 0x9999
        dw      0x1111, 0x2222, 0x7777, 0x4444
        dw      0x1111, 0x3214, 0x9999, 0x4444
        dw      0x1111

; Element count: byte length of the array divided by the 2-byte word size.
; Must stay immediately after the array data because it is derived from `$`.
els     equ     ($ - array) / 2

; Number of full 4-word groups handled by the vectorised loop.
vels    equ     els / 4

; Leftover words (0..3) that are handled one at a time afterwards.
rels    equ     els - (vels * 4)
section .text

; libc exit(): flushes the stdio buffers before terminating the process.
        extern  exit

;-----------------------------------------------------------------------
; _start - process entry point; never returns.
;
; Sums every u16 in `array` into a 32-bit total (wrapping modulo 2^32),
; prints it through libc printf using `message`, then exits with status 0.
;
; ABI:   SysV AMD64, Linux. rsp is 16-byte aligned at process entry, so
;        it is correctly aligned at both `call` sites below (nothing is
;        pushed in between).
; ISA:   needs SSE4.1 (pmovzxwd) and SSSE3 (phaddd).
; Uses:  rsi  = read cursor into array
;        rdi  = end address of the vectorised part
;        xmm1 = four dword partial sums
;        ebx  = scalar running total
;        ecx  = loop counter for the leftover words
;-----------------------------------------------------------------------
_start:
        lea     rsi, [rel array]        ; RIP-relative: no absolute relocation needed
        pxor    xmm1, xmm1              ; partial sums = 0

        mov     ecx, vels               ; 32-bit write also clears the top half of rcx
        test    ecx, ecx
        jz      .reduce                 ; fewer than 4 words: skip the SIMD loop

        lea     rdi, [rsi + rcx*8]      ; one past the last full 4-word group

.vadd_loop:
        pmovzxwd xmm0, qword [rsi]      ; load 4 words, zero-extend each to a dword
        paddd   xmm1, xmm0              ; lane-wise accumulate; reduced once after the loop
        add     rsi, 8
        cmp     rsi, rdi
        jb      .vadd_loop              ; addresses are unsigned: jb, not jl

.reduce:
        ; Horizontal sum of the four dword lanes. After two phaddd steps
        ; every lane holds lane0+lane1+lane2+lane3; we take the low one.
        ; Doing this once here gives the same modulo-2^32 result as
        ; reducing inside every iteration.
        phaddd  xmm1, xmm1
        phaddd  xmm1, xmm1
        movd    ebx, xmm1               ; ebx = total of the vectorised part

        ; Add the leftover words (at most 3) one at a time.
        mov     ecx, rels
        test    ecx, ecx
        jz      .show_message

.add_loop:
        movzx   eax, word [rsi]         ; zero-extend u16 -> u32
        add     ebx, eax
        add     rsi, 2
        dec     ecx
        jnz     .add_loop

.show_message:
        lea     rdi, [rel message]      ; arg 1: format string
        mov     esi, ebx                ; arg 2: accumulated total
        xor     eax, eax                ; variadic call: al = 0 vector registers used
        call    printf wrt ..plt

        ; Leave through libc exit(0) rather than the raw exit syscall:
        ; printf output may still be sitting in libc's stdout buffer
        ; (e.g. when stdout is a pipe or a file) and the raw syscall
        ; would discard it.
        xor     edi, edi                ; status = 0
        call    exit wrt ..plt          ; does not return
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
We can process 8 words at a time with the following 256 bit vex instructions.