Skip to content

Instantly share code, notes, and snippets.

@december1981
Last active September 16, 2023 16:56
Show Gist options
  • Save december1981/227301e2916f46b126e90511e8941eed to your computer and use it in GitHub Desktop.
Save december1981/227301e2916f46b126e90511e8941eed to your computer and use it in GitHub Desktop.
; demo adding an array of u16s and accumulating them into a s32
; we load 8 bytes at a time (4 words) into a simd register and use simd instructions to accumulate.
; compile with nasm for elf64 target to executable "accum"
;; nasm -felf64 -o accum.o accum.s
;; ld --dynamic-linker /lib64/ld-linux-x86-64.so.2 accum.o -o accum -lc
;;
global  _start
extern  printf                          ; libc printf, resolved through -lc at link time

section .data

; printf format string: value in hex with a 0x prefix, then newline and NUL
message:
        db      "Accumulated result: %#x", 10, 0

        align   8
; 64-bit zero constant
zero:
        dq      0

; input data: unsigned 16-bit words to be summed
array:
        dw      0x1111, 0x2222, 0x3333, 0x4444
        dw      0x1111, 0x8888, 0x9999, 0x9999
        dw      0x1111, 0x2222, 0x7777, 0x4444
        dw      0x1111, 0x3214, 0x9999, 0x4444
        dw      0x1111

; element count: byte length of array divided by the 2-byte word size
els:    equ     ($ - array) / 2
; iterations of the vector loop (4 words consumed per iteration)
vels:   equ     els / 4
; words left over for the scalar loop (0..3)
rels:   equ     els % 4
section .text
; libc exit: terminates the process after flushing stdio buffers
extern exit

;-----------------------------------------------------------------------
; _start - process entry point (Linux x86-64, SysV ABI, NASM syntax)
;
; Sums the `els` unsigned 16-bit words stored at `array` into a 32-bit
; total, prints it with printf using the `message` format string, then
; terminates through libc exit with status 0.
;
; Register roles:
;   rsi  = read cursor into array
;   rdi  = end address of the vectorised range (first remainder word)
;   xmm1 = four 32-bit partial sums, reduced to one after the vector loop
;   ebx  = running 32-bit total
;   ecx  = loop trip counts (vels, then rels)
;
; Requires SSE4.1 (pmovzxwd) and SSSE3 (phaddd).
;-----------------------------------------------------------------------
_start:
        lea     rsi, [rel array]        ; rsi = &array[0] (RIP-relative, PIE-safe)
        pxor    xmm1, xmm1              ; partial sums = 0
        mov     ecx, vels               ; 32-bit write zero-extends into rcx
        test    ecx, ecx
        jz      _add_remainder          ; fewer than 4 words: no vector work
        ; end of the vectorised range: 8 bytes (4 words) per iteration
        lea     rdi, [rsi + rcx * 8]
_vadd_loop:
        ; load 4 words and zero-extend each to a dword in one instruction
        pmovzxwd xmm0, [rsi]
        ; lane-wise accumulate; the horizontal reduction is done once, after the loop
        paddd   xmm1, xmm0
        add     rsi, 8
        cmp     rsi, rdi
        jb      _vadd_loop              ; unsigned compare: these are addresses
_add_remainder:
        ; fold the four partial sums together; after two horizontal adds
        ; every lane of xmm1 holds the total (still 0 if the loop was skipped)
        phaddd  xmm1, xmm1
        phaddd  xmm1, xmm1
        movd    ebx, xmm1               ; ebx = sum of the vectorised words
        ; remaining words (at most 3) are added one at a time
        mov     ecx, rels
        test    ecx, ecx
        jz      _show_message
_add_loop:
        movzx   eax, word [rsi]         ; zero-extend u16 -> u32
        add     ebx, eax
        add     rsi, 2
        dec     ecx
        jnz     _add_loop
_show_message:
        ; printf(message, total)
        lea     rdi, [rel message]
        mov     esi, ebx
        xor     eax, eax                ; al = 0: no vector registers passed to the variadic call
        call    printf wrt ..plt
        ; Leave through libc exit rather than the raw exit syscall: printf
        ; output is buffered when stdout is a pipe or file, and the raw
        ; syscall would terminate the process without flushing it.
        xor     edi, edi                ; exit status 0
        call    exit wrt ..plt
@december1981
Copy link
Author

december1981 commented Sep 6, 2023

This gist's original revision had a flaw: the unsigned mask (no longer used in the latest revision) would clobber valid accumulated bits in the top 16 bits of the sign-extended results. Ideally there would have been a zero-extending version of the multiply-add instruction.

@december1981
Copy link
Author

december1981 commented Sep 7, 2023

Fixed the original to use pmovzxwd; however, this requires loading 4 words at a time from memory instead of 8.

@december1981
Copy link
Author

december1981 commented Sep 7, 2023

We can process 8 words at a time with the following 256-bit VEX-encoded (AVX2) instructions.

; the vector loop now consumes 8 words (16 bytes) per iteration
vels: equ els / 8
rels: equ els - vels * 8
...
    ; rdi = array + vels * 16, the end of the vectorised range.
    ; lea cannot scale an index by 16 (the maximum scale is 8), so double ecx and scale by 8.
    add ecx, ecx
    lea rdi, [rcx * 8 + array]
    
    _vadd_loop:
    ; unaligned 16-byte load of 8 words
    vmovdqu xmm0, [rsi]
    ; zero-extend the 8 words to 8 dwords, filling the full 256-bit destination
    vpmovzxwd ymm0, xmm0
    ; vphaddd works within each 128-bit lane: after two passes every dword of the
    ; low lane holds the sum of words 0-3 and every dword of the high lane the sum of words 4-7
    vphaddd ymm0, ymm0, ymm0
    vphaddd ymm0, ymm0, ymm0
    ; bring the high 128 bits of ymm0 down into xmm2 (the upper half of ymm2 is zeroed);
    ; a plain lane extract is enough here, no two-source permute is needed
    vextracti128 xmm2, ymm0, 1
    ; accumulate the low-lane sum ...
    vpaddd ymm1, ymm1, ymm0
    ; ... and the high-lane sum; the running total is the low dword of ymm1
    vpaddd ymm1, ymm1, ymm2
    add rsi, 16
    cmp rsi, rdi
    ; unsigned compare: rsi and rdi are addresses
    jb _vadd_loop

    ; Clear the upper halves of the ymm registers before any legacy SSE instruction
    ; (paddd xmm, movd, ...) runs, to avoid AVX/SSE transition penalties.
    vzeroupper
...

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment