Last active
September 16, 2023 16:56
-
-
Save december1981/227301e2916f46b126e90511e8941eed to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
; demo adding an array of u16s and accumulating them into a s32
; we load 8 bytes at a time (4 words) into a simd register and use simd
; instructions (SSE4.1 pmovzxwd + phaddd) to accumulate.
; compile with nasm for elf64 target to executable "accum"
;; nasm -felf64 -o accum.o accum.s
;; ld --dynamic-linker /lib64/ld-linux-x86-64.so.2 accum.o -o accum -lc
;;
global _start
; standard libc printf
extern printf
section .data
message: db "Accumulated result: %#x", 10, 0
align 8
zero: dq 0x0
array: dw 0x1111, 0x2222, 0x3333, 0x4444, 0x1111, 0x8888, 0x9999, 0x9999, 0x1111, 0x2222, 0x7777, 0x4444, 0x1111, 0x3214, 0x9999, 0x4444, 0x1111
; total number of u16 elements in the array
els: equ ($ - array)/2
; number of loops for the vectored calculation (process 4 words at a time)
vels: equ els / 4
; remainder items where we just process 1 word at a time (max 3 iterations)
rels: equ els - vels * 4
section .text
; Entry point. Register roles:
;   rsi = read cursor into array
;   rdi = end address of the vectorized region (then printf arg1)
;   ebx = scalar running total (survives nothing - no calls before printf)
;   xmm1 = vector accumulator; lane 0 holds the running total
; NOTE: at ELF entry rsp is 16-byte aligned, so the single `call printf`
; below is made with a correctly aligned stack.
_start:
        mov     rsi, array
        movq    xmm1, [zero]            ; zero the vector accumulator
        ; writing ecx clears the top 32 bits of rcx in the process
        mov     ecx, vels
        test    ecx, ecx                ; idiomatic zero test (was: cmp ecx, 0)
        jz      _add_remainder
        ; set our end offset to compare for loop (4 words = 8 bytes per iter)
        lea     rdi, [rcx * 8 + array]
_vadd_loop:
        movq    xmm0, [rsi]             ; load 4 words (8 bytes)
        ; this will do the zero extension from words to doubles
        pmovzxwd xmm0, xmm0
        ; sum our words together in two of these horizontal add instructions
        phaddd  xmm0, xmm0
        phaddd  xmm0, xmm0
        ; loop accumulate lower s32 in xmm1
        paddd   xmm1, xmm0
        add     rsi, 8
        cmp     rsi, rdi
        jb      _vadd_loop              ; addresses are unsigned: jb, not jl
_add_remainder:
        ; extract our accumulated s32 (will be ebx)
        movd    ebx, xmm1
        ; add remainder elements using standard loop every 2 bytes
        ; this loop iteration will be at most 3 (from the specific data above, 1 iteration)
        mov     ecx, rels
        test    ecx, ecx
        jz      _show_message
_add_loop:
        movzx   eax, word [rsi]         ; zero-extend one u16 into eax
        add     ebx, eax
        add     rsi, 2
        dec     ecx                     ; avoid legacy `loop` (microcoded/slow)
        jnz     _add_loop
_show_message:
        lea     rdi, [rel message]      ; arg1 = format string (RIP-relative)
        xor     eax, eax                ; variadic call: al = 0 vector regs used
        mov     esi, ebx                ; arg2 = accumulated result
        call    printf wrt ..plt
        ;; exit (sys call 60) with 0 return code.
        mov     eax, 60                 ; 32-bit write zero-extends into rax
        xor     edi, edi                ; exit code 0
        syscall
Fixed the original to use pmovzxwd - however, this requires grabbing only 4 words at a time from memory instead of 8.
We can process 8 words at a time with the following 256 bit vex instructions.
vels: equ els / 8
rels: equ els - vels * 8
...
; lea rdi, [rcx * 16 + array]
; but * 16 is not allowed with lea, so double ecx and do it * 8...
add ecx, ecx
lea rdi, [rcx * 8 + array]
_vadd_loop:
vmovdqu xmm0, [rsi]
; this will do the zero extension from 8 words to doubles, extending the full 256 bit destination
vpmovzxwd ymm0, xmm0
; sum our words together in two of these horizontal add instructions
vphaddd ymm0, ymm0
vphaddd ymm0, ymm0
; this will put the top 128 bits of ymm0 into lower 128 bits of ymm2. Is there a faster way?
vperm2i128 ymm2, ymm0, ymm0, 0b1
; loop accumulate ymm0
vpaddd ymm1, ymm0
; loop accumulate ymm2
vpaddd ymm1, ymm2
add rsi, 16
cmp rsi, rdi
jl _vadd_loop
; Good practice (I believe?) after vex instruction mode (vpaddd ymm/zmm, etc) to execute this instruction
; otherwise there can be unpleasant penalties for sse instructions (ie paddd xmm... etc) going forward
vzeroupper
...
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
This gist's original revision had a flaw: the unsigned mask (no longer used in the latest gist) would clobber valid accumulated bits in the top 16 bits of the sign-extended results. Ideally there would have been a zero-extended version of the multiply-add instruction.