Skip to content

Instantly share code, notes, and snippets.

@marcoheisig
Last active December 16, 2019 12:05
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save marcoheisig/1334e260195e42704824c69419a545e3 to your computer and use it in GitHub Desktop.
Save marcoheisig/1334e260195e42704824c69419a545e3 to your computer and use it in GitHub Desktop.
A demo of my WIP SIMD library: https://github.com/marcoheisig/sb-simd
(defun simd-sum (array &aux (n (array-total-size array)))
  "Return the sum of all elements of ARRAY, a simple one-dimensional
array of double-floats.

The front of the array is consumed 16 elements per iteration through
four independent 4-wide accumulators.  The accumulators are then
combined into a single scalar, and the remaining elements are added to
it one at a time."
  (declare (type (simple-array double-float 1) array)
           (optimize speed (safety 0)))
  (let ((sum-a (make-f64.4 0 0 0 0))
        (sum-b (make-f64.4 0 0 0 0))
        (sum-c (make-f64.4 0 0 0 0))
        (sum-d (make-f64.4 0 0 0 0))
        ;; The vector loop runs only while BASE is strictly below this
        ;; bound, so at least 16 elements are always left for the tail.
        (vector-end (- n 16))
        (base 0))
    ;; Vectorized part: four 4-wide loads per iteration, each feeding its
    ;; own accumulator.
    (loop while (< base vector-end)
          do (setf sum-a (f64.4+ sum-a (f64.4-row-major-aref array base))
                   sum-b (f64.4+ sum-b (f64.4-row-major-aref array (+ base 4)))
                   sum-c (f64.4+ sum-c (f64.4-row-major-aref array (+ base 8)))
                   sum-d (f64.4+ sum-d (f64.4-row-major-aref array (+ base 12)))
                   ;; BASE is advanced last, so all four loads above use
                   ;; the offset of the current iteration.
                   base (the (integer 0 #.(- array-total-size-limit 16))
                             (+ base 16))))
    ;; Scalar part: add the four lanes of the combined accumulator, then
    ;; pick up every element from BASE to the end of the array.
    (let ((total (multiple-value-call #'+
                   (f64.4-values (f64.4+ sum-a sum-b sum-c sum-d))))
          (position base))
      (loop while (< position n)
            do (setf total (+ total (row-major-aref array position))
                     position (1+ position)))
      total)))
; disassembly for simd-sum
; Size: 272 bytes. Origin: #x52B83286 ; simd-sum
; 286: 488B4AF9 mov RCX, [RDX-7]
; 28A: 31C0 xor EAX, EAX
; 28C: C5D457ED vxorps YMM5, YMM5, YMM5
; 290: C5F457C9 vxorps YMM1, YMM1, YMM1
; 294: C5EC57D2 vxorps YMM2, YMM2, YMM2
; 298: C5E457DB vxorps YMM3, YMM3, YMM3
; 29C: EB3D jmp L1
; 29E: 6690 nop
; 2A0: L0: 488D7020 lea RSI, [RAX+32]
; 2A4: C5FD10648201 vmovupd YMM4, [RDX+RAX*4+1]
; 2AA: C5D558EC vaddpd YMM5, YMM5, YMM4
; 2AE: 488D5808 lea RBX, [RAX+8]
; 2B2: C5FD10449A01 vmovupd YMM0, [RDX+RBX*4+1]
; 2B8: C5F558C8 vaddpd YMM1, YMM1, YMM0
; 2BC: 488D5810 lea RBX, [RAX+16]
; 2C0: C5FD10449A01 vmovupd YMM0, [RDX+RBX*4+1]
; 2C6: C5ED58D0 vaddpd YMM2, YMM2, YMM0
; 2CA: 4883C018 add RAX, 24
; 2CE: C5FD10448201 vmovupd YMM0, [RDX+RAX*4+1]
; 2D4: C5E558D8 vaddpd YMM3, YMM3, YMM0
; 2D8: 488BC6 mov RAX, RSI
; 2DB: L1: 488D59E0 lea RBX, [RCX-32]
; 2DF: 4839D8 cmp RAX, RBX
; 2E2: 7CBC jl L0
; 2E4: C5D558E9 vaddpd YMM5, YMM5, YMM1
; 2E8: C5D558EA vaddpd YMM5, YMM5, YMM2
; 2EC: C5D558EB vaddpd YMM5, YMM5, YMM3
; 2F0: C5FC28C5 vmovaps YMM0, YMM5
; 2F4: C5F157C9 vxorpd XMM1, XMM1, XMM1
; 2F8: F20F10C8 movsd XMM1, XMM0
; 2FC: C5FC28C5 vmovaps YMM0, YMM5
; 300: C5FD73D808 vpsrldq YMM0, YMM0, 8
; 305: C5E157DB vxorpd XMM3, XMM3, XMM3
; 309: F20F10D8 movsd XMM3, XMM0
; 30D: C4E37D19E801 vextractf128 YMM0, YMM5, 1
; 313: C5D957E4 vxorpd XMM4, XMM4, XMM4
; 317: F20F10E0 movsd XMM4, XMM0
; 31B: C4E37D19E801 vextractf128 YMM0, YMM5, 1
; 321: C5FD73D808 vpsrldq YMM0, YMM0, 8
; 326: C5D157ED vxorpd XMM5, XMM5, XMM5
; 32A: F20F10E8 movsd XMM5, XMM0
; 32E: F20F58D9 addsd XMM3, XMM1
; 332: F20F58E3 addsd XMM4, XMM3
; 336: F20F58EC addsd XMM5, XMM4
; 33A: EB1A jmp L3
; 33C: 0F1F4000 nop
; 340: L2: F20F10548201 movsd XMM2, [RDX+RAX*4+1]
; 346: 660F28CD movapd XMM1, XMM5
; 34A: F20F58CA addsd XMM1, XMM2
; 34E: 4883C002 add RAX, 2
; 352: 660F28E9 movapd XMM5, XMM1
; 356: L3: 4839C8 cmp RAX, RCX
; 359: 7CE5 jl L2
; 35B: 49896D28 mov [R13+40], RBP ; thread.pseudo-atomic-bits
; 35F: 498B5568 mov RDX, [R13+104] ; thread.alloc-region
; 363: 4C8D5A10 lea R11, [RDX+16]
; 367: 4D3B5D70 cmp R11, [R13+112]
; 36B: 771F jnbe L6
; 36D: 4D895D68 mov [R13+104], R11 ; thread.alloc-region
; 371: L4: 66C7021D01 mov word ptr [RDX], 285
; 376: 80CA0F or DL, 15
; 379: 49316D28 xor [R13+40], RBP ; thread.pseudo-atomic-bits
; 37D: 7402 jeq L5
; 37F: CC09 int3 9 ; pending interrupt trap
; 381: L5: F20F116AF9 movsd [RDX-7], XMM5
; 386: 488BE5 mov RSP, RBP
; 389: F8 clc
; 38A: 5D pop RBP
; 38B: C3 ret
; 38C: L6: 6A10 push 16
; 38E: E86DCE57FF call #x52100200 ; ALLOC-TRAMP
; 393: 5A pop RDX
; 394: EBDB jmp L4
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment