Last active
December 16, 2019 12:05
-
-
Save marcoheisig/1334e260195e42704824c69419a545e3 to your computer and use it in GitHub Desktop.
A demo of my WIP SIMD library: https://github.com/marcoheisig/sb-simd
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(defun simd-sum (array &aux (n (array-total-size array)))
  "Compute the sum of the elements of the supplied simple double-float ARRAY.

The bulk of ARRAY is consumed sixteen elements per iteration, using four
independent four-lane accumulators.  Once fewer than sixteen elements
remain, the accumulators are combined into a single scalar and the
remaining elements are added one at a time.  An array with fewer than
sixteen elements is therefore summed entirely by the scalar loop.

The function is compiled with (SAFETY 0): the type declaration on ARRAY is
trusted rather than checked, so callers must pass an object of the
declared type."
  (declare (type (simple-array double-float 1) array)
           (optimize speed (safety 0)))
  ;; Vector loop.  INDEX advances by 16 per iteration.  The THE form is an
  ;; unchecked promise about the range of the stepped index.
  ;; NOTE(review): the promise presumably relies on N not exceeding
  ;; (- ARRAY-TOTAL-SIZE-LIMIT 16) -- confirm for very large arrays.
  (do ((index 0 (the (integer 0 #.(- array-total-size-limit 16)) (+ index 16)))
       ;; Four separate accumulators, each fed from its own group of four
       ;; consecutive elements within the current block of sixteen.
       (acc1 (make-f64.4 0 0 0 0) (f64.4+ acc1 (f64.4-row-major-aref array (+ index 0))))
       (acc2 (make-f64.4 0 0 0 0) (f64.4+ acc2 (f64.4-row-major-aref array (+ index 4))))
       (acc3 (make-f64.4 0 0 0 0) (f64.4+ acc3 (f64.4-row-major-aref array (+ index 8))))
       (acc4 (make-f64.4 0 0 0 0) (f64.4+ acc4 (f64.4-row-major-aref array (+ index 12)))))
      ;; Keep going while a full block remains, i.e. while INDEX + 16 <= N.
      ;; Using > instead of >= here matters when exactly sixteen elements
      ;; are left: they are still handled by the vector loop instead of
      ;; falling through to the scalar tail.
      ((> index (- n 16))
       ;; Scalar tail.  RESULT starts as the sum of all lanes of the
       ;; combined accumulators; the inner INDEX starts where the vector
       ;; loop stopped and runs to the end of the array.
       (do ((result (multiple-value-call #'+ (f64.4-values (f64.4+ acc1 acc2 acc3 acc4)))
                    (+ result (row-major-aref array index)))
            (index index (1+ index)))
           ((>= index n) result))))
; disassembly for simd-sum | |
; Size: 272 bytes. Origin: #x52B83286 ; simd-sum | |
; 286: 488B4AF9 mov RCX, [RDX-7] | |
; 28A: 31C0 xor EAX, EAX | |
; 28C: C5D457ED vxorps YMM5, YMM5, YMM5 | |
; 290: C5F457C9 vxorps YMM1, YMM1, YMM1 | |
; 294: C5EC57D2 vxorps YMM2, YMM2, YMM2 | |
; 298: C5E457DB vxorps YMM3, YMM3, YMM3 | |
; 29C: EB3D jmp L1 | |
; 29E: 6690 nop | |
; 2A0: L0: 488D7020 lea RSI, [RAX+32] | |
; 2A4: C5FD10648201 vmovupd YMM4, [RDX+RAX*4+1] | |
; 2AA: C5D558EC vaddpd YMM5, YMM5, YMM4 | |
; 2AE: 488D5808 lea RBX, [RAX+8] | |
; 2B2: C5FD10449A01 vmovupd YMM0, [RDX+RBX*4+1] | |
; 2B8: C5F558C8 vaddpd YMM1, YMM1, YMM0 | |
; 2BC: 488D5810 lea RBX, [RAX+16] | |
; 2C0: C5FD10449A01 vmovupd YMM0, [RDX+RBX*4+1] | |
; 2C6: C5ED58D0 vaddpd YMM2, YMM2, YMM0 | |
; 2CA: 4883C018 add RAX, 24 | |
; 2CE: C5FD10448201 vmovupd YMM0, [RDX+RAX*4+1] | |
; 2D4: C5E558D8 vaddpd YMM3, YMM3, YMM0 | |
; 2D8: 488BC6 mov RAX, RSI | |
; 2DB: L1: 488D59E0 lea RBX, [RCX-32] | |
; 2DF: 4839D8 cmp RAX, RBX | |
; 2E2: 7CBC jl L0 | |
; 2E4: C5D558E9 vaddpd YMM5, YMM5, YMM1 | |
; 2E8: C5D558EA vaddpd YMM5, YMM5, YMM2 | |
; 2EC: C5D558EB vaddpd YMM5, YMM5, YMM3 | |
; 2F0: C5FC28C5 vmovaps YMM0, YMM5 | |
; 2F4: C5F157C9 vxorpd XMM1, XMM1, XMM1 | |
; 2F8: F20F10C8 movsd XMM1, XMM0 | |
; 2FC: C5FC28C5 vmovaps YMM0, YMM5 | |
; 300: C5FD73D808 vpsrldq YMM0, YMM0, 8 | |
; 305: C5E157DB vxorpd XMM3, XMM3, XMM3 | |
; 309: F20F10D8 movsd XMM3, XMM0 | |
; 30D: C4E37D19E801 vextractf128 YMM0, YMM5, 1 | |
; 313: C5D957E4 vxorpd XMM4, XMM4, XMM4 | |
; 317: F20F10E0 movsd XMM4, XMM0 | |
; 31B: C4E37D19E801 vextractf128 YMM0, YMM5, 1 | |
; 321: C5FD73D808 vpsrldq YMM0, YMM0, 8 | |
; 326: C5D157ED vxorpd XMM5, XMM5, XMM5 | |
; 32A: F20F10E8 movsd XMM5, XMM0 | |
; 32E: F20F58D9 addsd XMM3, XMM1 | |
; 332: F20F58E3 addsd XMM4, XMM3 | |
; 336: F20F58EC addsd XMM5, XMM4 | |
; 33A: EB1A jmp L3 | |
; 33C: 0F1F4000 nop | |
; 340: L2: F20F10548201 movsd XMM2, [RDX+RAX*4+1] | |
; 346: 660F28CD movapd XMM1, XMM5 | |
; 34A: F20F58CA addsd XMM1, XMM2 | |
; 34E: 4883C002 add RAX, 2 | |
; 352: 660F28E9 movapd XMM5, XMM1 | |
; 356: L3: 4839C8 cmp RAX, RCX | |
; 359: 7CE5 jl L2 | |
; 35B: 49896D28 mov [R13+40], RBP ; thread.pseudo-atomic-bits | |
; 35F: 498B5568 mov RDX, [R13+104] ; thread.alloc-region | |
; 363: 4C8D5A10 lea R11, [RDX+16] | |
; 367: 4D3B5D70 cmp R11, [R13+112] | |
; 36B: 771F jnbe L6 | |
; 36D: 4D895D68 mov [R13+104], R11 ; thread.alloc-region | |
; 371: L4: 66C7021D01 mov word ptr [RDX], 285 | |
; 376: 80CA0F or DL, 15 | |
; 379: 49316D28 xor [R13+40], RBP ; thread.pseudo-atomic-bits | |
; 37D: 7402 jeq L5 | |
; 37F: CC09 int3 9 ; pending interrupt trap | |
; 381: L5: F20F116AF9 movsd [RDX-7], XMM5 | |
; 386: 488BE5 mov RSP, RBP | |
; 389: F8 clc | |
; 38A: 5D pop RBP | |
; 38B: C3 ret | |
; 38C: L6: 6A10 push 16 | |
; 38E: E86DCE57FF call #x52100200 ; ALLOC-TRAMP | |
; 393: 5A pop RDX | |
; 394: EBDB jmp L4 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment