Skip to content

Instantly share code, notes, and snippets.

@mayerrobert
Last active April 26, 2023 04:17
Show Gist options
  • Save mayerrobert/4c19600fa2ffc2bfda50b265723963b6 to your computer and use it in GitHub Desktop.
Save mayerrobert/4c19600fa2ffc2bfda50b265723963b6 to your computer and use it in GitHub Desktop.
SBCL 2.3.3 optimizer issue
;;; It seems that this file shows two issues with sbcl:
;;;
;;; (1) sbcl sometimes doesn't find optimization opportunities,
;;; it seems only SIMD code is affected.
;;; (2) Specifying ':fill-pointer nil' to 'make-array'
;;; seems to make a difference, i.e. specifying ':fill-pointer nil'
;;; (which doesn't make sense) considerably improves the generated code.
;;;
;;; Note how the hot loop starting at label L10 (Windows) / L12 (Linux) contains a lot of unnecessary
;;; "MOV [RBP-72], RCX" and "MOV RCX, [RBP-72]" instructions, or "MOV [RBP-48], RAX" twice in a row,
;;; none of which are needed.
;;;
;;; After removing the ';' in front of ':fill-pointer nil' (line 90 of the original file, marked XXX),
;;; and therefore specifying ':fill-pointer nil', these extra instructions are optimized away
;;; and the hot loop starting at label L9 is fast.
;;;
;;; On Windows the inefficient compilation started with this commit:
;;;
;;;
;;; $ git bisect bad
;;; a2c61499aef4b6074a3cda3b694b688013a10389 is the first bad commit
;;; commit a2c61499aef4b6074a3cda3b694b688013a10389
;;; Author: Stas Boukarev <stassats@gmail.com>
;;; Date: Tue Nov 1 00:10:33 2022 +0300
;;;
;;; Enable immobile-space on windows.
;;;
;;; make-config.sh | 2 +-
;;; src/runtime/win32-os.c | 17 ++++++++++++++++-
;;; tests/gc-cardmark.impure.lisp | 3 ++-
;;; tests/x86-64-codegen.impure.lisp | 2 +-
;;; 4 files changed, 20 insertions(+), 4 deletions(-)
;;;
;;;
;;; I'm guessing that this commit was not the actual cause
;;; but rather brought a latent bug into effect.
;;; I was not able to figure out when the inefficient compilation started on Linux,
;;; as sbcl 2.2.6 on Linux already had the undesired behaviour
;;; (and this is the first sbcl with SIMD support, so I can't test with earlier versions).
;;;
;;; Reproducible with sbcl-2.3.3 x64 built with 'sh make.sh --dynamic-space-size=4Gb' on Windows 10
;;; as well as with the current HEAD 2.3.3.85-c3a597322 on Windows 10 and on Linux.
;; Compile the rest of this file for maximum speed, with safety and
;; compilation speed both set to 0.
(declaim (optimize (safety 0)
                   (compilation-speed 0)
                   (speed 3)))

;; Load the sb-simd module at compile time, at load time and when evaluated,
;; so the SB-SIMD-AVX symbols used below can be read and compiled.
(eval-when (:compile-toplevel :load-toplevel :execute)
  (require "sb-simd"))
(deftype array-index ()
  "A non-negative integer strictly below ARRAY-DIMENSION-LIMIT."
  (list 'integer 0 (list array-dimension-limit)))
(deftype array-element ()
  "Element type of the matrices handled in this file: SINGLE-FLOAT."
  'single-float)
(deftype simd-vector ()
  "Alias for the type specifier (SB-SIMD-AVX:F32.8)."
  (list 'sb-simd-avx:f32.8))
(defmacro simd-vector (&rest args)
  "Expand into a call of SB-SIMD-AVX:F32.8 on ARGS."
  (cons 'sb-simd-avx:f32.8 args))
(defmacro simd-+ (vec val)
  "Expand into (SB-SIMD-AVX:F32.8+ VEC VAL)."
  (list 'sb-simd-avx:f32.8+ vec val))
(defmacro simd-* (vec val)
  "Expand into (SB-SIMD-AVX:F32.8* VEC VAL)."
  (list 'sb-simd-avx:f32.8* vec val))
(defmacro simd-row-major-aref (vec &rest subscripts)
  "Expand into (SB-SIMD-AVX:F32.8-ROW-MAJOR-AREF VEC . SUBSCRIPTS).
Because the expansion is a plain call form, the macro form can also be
used as a SETF place, as the matrix multiplication below does."
  (list* 'sb-simd-avx:f32.8-row-major-aref vec subscripts))
(defconstant +iblock-size+ 16
  "Step of the outer row-block loop in MATRIX-MULTIPLY-SIMD-UNROLLED-BLOCKED.")

(defconstant +col-block-size+ 256
  "Step of the column-block loop in MATRIX-MULTIPLY-SIMD-UNROLLED-BLOCKED.")

(defconstant +simd-size+ 8
  "Index increment between two consecutive SIMD-ROW-MAJOR-AREF accesses.")

(defconstant +unroll-simd+ 4
  "Number of SIMD multiply-add steps emitted per iteration of the unrolled loop.")
(defun matrix-multiply-simd-unrolled-blocked (a b)
  "Performs matrix multiplication of two arrays.

A and B must be rank-2 simple arrays of ARRAY-ELEMENT, and the second
dimension of A must equal the first dimension of B.  Returns a freshly
allocated array whose dimensions are (rows of A) x (columns of B).
A and B are only read.  If the shapes do not match, a correctable error is
signalled via ASSERT with A and B as the places that may be replaced."
  (declare (type (simple-array array-element 2) a b))
  (assert (and (= (array-rank a) (array-rank b) 2)
               (= (array-dimension a 1) (array-dimension b 0)))
          (a b)
          "Cannot multiply ~S by ~S." a b)
  (let* ((m (array-dimension a 0))      ; rows of A and of the result
         (n (array-dimension b 1))      ; columns of B and of the result
         (p (array-dimension a 1))      ; columns of A = rows of B
         (result (make-array (list m n) :element-type 'array-element
                                        ;:fill-pointer nil ; XXX
                                        :initial-element 0.0)))
    (declare (array-index m n p))
    ;; The result is processed in tiles of at most +IBLOCK-SIZE+ rows by
    ;; +COL-BLOCK-SIZE+ columns; ILIMIT / JLIMIT are the exclusive upper
    ;; bounds of the current tile, clamped to the matrix size.
    (do ((iblock 0 (+ iblock +iblock-size+)))
        ((>= iblock m))
      (let ((ilimit (min m (+ iblock +iblock-size+))))
        (do ((jblock 0 (+ jblock +col-block-size+)))
            ((>= jblock n))
          (let ((jlimit (min n (+ jblock +col-block-size+))))
            (dotimes (k p)
              (declare (array-index k))
              (do ((i iblock (1+ i)))
                  ((>= i ilimit))
                (declare (array-index i))
                ;; For fixed I and K, add A[i,k] * B[k,j] to RESULT[i,j] for
                ;; every column j of the current tile.  RESULT-IDX and B-IDX
                ;; are row-major indices that advance in lockstep with J.
                ;; They must start at column JBLOCK (not column 0): otherwise
                ;; every tile after the first would update columns
                ;; 0..(+COL-BLOCK-SIZE+ - 1) again and leave its own columns
                ;; untouched.
                (let* ((tmp (aref a i k))
                       (vtmp (simd-vector tmp))
                       (result-idx (array-row-major-index result i jblock))
                       (b-idx (array-row-major-index b k jblock)))
                  (declare (simd-vector vtmp) (array-element tmp) (array-index result-idx b-idx jlimit))
                  ;; Main loop: (* +UNROLL-SIMD+ +SIMD-SIZE+) columns per
                  ;; iteration.  Its result form handles the remainder: first
                  ;; one SIMD step at a time, then one element at a time.
                  (do ((j jblock (+ j (* +unroll-simd+ +simd-size+))))
                      ((> j (- jlimit (* +unroll-simd+ +simd-size+)))
                       (do ((jj j (+ jj +simd-size+)))
                           ((> jj (- jlimit +simd-size+))
                            ;; Scalar tail for the last (- JLIMIT JJ) columns.
                            (loop repeat (- jlimit jj) do
                              (incf (row-major-aref result result-idx) (* tmp (row-major-aref b b-idx)))
                              (incf result-idx)
                              (incf b-idx)))
                         ;; #10= / #10# make the reader use the same place
                         ;; form for both the read and the write.
                         (setf #10=(simd-row-major-aref result result-idx)
                               (simd-+ #10# (simd-* (simd-row-major-aref b b-idx) vtmp)))
                         (incf result-idx +simd-size+)
                         (incf b-idx +simd-size+)))
                    ;; The unrolled body is generated at read time: one
                    ;; multiply-add form per value of U below +UNROLL-SIMD+.
                    #.`(progn ,@(loop for u below +unroll-simd+
                                      collect `(let ((tmpidx (+ (* ,u +simd-size+) result-idx)))
                                                 (declare (array-index tmpidx))
                                                 (setf (simd-row-major-aref result tmpidx)
                                                       (simd-+ (simd-row-major-aref result tmpidx)
                                                               (simd-* (simd-row-major-aref b (+ (* ,u +simd-size+) b-idx)) vtmp))))))
                    (incf result-idx (* +unroll-simd+ +simd-size+))
                    (incf b-idx (* +unroll-simd+ +simd-size+))))))))))
    result))
;; Print the machine code generated for the function, followed by the name
;; and version of the Lisp implementation that produced it.
(disassemble 'matrix-multiply-simd-unrolled-blocked)
(let ((implementation (lisp-implementation-type))
      (version (lisp-implementation-version)))
  (format t "Using: ~a ~a~%" implementation version))
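; --- Listing 1: code as shown above, WITHOUT ':fill-pointer nil' (redundant stack MOVs in the hot loop at label L10) ---
; disassembly for MATRIX-MULTIPLY-SIMD-UNROLLED-BLOCKED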
; Size: 1253 bytes. Origin: #x228D7D74 ; MATRIX-MULTIPLY-SIMD-UNROLLED-BLOCKED
; 7D74: 840425F8FF0120 TEST AL, [#x2001FFF8] ; safepoint
; 7D7B: E9A6000000 JMP L1
; 7D80: L0: 4C8945C0 MOV [RBP-64], R8
; 7D84: 4C897DC8 MOV [RBP-56], R15
; 7D88: 498BC7 MOV RAX, R15
; 7D8B: 498BC8 MOV RCX, R8
; 7D8E: 488D5C24F0 LEA RBX, [RSP-16]
; 7D93: 4883EC28 SUB RSP, 40
; 7D97: 488B1582FFFFFF MOV RDX, [RIP-126] ; '(AND
; (= ..))
; 7D9E: 488B3D83FFFFFF MOV RDI, [RIP-125] ; '(A B)
; 7DA5: 488B3584FFFFFF MOV RSI, [RIP-124] ; "Cannot multiply ~S by ~S."
; 7DAC: 488943F0 MOV [RBX-16], RAX
; 7DB0: 48894BE8 MOV [RBX-24], RCX
; 7DB4: B90A000000 MOV ECX, 10
; 7DB9: 48892B MOV [RBX], RBP
; 7DBC: 488BEB MOV RBP, RBX
; 7DBF: B8E2D92620 MOV EAX, #x2026D9E2 ; #<FDEFN SB-KERNEL:ASSERT-ERROR>
; 7DC4: FFD0 CALL RAX
; 7DC6: 4C8B7DC8 MOV R15, [RBP-56]
; 7DCA: 498BFF MOV RDI, R15
; 7DCD: 4883EC10 SUB RSP, 16
; 7DD1: 488B1560FFFFFF MOV RDX, [RIP-160] ; 'A
; 7DD8: B904000000 MOV ECX, 4
; 7DDD: 48892C24 MOV [RSP], RBP
; 7DE1: 488BEC MOV RBP, RSP
; 7DE4: B8C2DD2B20 MOV EAX, #x202BDDC2 ; #<FDEFN SB-IMPL::ASSERT-PROMPT>
; 7DE9: FFD0 CALL RAX
; 7DEB: 480F42E3 CMOVB RSP, RBX
; 7DEF: 4C8B45C0 MOV R8, [RBP-64]
; 7DF3: 4C8BFA MOV R15, RDX
; 7DF6: 4C897DC8 MOV [RBP-56], R15
; 7DFA: 498BF8 MOV RDI, R8
; 7DFD: 4883EC10 SUB RSP, 16
; 7E01: 488B1538FFFFFF MOV RDX, [RIP-200] ; 'B
; 7E08: B904000000 MOV ECX, 4
; 7E0D: 48892C24 MOV [RSP], RBP
; 7E11: 488BEC MOV RBP, RSP
; 7E14: B8C2DD2B20 MOV EAX, #x202BDDC2 ; #<FDEFN SB-IMPL::ASSERT-PROMPT>
; 7E19: FFD0 CALL RAX
; 7E1B: 480F42E3 CMOVB RSP, RBX
; 7E1F: 4C8B7DC8 MOV R15, [RBP-56]
; 7E23: 4C8BC2 MOV R8, RDX
; 7E26: L1: 840425F8FF0120 TEST AL, [#x2001FFF8] ; safepoint
; 7E2D: 498B5731 MOV RDX, [R15+49]
; 7E31: 498B4829 MOV RCX, [R8+41]
; 7E35: 4839D1 CMP RCX, RDX
; 7E38: 0F8542FFFFFF JNE L0
; 7E3E: 498B7F29 MOV RDI, [R15+41]
; 7E42: 48897DF8 MOV [RBP-8], RDI
; 7E46: 4D8B4831 MOV R9, [R8+49]
; 7E4A: 4C894DF0 MOV [RBP-16], R9
; 7E4E: 498B4731 MOV RAX, [R15+49]
; 7E52: 488945E8 MOV [RBP-24], RAX
; 7E56: 488BD7 MOV RDX, RDI
; 7E59: 48D1FA SAR RDX, 1
; 7E5C: 490FAFD1 IMUL RDX, R9
; 7E60: 488D4A02 LEA RCX, [RDX+2]
; 7E64: 48D1F9 SAR RCX, 1
; 7E67: 4883E1FE AND RCX, -2
; 7E6B: 488D1C8D1F000000 LEA RBX, [RCX*4+31]
; 7E73: 4883E3F0 AND RBX, -16
; 7E77: 498B4570 MOV RAX, [R13+112] ; thread.mixed-tlab
; 7E7B: 480FC1D8 XADD RAX, RBX
; 7E7F: 493B4578 CMP RAX, [R13+120]
; 7E83: 0F87B0030000 JNBE L24
; 7E89: 49894570 MOV [R13+112], RAX ; thread.mixed-tlab
; 7E8D: L2: C603D1 MOV BYTE PTR [RBX], -47
; 7E90: 48895308 MOV [RBX+8], RDX
; 7E94: 80CB0F OR BL, 15
; 7E97: 840425F8FF0120 TEST AL, [#x2001FFF8] ; safepoint
; 7E9E: 498B7570 MOV RSI, [R13+112] ; thread.mixed-tlab
; 7EA2: 488D4650 LEA RAX, [RSI+80]
; 7EA6: 493B4578 CMP RAX, [R13+120]
; 7EAA: 0F879A030000 JNBE L25
; 7EB0: 49894570 MOV [R13+112], RAX ; thread.mixed-tlab
; 7EB4: L3: 66C7068101 MOV WORD PTR [RSI], 385
; 7EB9: 4080CE0F OR SIL, 15
; 7EBD: 840425F8FF0120 TEST AL, [#x2001FFF8] ; safepoint
; 7EC4: 488956F9 MOV [RSI-7], RDX
; 7EC8: 48895601 MOV [RSI+1], RDX
; 7ECC: 48895E09 MOV [RSI+9], RBX
; 7ED0: 48C7461917010220 MOV QWORD PTR [RSI+25], #x20020117 ; NIL
; 7ED8: 48C7462117010220 MOV QWORD PTR [RSI+33], #x20020117 ; NIL
; 7EE0: 48897E29 MOV [RSI+41], RDI
; 7EE4: 4C894E31 MOV [RSI+49], R9
; 7EE8: 31C0 XOR EAX, EAX
; 7EEA: 488945E0 MOV [RBP-32], RAX
; 7EEE: E912030000 JMP L20
; 7EF3: 660F1F840000000000 NOP
; 7EFC: 0F1F4000 NOP
; 7F00: L4: 488B4DE0 MOV RCX, [RBP-32]
; 7F04: 48D1E1 SHL RCX, 1
; 7F07: 483B4DF8 CMP RCX, [RBP-8]
; 7F0B: 0F8D09030000 JNL L21
; 7F11: 488B4DE0 MOV RCX, [RBP-32]
; 7F15: 4883C110 ADD RCX, 16
; 7F19: 488BC1 MOV RAX, RCX
; 7F1C: 48C1E83E SHR RAX, 62
; 7F20: 0F8506030000 JNE L23
; 7F26: 488BD1 MOV RDX, RCX
; 7F29: 48D1E2 SHL RDX, 1
; 7F2C: 483955F8 CMP [RBP-8], RDX
; 7F30: 0F8EF6020000 JLE L23
; 7F36: 48D1E1 SHL RCX, 1
; 7F39: 48894DD8 MOV [RBP-40], RCX
; 7F3D: L5: 4531F6 XOR R14D, R14D
; 7F40: E9A0020000 JMP L18
; 7F45: 660F1F840000000000 NOP
; 7F4E: 6690 NOP
; 7F50: L6: 4B8D0C36 LEA RCX, [R14+R14]
; 7F54: 483B4DF0 CMP RCX, [RBP-16]
; 7F58: 0F8D9B020000 JNL L19
; 7F5E: 498BCE MOV RCX, R14
; 7F61: 4881C100010000 ADD RCX, 256
; 7F68: 488BC1 MOV RAX, RCX
; 7F6B: 48C1E83E SHR RAX, 62
; 7F6F: 0F85AE020000 JNE L22
; 7F75: 488BD1 MOV RDX, RCX
; 7F78: 48D1E2 SHL RDX, 1
; 7F7B: 483955F0 CMP [RBP-16], RDX
; 7F7F: 0F8E9E020000 JLE L22
; 7F85: 4C8D1C09 LEA R11, [RCX+RCX]
; 7F89: L7: 4531D2 XOR R10D, R10D
; 7F8C: E936020000 JMP L17
; 7F91: 660F1F840000000000 NOP
; 7F9A: 660F1F440000 NOP
; 7FA0: L8: 488B45E0 MOV RAX, [RBP-32]
; 7FA4: 48D1E0 SHL RAX, 1
; 7FA7: 488BF8 MOV RDI, RAX
; 7FAA: E903020000 JMP L16
; 7FAF: 90 NOP
; 7FB0: L9: 498B4F31 MOV RCX, [R15+49]
; 7FB4: 48D1F9 SAR RCX, 1
; 7FB7: 480FAFCF IMUL RCX, RDI
; 7FBB: 4C01D1 ADD RCX, R10
; 7FBE: 498B4709 MOV RAX, [R15+9]
; 7FC2: F30F10444801 MOVSS XMM0, [RAX+RCX*2+1]
; 7FC8: 0F28D8 MOVAPS XMM3, XMM0
; 7FCB: C4E27D18D0 VBROADCASTSS YMM2, XMM0
; 7FD0: 488B4E31 MOV RCX, [RSI+49]
; 7FD4: 48D1F9 SAR RCX, 1
; 7FD7: 480FAFCF IMUL RCX, RDI
; 7FDB: 488BD1 MOV RDX, RCX
; 7FDE: 498B4831 MOV RCX, [R8+49]
; 7FE2: 48D1F9 SAR RCX, 1
; 7FE5: 490FAFCA IMUL RCX, R10
; 7FE9: 488BD9 MOV RBX, RCX
; 7FEC: 4B8D0436 LEA RAX, [R14+R14]
; 7FF0: 4C8BC8 MOV R9, RAX
; 7FF3: E912010000 JMP L11
; 7FF8: 0F1F840000000000 NOP
; 8000: L10: 488B4609 MOV RAX, [RSI+9]
; 8004: C5FC104C5001 VMOVUPS YMM1, [RAX+RDX*2+1]
; 800A: 498B4009 MOV RAX, [R8+9]
; 800E: C5FC10445801 VMOVUPS YMM0, [RAX+RBX*2+1]
; 8014: C5FC59C2 VMULPS YMM0, YMM0, YMM2
; 8018: C5F458C0 VADDPS YMM0, YMM1, YMM0
; 801C: 488B4609 MOV RAX, [RSI+9]
; 8020: C5FC11445001 VMOVUPS [RAX+RDX*2+1], YMM0
; 8026: 488D4A10 LEA RCX, [RDX+16]
; 802A: 48894DB8 MOV [RBP-72], RCX
; 802E: 488B4609 MOV RAX, [RSI+9]
; 8032: 488B4DB8 MOV RCX, [RBP-72]
; 8036: C5FC104C4801 VMOVUPS YMM1, [RAX+RCX*2+1]
; 803C: 488D4310 LEA RAX, [RBX+16]
; 8040: 488945D0 MOV [RBP-48], RAX
; 8044: 488945D0 MOV [RBP-48], RAX
; 8048: 498B4009 MOV RAX, [R8+9]
; 804C: 488B4DD0 MOV RCX, [RBP-48]
; 8050: C5FC10444801 VMOVUPS YMM0, [RAX+RCX*2+1]
; 8056: C5FC59C2 VMULPS YMM0, YMM0, YMM2
; 805A: C5F458C0 VADDPS YMM0, YMM1, YMM0
; 805E: 488B4609 MOV RAX, [RSI+9]
; 8062: 488B4DB8 MOV RCX, [RBP-72]
; 8066: C5FC11444801 VMOVUPS [RAX+RCX*2+1], YMM0
; 806C: 488D4A20 LEA RCX, [RDX+32]
; 8070: 48894DB8 MOV [RBP-72], RCX
; 8074: 488B4609 MOV RAX, [RSI+9]
; 8078: 488B4DB8 MOV RCX, [RBP-72]
; 807C: C5FC104C4801 VMOVUPS YMM1, [RAX+RCX*2+1]
; 8082: 488D4320 LEA RAX, [RBX+32]
; 8086: 488945D0 MOV [RBP-48], RAX
; 808A: 488945D0 MOV [RBP-48], RAX
; 808E: 498B4009 MOV RAX, [R8+9]
; 8092: 488B4DD0 MOV RCX, [RBP-48]
; 8096: C5FC10444801 VMOVUPS YMM0, [RAX+RCX*2+1]
; 809C: C5FC59C2 VMULPS YMM0, YMM0, YMM2
; 80A0: C5F458C0 VADDPS YMM0, YMM1, YMM0
; 80A4: 488B4609 MOV RAX, [RSI+9]
; 80A8: 488B4DB8 MOV RCX, [RBP-72]
; 80AC: C5FC11444801 VMOVUPS [RAX+RCX*2+1], YMM0
; 80B2: 488D4A30 LEA RCX, [RDX+48]
; 80B6: 48894DB8 MOV [RBP-72], RCX
; 80BA: 488B4609 MOV RAX, [RSI+9]
; 80BE: 488B4DB8 MOV RCX, [RBP-72]
; 80C2: C5FC104C4801 VMOVUPS YMM1, [RAX+RCX*2+1]
; 80C8: 488D4330 LEA RAX, [RBX+48]
; 80CC: 488945D0 MOV [RBP-48], RAX
; 80D0: 488945D0 MOV [RBP-48], RAX
; 80D4: 498B4009 MOV RAX, [R8+9]
; 80D8: 488B4DD0 MOV RCX, [RBP-48]
; 80DC: C5FC10444801 VMOVUPS YMM0, [RAX+RCX*2+1]
; 80E2: C5FC59C2 VMULPS YMM0, YMM0, YMM2
; 80E6: C5F458C0 VADDPS YMM0, YMM1, YMM0
; 80EA: 488B4609 MOV RAX, [RSI+9]
; 80EE: 488B4DB8 MOV RCX, [RBP-72]
; 80F2: C5FC11444801 VMOVUPS [RAX+RCX*2+1], YMM0
; 80F8: 488D4A40 LEA RCX, [RDX+64]
; 80FC: 488BD1 MOV RDX, RCX
; 80FF: 488D4B40 LEA RCX, [RBX+64]
; 8103: 488BD9 MOV RBX, RCX
; 8106: 4983C140 ADD R9, 64
; 810A: L11: 840425F8FF0120 TEST AL, [#x2001FFF8] ; safepoint
; 8111: 498D4BC0 LEA RCX, [R11-64]
; 8115: 4939C9 CMP R9, RCX
; 8118: 0F8EE2FEFFFF JLE L10
; 811E: EB38 JMP L13
; 8120: L12: 488B4609 MOV RAX, [RSI+9]
; 8124: C5FC104C5001 VMOVUPS YMM1, [RAX+RDX*2+1]
; 812A: 498B4009 MOV RAX, [R8+9]
; 812E: C5FC10445801 VMOVUPS YMM0, [RAX+RBX*2+1]
; 8134: C5FC59C2 VMULPS YMM0, YMM0, YMM2
; 8138: C5F458C0 VADDPS YMM0, YMM1, YMM0
; 813C: 488B4609 MOV RAX, [RSI+9]
; 8140: C5FC11445001 VMOVUPS [RAX+RDX*2+1], YMM0
; 8146: 488D4A10 LEA RCX, [RDX+16]
; 814A: 488BD1 MOV RDX, RCX
; 814D: 488D4B10 LEA RCX, [RBX+16]
; 8151: 488BD9 MOV RBX, RCX
; 8154: 4983C110 ADD R9, 16
; 8158: L13: 840425F8FF0120 TEST AL, [#x2001FFF8] ; safepoint
; 815F: 498D4BF0 LEA RCX, [R11-16]
; 8163: 4939C9 CMP R9, RCX
; 8166: 7EB8 JLE L12
; 8168: 498BCB MOV RCX, R11
; 816B: 4C29C9 SUB RCX, R9
; 816E: EB32 JMP L15
; 8170: L14: 4883C1FE ADD RCX, -2
; 8174: 498B4009 MOV RAX, [R8+9]
; 8178: F30F104C5801 MOVSS XMM1, [RAX+RBX*2+1]
; 817E: F30F59CB MULSS XMM1, XMM3
; 8182: 488B4609 MOV RAX, [RSI+9]
; 8186: F30F10545001 MOVSS XMM2, [RAX+RDX*2+1]
; 818C: F30F58D1 ADDSS XMM2, XMM1
; 8190: 488B4609 MOV RAX, [RSI+9]
; 8194: F30F11545001 MOVSS [RAX+RDX*2+1], XMM2
; 819A: 4883C202 ADD RDX, 2
; 819E: 4883C302 ADD RBX, 2
; 81A2: L15: 840425F8FF0120 TEST AL, [#x2001FFF8] ; safepoint
; 81A9: 4885C9 TEST RCX, RCX
; 81AC: 7FC2 JNLE L14
; 81AE: 4883C702 ADD RDI, 2
; 81B2: L16: 840425F8FF0120 TEST AL, [#x2001FFF8] ; safepoint
; 81B9: 483B7DD8 CMP RDI, [RBP-40]
; 81BD: 0F8CEDFDFFFF JL L9
; 81C3: 4983C202 ADD R10, 2
; 81C7: L17: 840425F8FF0120 TEST AL, [#x2001FFF8] ; safepoint
; 81CE: 4C3B55E8 CMP R10, [RBP-24]
; 81D2: 0F8CC8FDFFFF JL L8
; 81D8: 498BCE MOV RCX, R14
; 81DB: 4881C100010000 ADD RCX, 256
; 81E2: 4C8BF1 MOV R14, RCX
; 81E5: L18: 840425F8FF0120 TEST AL, [#x2001FFF8] ; safepoint
; 81EC: 498BC6 MOV RAX, R14
; 81EF: 48C1E83E SHR RAX, 62
; 81F3: 0F8457FDFFFF JEQ L6
; 81F9: L19: 488B4DE0 MOV RCX, [RBP-32]
; 81FD: 4883C110 ADD RCX, 16
; 8201: 48894DE0 MOV [RBP-32], RCX
; 8205: L20: 840425F8FF0120 TEST AL, [#x2001FFF8] ; safepoint
; 820C: 488B45E0 MOV RAX, [RBP-32]
; 8210: 48C1E83E SHR RAX, 62
; 8214: 0F84E6FCFFFF JEQ L4
; 821A: L21: 488BD6 MOV RDX, RSI
; 821D: 488BE5 MOV RSP, RBP
; 8220: F8 CLC
; 8221: 5D POP RBP
; 8222: C3 RET
; 8223: L22: 4C8B5DF0 MOV R11, [RBP-16]
; 8227: E95DFDFFFF JMP L7
; 822C: L23: 488B45F8 MOV RAX, [RBP-8]
; 8230: 488945D8 MOV [RBP-40], RAX
; 8234: E904FDFFFF JMP L5
; 8239: L24: 4829D8 SUB RAX, RBX
; 823C: 50 PUSH RAX
; 823D: FF142528050220 CALL [#x20020528] ; #x21A204F0: ALLOC-TRAMP
; 8244: 5B POP RBX
; 8245: E943FCFFFF JMP L2
; 824A: L25: 6A50 PUSH 80
; 824C: FF142528050220 CALL [#x20020528] ; #x21A204F0: ALLOC-TRAMP
; 8253: 5E POP RSI
; 8254: E95BFCFFFF JMP L3
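; --- Listing 2: WITH ':fill-pointer nil' enabled (array built by a call to SB-KERNEL:%MAKE-ARRAY; hot loop at label L9) ---
; disassembly for MATRIX-MULTIPLY-SIMD-UNROLLED-BLOCKED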
; Size: 1171 bytes. Origin: #x228D7D95 ; MATRIX-MULTIPLY-SIMD-UNROLLED-BLOCKED
; 7D95: 840425F8FF0120 TEST AL, [#x2001FFF8] ; safepoint
; 7D9C: E9A8000000 JMP L1
; 7DA1: 660F1F840000000000 NOP
; 7DAA: 660F1F440000 NOP
; 7DB0: L0: 4C8945C8 MOV [RBP-56], R8
; 7DB4: 488B45F8 MOV RAX, [RBP-8]
; 7DB8: 498BC8 MOV RCX, R8
; 7DBB: 488D5C24F0 LEA RBX, [RSP-16]
; 7DC0: 4883EC28 SUB RSP, 40
; 7DC4: 488B155DFFFFFF MOV RDX, [RIP-163] ; '(AND
; (= ..))
; 7DCB: 488B3D5EFFFFFF MOV RDI, [RIP-162] ; '(A B)
; 7DD2: 488B355FFFFFFF MOV RSI, [RIP-161] ; "Cannot multiply ~S by ~S."
; 7DD9: 488943F0 MOV [RBX-16], RAX
; 7DDD: 48894BE8 MOV [RBX-24], RCX
; 7DE1: B90A000000 MOV ECX, 10
; 7DE6: 48892B MOV [RBX], RBP
; 7DE9: 488BEB MOV RBP, RBX
; 7DEC: B8E2D92620 MOV EAX, #x2026D9E2 ; #<FDEFN SB-KERNEL:ASSERT-ERROR>
; 7DF1: FFD0 CALL RAX
; 7DF3: 488B7DF8 MOV RDI, [RBP-8]
; 7DF7: 4883EC10 SUB RSP, 16
; 7DFB: 488B153EFFFFFF MOV RDX, [RIP-194] ; 'A
; 7E02: B904000000 MOV ECX, 4
; 7E07: 48892C24 MOV [RSP], RBP
; 7E0B: 488BEC MOV RBP, RSP
; 7E0E: B8C2DD2B20 MOV EAX, #x202BDDC2 ; #<FDEFN SB-IMPL::ASSERT-PROMPT>
; 7E13: FFD0 CALL RAX
; 7E15: 480F42E3 CMOVB RSP, RBX
; 7E19: 4C8B45C8 MOV R8, [RBP-56]
; 7E1D: 488955F8 MOV [RBP-8], RDX
; 7E21: 498BF8 MOV RDI, R8
; 7E24: 4883EC10 SUB RSP, 16
; 7E28: 488B1519FFFFFF MOV RDX, [RIP-231] ; 'B
; 7E2F: B904000000 MOV ECX, 4
; 7E34: 48892C24 MOV [RSP], RBP
; 7E38: 488BEC MOV RBP, RSP
; 7E3B: B8C2DD2B20 MOV EAX, #x202BDDC2 ; #<FDEFN SB-IMPL::ASSERT-PROMPT>
; 7E40: FFD0 CALL RAX
; 7E42: 480F42E3 CMOVB RSP, RBX
; 7E46: 4C8BC2 MOV R8, RDX
; 7E49: L1: 840425F8FF0120 TEST AL, [#x2001FFF8] ; safepoint
; 7E50: 488B45F8 MOV RAX, [RBP-8]
; 7E54: 488B5031 MOV RDX, [RAX+49]
; 7E58: 498B4829 MOV RCX, [R8+41]
; 7E5C: 4839D1 CMP RCX, RDX
; 7E5F: 0F854BFFFFFF JNE L0
; 7E65: 4C8945C8 MOV [RBP-56], R8
; 7E69: 488B45F8 MOV RAX, [RBP-8]
; 7E6D: 488B4829 MOV RCX, [RAX+41]
; 7E71: 48894DF0 MOV [RBP-16], RCX
; 7E75: 498B5831 MOV RBX, [R8+49]
; 7E79: 48895DE8 MOV [RBP-24], RBX
; 7E7D: 488B45F8 MOV RAX, [RBP-8]
; 7E81: 488B4031 MOV RAX, [RAX+49]
; 7E85: 488945E0 MOV [RBP-32], RAX
; 7E89: 498B5558 MOV RDX, [R13+88] ; thread.cons-tlab
; 7E8D: 488D4220 LEA RAX, [RDX+32]
; 7E91: 493B4560 CMP RAX, [R13+96]
; 7E95: 0F877E030000 JNBE L23
; 7E9B: 49894558 MOV [R13+88], RAX ; thread.cons-tlab
; 7E9F: L2: 48890A MOV [RDX], RCX
; 7EA2: 48895A10 MOV [RDX+16], RBX
; 7EA6: 48C7421817010220 MOV QWORD PTR [RDX+24], #x20020117 ; NIL
; 7EAE: 488D4217 LEA RAX, [RDX+23]
; 7EB2: 48894208 MOV [RDX+8], RAX
; 7EB6: 80CA07 OR DL, 7
; 7EB9: 840425F8FF0120 TEST AL, [#x2001FFF8] ; safepoint
; 7EC0: 488D5C24F0 LEA RBX, [RSP-16]
; 7EC5: 4883EC38 SUB RSP, 56
; 7EC9: BFA2010000 MOV EDI, 418
; 7ECE: BE0A000000 MOV ESI, 10
; 7ED3: 48C743F09F691420 MOV QWORD PTR [RBX-16], #x2014699F ; ':ADJUSTABLE
; 7EDB: 48C743E817010220 MOV QWORD PTR [RBX-24], #x20020117 ; NIL
; 7EE3: 48C743E0CF691420 MOV QWORD PTR [RBX-32], #x201469CF ; ':FILL-POINTER
; 7EEB: 48C743D817010220 MOV QWORD PTR [RBX-40], #x20020117 ; NIL
; 7EF3: B90E000000 MOV ECX, 14
; 7EF8: 48892B MOV [RBX], RBP
; 7EFB: 488BEB MOV RBP, RBX
; 7EFE: B882BF2820 MOV EAX, #x2028BF82 ; #<FDEFN SB-KERNEL:%MAKE-ARRAY>
; 7F03: FFD0 CALL RAX
; 7F05: 4C8B45C8 MOV R8, [RBP-56]
; 7F09: 488BF2 MOV RSI, RDX
; 7F0C: 31C0 XOR EAX, EAX
; 7F0E: 488945D8 MOV [RBP-40], RAX
; 7F12: E9CE020000 JMP L19
; 7F17: 660F1F840000000000 NOP
; 7F20: L3: 488B4DD8 MOV RCX, [RBP-40]
; 7F24: 48D1E1 SHL RCX, 1
; 7F27: 483B4DF0 CMP RCX, [RBP-16]
; 7F2B: 0F8DC9020000 JNL L20
; 7F31: 488B4DD8 MOV RCX, [RBP-40]
; 7F35: 4883C110 ADD RCX, 16
; 7F39: 488BC1 MOV RAX, RCX
; 7F3C: 48C1E83E SHR RAX, 62
; 7F40: 0F85C6020000 JNE L22
; 7F46: 488BD1 MOV RDX, RCX
; 7F49: 48D1E2 SHL RDX, 1
; 7F4C: 483955F0 CMP [RBP-16], RDX
; 7F50: 0F8EB6020000 JLE L22
; 7F56: 48D1E1 SHL RCX, 1
; 7F59: 48894DD0 MOV [RBP-48], RCX
; 7F5D: L4: 4531F6 XOR R14D, R14D
; 7F60: E960020000 JMP L17
; 7F65: 660F1F840000000000 NOP
; 7F6E: 6690 NOP
; 7F70: L5: 4B8D0C36 LEA RCX, [R14+R14]
; 7F74: 483B4DE8 CMP RCX, [RBP-24]
; 7F78: 0F8D5B020000 JNL L18
; 7F7E: 498BCE MOV RCX, R14
; 7F81: 4881C100010000 ADD RCX, 256
; 7F88: 488BC1 MOV RAX, RCX
; 7F8B: 48C1E83E SHR RAX, 62
; 7F8F: 0F856E020000 JNE L21
; 7F95: 488BD1 MOV RDX, RCX
; 7F98: 48D1E2 SHL RDX, 1
; 7F9B: 483955E8 CMP [RBP-24], RDX
; 7F9F: 0F8E5E020000 JLE L21
; 7FA5: 4C8D1C09 LEA R11, [RCX+RCX]
; 7FA9: L6: 4531D2 XOR R10D, R10D
; 7FAC: E9F6010000 JMP L16
; 7FB1: 660F1F840000000000 NOP
; 7FBA: 660F1F440000 NOP
; 7FC0: L7: 488B45D8 MOV RAX, [RBP-40]
; 7FC4: 48D1E0 SHL RAX, 1
; 7FC7: 488BF8 MOV RDI, RAX
; 7FCA: E9C3010000 JMP L15
; 7FCF: 90 NOP
; 7FD0: L8: 488B45F8 MOV RAX, [RBP-8]
; 7FD4: 488B4831 MOV RCX, [RAX+49]
; 7FD8: 48D1F9 SAR RCX, 1
; 7FDB: 480FAFCF IMUL RCX, RDI
; 7FDF: 4C01D1 ADD RCX, R10
; 7FE2: 488B45F8 MOV RAX, [RBP-8]
; 7FE6: 488B4009 MOV RAX, [RAX+9]
; 7FEA: F30F10444801 MOVSS XMM0, [RAX+RCX*2+1]
; 7FF0: 0F28D8 MOVAPS XMM3, XMM0
; 7FF3: C4E27D18D0 VBROADCASTSS YMM2, XMM0
; 7FF8: 488B4E31 MOV RCX, [RSI+49]
; 7FFC: 48D1F9 SAR RCX, 1
; 7FFF: 480FAFCF IMUL RCX, RDI
; 8003: 488BD1 MOV RDX, RCX
; 8006: 498B4831 MOV RCX, [R8+49]
; 800A: 48D1F9 SAR RCX, 1
; 800D: 490FAFCA IMUL RCX, R10
; 8011: 488BD9 MOV RBX, RCX
; 8014: 4B8D0436 LEA RAX, [R14+R14]
; 8018: 4C8BC8 MOV R9, RAX
; 801B: E9C8000000 JMP L10
; 8020: L9: 488B4609 MOV RAX, [RSI+9]
; 8024: C5FC104C5001 VMOVUPS YMM1, [RAX+RDX*2+1]
; 802A: 498B4009 MOV RAX, [R8+9]
; 802E: C5FC10445801 VMOVUPS YMM0, [RAX+RBX*2+1]
; 8034: C5FC59C2 VMULPS YMM0, YMM0, YMM2
; 8038: C5F458C0 VADDPS YMM0, YMM1, YMM0
; 803C: 488B4609 MOV RAX, [RSI+9]
; 8040: C5FC11445001 VMOVUPS [RAX+RDX*2+1], YMM0
; 8046: 4C8D7A10 LEA R15, [RDX+16]
; 804A: 488B4609 MOV RAX, [RSI+9]
; 804E: C4A17C104C7801 VMOVUPS YMM1, [RAX+R15*2+1]
; 8055: 488D4B10 LEA RCX, [RBX+16]
; 8059: 498B4009 MOV RAX, [R8+9]
; 805D: C5FC10444801 VMOVUPS YMM0, [RAX+RCX*2+1]
; 8063: C5FC59C2 VMULPS YMM0, YMM0, YMM2
; 8067: C5F458C0 VADDPS YMM0, YMM1, YMM0
; 806B: 488B4609 MOV RAX, [RSI+9]
; 806F: C4A17C11447801 VMOVUPS [RAX+R15*2+1], YMM0
; 8076: 4C8D7A20 LEA R15, [RDX+32]
; 807A: 488B4609 MOV RAX, [RSI+9]
; 807E: C4A17C104C7801 VMOVUPS YMM1, [RAX+R15*2+1]
; 8085: 488D4B20 LEA RCX, [RBX+32]
; 8089: 498B4009 MOV RAX, [R8+9]
; 808D: C5FC10444801 VMOVUPS YMM0, [RAX+RCX*2+1]
; 8093: C5FC59C2 VMULPS YMM0, YMM0, YMM2
; 8097: C5F458C0 VADDPS YMM0, YMM1, YMM0
; 809B: 488B4609 MOV RAX, [RSI+9]
; 809F: C4A17C11447801 VMOVUPS [RAX+R15*2+1], YMM0
; 80A6: 4C8D7A30 LEA R15, [RDX+48]
; 80AA: 488B4609 MOV RAX, [RSI+9]
; 80AE: C4A17C104C7801 VMOVUPS YMM1, [RAX+R15*2+1]
; 80B5: 488D4B30 LEA RCX, [RBX+48]
; 80B9: 498B4009 MOV RAX, [R8+9]
; 80BD: C5FC10444801 VMOVUPS YMM0, [RAX+RCX*2+1]
; 80C3: C5FC59C2 VMULPS YMM0, YMM0, YMM2
; 80C7: C5F458C0 VADDPS YMM0, YMM1, YMM0
; 80CB: 488B4609 MOV RAX, [RSI+9]
; 80CF: C4A17C11447801 VMOVUPS [RAX+R15*2+1], YMM0
; 80D6: 488D4A40 LEA RCX, [RDX+64]
; 80DA: 488BD1 MOV RDX, RCX
; 80DD: 488D4B40 LEA RCX, [RBX+64]
; 80E1: 488BD9 MOV RBX, RCX
; 80E4: 4983C140 ADD R9, 64
; 80E8: L10: 840425F8FF0120 TEST AL, [#x2001FFF8] ; safepoint
; 80EF: 498D4BC0 LEA RCX, [R11-64]
; 80F3: 4939C9 CMP R9, RCX
; 80F6: 0F8E24FFFFFF JLE L9
; 80FC: EB3A JMP L12
; 80FE: 6690 NOP
; 8100: L11: 488B4609 MOV RAX, [RSI+9]
; 8104: C5FC104C5001 VMOVUPS YMM1, [RAX+RDX*2+1]
; 810A: 498B4009 MOV RAX, [R8+9]
; 810E: C5FC10445801 VMOVUPS YMM0, [RAX+RBX*2+1]
; 8114: C5FC59C2 VMULPS YMM0, YMM0, YMM2
; 8118: C5F458C0 VADDPS YMM0, YMM1, YMM0
; 811C: 488B4609 MOV RAX, [RSI+9]
; 8120: C5FC11445001 VMOVUPS [RAX+RDX*2+1], YMM0
; 8126: 488D4A10 LEA RCX, [RDX+16]
; 812A: 488BD1 MOV RDX, RCX
; 812D: 488D4B10 LEA RCX, [RBX+16]
; 8131: 488BD9 MOV RBX, RCX
; 8134: 4983C110 ADD R9, 16
; 8138: L12: 840425F8FF0120 TEST AL, [#x2001FFF8] ; safepoint
; 813F: 498D4BF0 LEA RCX, [R11-16]
; 8143: 4939C9 CMP R9, RCX
; 8146: 7EB8 JLE L11
; 8148: 498BCB MOV RCX, R11
; 814B: 4C29C9 SUB RCX, R9
; 814E: EB32 JMP L14
; 8150: L13: 4883C1FE ADD RCX, -2
; 8154: 498B4009 MOV RAX, [R8+9]
; 8158: F30F104C5801 MOVSS XMM1, [RAX+RBX*2+1]
; 815E: F30F59CB MULSS XMM1, XMM3
; 8162: 488B4609 MOV RAX, [RSI+9]
; 8166: F30F10545001 MOVSS XMM2, [RAX+RDX*2+1]
; 816C: F30F58D1 ADDSS XMM2, XMM1
; 8170: 488B4609 MOV RAX, [RSI+9]
; 8174: F30F11545001 MOVSS [RAX+RDX*2+1], XMM2
; 817A: 4883C202 ADD RDX, 2
; 817E: 4883C302 ADD RBX, 2
; 8182: L14: 840425F8FF0120 TEST AL, [#x2001FFF8] ; safepoint
; 8189: 4885C9 TEST RCX, RCX
; 818C: 7FC2 JNLE L13
; 818E: 4883C702 ADD RDI, 2
; 8192: L15: 840425F8FF0120 TEST AL, [#x2001FFF8] ; safepoint
; 8199: 483B7DD0 CMP RDI, [RBP-48]
; 819D: 0F8C2DFEFFFF JL L8
; 81A3: 4983C202 ADD R10, 2
; 81A7: L16: 840425F8FF0120 TEST AL, [#x2001FFF8] ; safepoint
; 81AE: 4C3B55E0 CMP R10, [RBP-32]
; 81B2: 0F8C08FEFFFF JL L7
; 81B8: 498BCE MOV RCX, R14
; 81BB: 4881C100010000 ADD RCX, 256
; 81C2: 4C8BF1 MOV R14, RCX
; 81C5: L17: 840425F8FF0120 TEST AL, [#x2001FFF8] ; safepoint
; 81CC: 498BC6 MOV RAX, R14
; 81CF: 48C1E83E SHR RAX, 62
; 81D3: 0F8497FDFFFF JEQ L5
; 81D9: L18: 488B4DD8 MOV RCX, [RBP-40]
; 81DD: 4883C110 ADD RCX, 16
; 81E1: 48894DD8 MOV [RBP-40], RCX
; 81E5: L19: 840425F8FF0120 TEST AL, [#x2001FFF8] ; safepoint
; 81EC: 488B45D8 MOV RAX, [RBP-40]
; 81F0: 48C1E83E SHR RAX, 62
; 81F4: 0F8426FDFFFF JEQ L3
; 81FA: L20: 488BD6 MOV RDX, RSI
; 81FD: 488BE5 MOV RSP, RBP
; 8200: F8 CLC
; 8201: 5D POP RBP
; 8202: C3 RET
; 8203: L21: 4C8B5DE8 MOV R11, [RBP-24]
; 8207: E99DFDFFFF JMP L6
; 820C: L22: 488B45F0 MOV RAX, [RBP-16]
; 8210: 488945D0 MOV [RBP-48], RAX
; 8214: E944FDFFFF JMP L4
; 8219: L23: 6A20 PUSH 32
; 821B: FF142538050220 CALL [#x20020538] ; #x21A205B0: LIST-ALLOC-TRAMP
; 8222: 5A POP RDX
; 8223: E977FCFFFF JMP L2
@csrhodes
Copy link

This is definitely worth reporting to the mailing list, or as a bug on launchpad.

:fill-pointer on a non-vector does not make sense. The presence of :fill-pointer nil, even though it looks like it's the same as not providing :fill-pointer at all, inhibits one of SBCL's make-array transformations. That inhibition probably does something to the downstream type derivation or representation selection -- it might be something that we can't directly change (e.g. memory alignment) but it might be that we're doing something actually wrong somewhere else.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment