Skip to content

Instantly share code, notes, and snippets.

@mayerrobert
Last active July 22, 2023 21:40
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mayerrobert/913b4c26103c614f9517360a4f00286a to your computer and use it in GitHub Desktop.
Save mayerrobert/913b4c26103c614f9517360a4f00286a to your computer and use it in GitHub Desktop.
Matrix multiplication optimization experiments with SB-SIMD
;;;; Matrix multiplication optimization experiments
;;;
;;; This file contains functions to multiply two matrices written in Common Lisp.
;;;
;;; DISCLAIMER: while this code has some tests it might contains bugs,
;;; I made it to experiment with SBCL's SIMD support.
;;;
;;; All functions use the naive nested loop algorithm which is O(n^3).
;;; This file contains 5 functions with various optimizations
;;; that should be portable Common Lisp and 5 functions that use sbcl's SB-SIMD.
;;; (the last SIMD function 'matrix-multiply-simd-unrolled-blocked2' is not
;;; that useful, though, leaving 4 SIMD functions)
;;;
;;; All 10 functions should produce the same result, the multiplication of two matrices.
;;;
;;; Execution times with sbcl-2.3.3 for multiplying
;;; two single-float (1000x1000) x (1000x1000) matrices range from:
;;;
;;; - 1.6 seconds for 'matrix-multiply' through
;;; - 0.73 seconds for 'matrix-multiply-unrolled-blocked' (fastest portable version) down to
;;; - 0.125 seconds for 'matrix-multiply-simd-unrolled-blocked'.
;;;
;;; and for multiplying two single-float (5000x2000) x (2000x5000) matrices range from:
;;;
;;; - 85 seconds for 'matrix-multiply' through
;;; - 39 seconds for 'matrix-multiply-unrolled-blocked' (fastest portable version) down to
;;; - 6.4 seconds for 'matrix-multiply-simd-unrolled-blocked'.
;;;
;;; Optimizations include loop unrolling, cache blocking and SIMD instructions.
;;;
;;;
;;; a is an m x p matrix, b is an p x n matrix, result is an m x n matrix.
;;;
;;; matrix-multiplication ::= for k=1..p, i=1..m, j=1..n
;;; result(i,j) := sum(a(i,k) * b(k,j))
;;;
;;;
;;; Run with
;;; sbcl --script mmult-simd.lisp
;;;
;;; To use all provided functions you need sbcl-x64 2.2.6+
;;; and a CPU that supports the selected instruction set.
;;;
;;; Other Common Lisp implementations will only be able to use the 5 portable functions.
;;;
;;; E.g. on abcl run with:
;;; C:\> abcl --batch --eval "(compile-file \"mmult-simd.lisp\")"
;;; C:\> abcl --batch --load mmult-simd.abcl
;;; Using iblock=16, jblock=1536, iblock x jblock x 4 x 3=288k
;;; MATRIX-MULTIPLY-UNROLLED-BLOCKED (1000x1000) by (1000x1000)
;;; 12.501 seconds real time
;;; 4000002 cons cells
;; comment out the next line for older sbcl versions w/o SB-SIMD
#+(and sbcl :X86-64) (push :use-simd *features*)
;; comment out the next line for older CPUs to use SSE
(push :avx256 *features*)
(declaim (optimize (speed 3) (safety 0)
))
#+use-simd
(require "sb-simd")
(deftype array-index ()
`(integer 0 (,array-dimension-limit)))
(deftype array-element ()
`(single-float))
;; Use SSE instructions
#+(and :use-simd (not :avx256))
(progn
(defconstant +simd-size+ (coerce 4 'array-index))
(deftype simd-vector ()
'(sb-simd-sse:f32.4))
(defmacro simd-vector (&rest args)
`(sb-simd-sse:f32.4 ,@args))
(defmacro simd-+ (vec val)
`(sb-simd-sse:f32.4+ ,vec ,val))
(defmacro simd-* (vec val)
`(sb-simd-sse:f32.4* ,vec ,val))
(defmacro simd-row-major-aref (vec &rest subscripts)
`(sb-simd-sse:f32.4-row-major-aref ,vec ,@subscripts))
(format t "Using SSE~%")
)
;; Use AVX256
#+(and :use-simd :avx256)
(progn
(defconstant +simd-size+ (coerce 8 'fixnum))
(deftype simd-vector ()
'(sb-simd-avx:f32.8))
(defmacro simd-vector (&rest args)
`(sb-simd-avx:f32.8 ,@args))
(defmacro simd-+ (vec val)
`(sb-simd-avx:f32.8+ ,vec ,val))
(defmacro simd-* (vec val)
`(sb-simd-avx:f32.8* ,vec ,val))
(defmacro simd-row-major-aref (vec &rest subscripts)
`(sb-simd-avx:f32.8-row-major-aref ,vec ,@subscripts))
(format t "Using AVX-256~%")
)
;; the blocksizes are tailored towards a 64k L1 cache,
;; and the blocksizes x 4 should probably be multiples of 64-byte cachelines
;; (4 bytes for single floats)
(defconstant +iblock-size+ 16)
(defconstant +jblock-size+ 256)
(defconstant +kblock-size+ 16) ; this is only used for matrix-multiply-simd-unrolled-blocked2 which is slower
(format t "Using iblock=~d, jblock=~d, iblock x jblock x 4 x 3=~dk~%"
+iblock-size+ +jblock-size+ (truncate (* +iblock-size+ +jblock-size+ 4 3) 1024))
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; various implementations of matrix multiplication
;; use loop order k/i/j so that the hot loop will read consecutive memory locations
(defun matrix-multiply (a b)
"Performs matrix multiplication of two arrays."
(declare (type (simple-array array-element 2) a b))
(assert (and (= (array-rank a) (array-rank b) 2)
(= (array-dimension a 1) (array-dimension b 0)))
(a b)
"Cannot multiply ~S by ~S." a b)
(let* ((m (array-dimension a 0))
(n (array-dimension b 1))
(p (array-dimension a 1))
(result (make-array (list m n) :element-type 'array-element
:initial-element 0.0 :adjustable nil :fill-pointer nil)))
(declare (array-index m n p))
(dotimes (k p)
(declare (array-index k))
(dotimes (i m)
(declare (array-index i))
(dotimes (j n)
(declare (array-index j))
(incf (aref result i j) (* (aref a i k) (aref b k j))))))
result))
;; simple optimizations:
;; lift (aref a i k) out of the hot loop
;; calculate addresses by incrementing the indices
(defun matrix-multiply-opt (a b)
"Performs matrix multiplication of two arrays."
(declare (type (simple-array array-element 2) a b))
(assert (and (= (array-rank a) (array-rank b) 2)
(= (array-dimension a 1) (array-dimension b 0)))
(a b)
"Cannot multiply ~S by ~S." a b)
(let* ((m (array-dimension a 0))
(n (array-dimension b 1))
(p (array-dimension a 1))
(result (make-array (list m n) :element-type 'array-element
:initial-element 0.0 :adjustable nil :fill-pointer nil)))
(declare (array-index m n p))
(dotimes (k p)
(declare (array-index k))
(dotimes (i m)
(declare (array-index i))
(let ((tmp (aref a i k))
(result-idx (array-row-major-index result i 0))
(b-idx (array-row-major-index b k 0)))
(declare (array-element tmp) (array-index result-idx b-idx))
;(dotimes (j n)
(loop repeat n do
(incf (row-major-aref result result-idx) (* tmp (row-major-aref b b-idx)))
(incf result-idx)
(incf b-idx)))))
result))
;; similar to matrix-multiply-opt plus unroll the hot loop by 4
(defun matrix-multiply-unrolled (a b)
"Performs matrix multiplication of two arrays."
(declare (type (simple-array array-element 2) a b))
(assert (and (= (array-rank a) (array-rank b) 2)
(= (array-dimension a 1) (array-dimension b 0)))
(a b)
"Cannot multiply ~S by ~S." a b)
(let* ((m (array-dimension a 0))
(n (array-dimension b 1))
(p (array-dimension a 1))
(result (make-array (list m n) :element-type 'array-element
:initial-element 0.0 :adjustable nil :fill-pointer nil)))
(declare (array-index m n p))
(dotimes (k p)
(declare (array-index k))
(dotimes (i m)
(declare (array-index i))
(let ((tmp (aref a i k))
(result-idx (array-row-major-index result i 0))
(b-idx (array-row-major-index b k 0)))
(declare (array-element tmp) (array-index result-idx b-idx))
(do ((j 0 (+ j 4)))
((> j (- n 4))
;(do ((j j (1+ j)))
; ((>= j n))
(loop repeat (- n j) do
(incf (row-major-aref result result-idx) (* tmp (row-major-aref b b-idx)))
(incf result-idx)
(incf b-idx)))
(incf (row-major-aref result result-idx) (* tmp (row-major-aref b b-idx)))
(incf (row-major-aref result (+ result-idx 1)) (* tmp (row-major-aref b (+ b-idx 1))))
(incf (row-major-aref result (+ result-idx 2)) (* tmp (row-major-aref b (+ b-idx 2))))
(incf (row-major-aref result (+ result-idx 3)) (* tmp (row-major-aref b (+ b-idx 3))))
(incf result-idx 4)
(incf b-idx 4)))))
result))
;; similar to matrix-multiply-opt but with cache blocking
(defun matrix-multiply-blocked (a b)
"Performs matrix multiplication of two arrays."
(declare (type (simple-array array-element 2) a b))
(assert (and (= (array-rank a) (array-rank b) 2)
(= (array-dimension a 1) (array-dimension b 0)))
(a b)
"Cannot multiply ~S by ~S." a b)
(let* ((m (array-dimension a 0))
(n (array-dimension b 1))
(p (array-dimension a 1))
(result (make-array (list m n) :element-type 'array-element
:initial-element 0.0 :adjustable nil :fill-pointer nil)))
(declare (array-index m n p))
(do ((iblock 0 (+ iblock +iblock-size+)))
((>= iblock m))
(let ((ilimit (min m (+ iblock +iblock-size+))))
(do ((jblock 0 (+ jblock +jblock-size+)))
((>= jblock n))
(let ((jlimit (min n (+ jblock +jblock-size+))))
(dotimes (k p)
(declare (array-index k))
(do ((i iblock (1+ i)))
((>= i ilimit))
(declare (array-index i))
(let ((tmp (aref a i k))
(result-idx (array-row-major-index result i 0))
(b-idx (array-row-major-index b k 0)))
(declare (array-element tmp) (array-index result-idx b-idx))
;(do ((j jblock (1+ j)))
; ((>= j (min n (+ jblock +jblock-size+))))
(loop repeat (- jlimit jblock) do
(incf (row-major-aref result result-idx) (* tmp (row-major-aref b b-idx)))
(incf result-idx)
(incf b-idx)))))))))
result))
;; similar to matrix-multiply-opt but with loop unrolling and cache blocking
(defun matrix-multiply-unrolled-blocked (a b)
"Performs matrix multiplication of two arrays."
(declare (type (simple-array array-element 2) a b))
(assert (and (= (array-rank a) (array-rank b) 2)
(= (array-dimension a 1) (array-dimension b 0)))
(a b)
"Cannot multiply ~S by ~S." a b)
(let* ((m (array-dimension a 0))
(n (array-dimension b 1))
(p (array-dimension a 1))
(result (make-array (list m n) :element-type 'array-element
:initial-element 0.0 :adjustable nil :fill-pointer nil)))
(declare (array-index m n p))
(do ((iblock 0 (+ iblock +iblock-size+)))
((>= iblock m))
(let ((ilimit (min m (+ iblock +iblock-size+))))
(do ((jblock 0 (+ jblock +jblock-size+)))
((>= jblock n))
(let ((jlimit (min n (+ jblock +jblock-size+))))
(dotimes (k p)
(declare (array-index k))
(do ((i iblock (1+ i)))
((>= i ilimit))
(declare (array-index i))
(let ((tmp (aref a i k))
(result-idx (array-row-major-index result i 0))
(b-idx (array-row-major-index b k 0)))
(declare (array-element tmp) (array-index result-idx b-idx))
(do ((j jblock (+ j 4)))
((> j (- jlimit 4))
;(do ((jj j (1+ jj)))
; ((>= jj jlimit))
(loop repeat (- jlimit j) do
(incf (row-major-aref result result-idx) (* tmp (row-major-aref b b-idx)))
(incf result-idx)
(incf b-idx)))
(incf (row-major-aref result result-idx) (* tmp (row-major-aref b b-idx)))
(incf (row-major-aref result (+ result-idx 1)) (* tmp (row-major-aref b (+ b-idx 1))))
(incf (row-major-aref result (+ result-idx 2)) (* tmp (row-major-aref b (+ b-idx 2))))
(incf (row-major-aref result (+ result-idx 3)) (* tmp (row-major-aref b (+ b-idx 3))))
(incf result-idx 4)
(incf b-idx 4)))))))))
result))
;; similar to matrix-multiply-opt except: use SIMD
#+use-simd
(defun matrix-multiply-simd (a b)
"Performs matrix multiplication of two arrays."
(declare (type (simple-array array-element 2) a b))
(assert (and (= (array-rank a) (array-rank b) 2)
(= (array-dimension a 1) (array-dimension b 0)))
(a b)
"Cannot multiply ~S by ~S." a b)
(let* ((m (array-dimension a 0))
(n (array-dimension b 1))
(p (array-dimension a 1))
(result (make-array (list m n) :element-type 'array-element
:initial-element 0.0 :adjustable nil :fill-pointer nil)))
(declare (array-index m n p))
(dotimes (k p)
(declare (array-index k))
(dotimes (i m)
(declare (array-index i))
(let ((vtmp (simd-vector (aref a i k)))
(tmp (aref a i k))
(result-idx (array-row-major-index result i 0))
(b-idx (array-row-major-index b k 0)))
(declare (simd-vector vtmp) (array-element tmp) (array-index result-idx b-idx))
(do ((j 0 (+ j +simd-size+)))
((> j (- n +simd-size+))
;(do ((j j (1+ j)))
; ((>= j n))
(loop repeat (- n j) do
(incf (row-major-aref result result-idx) (* tmp (row-major-aref b b-idx)))
(incf result-idx)
(incf b-idx)))
(setf #1=(simd-row-major-aref result result-idx)
(simd-+ #1# (simd-* (simd-row-major-aref b b-idx) vtmp)))
(incf result-idx +simd-size+)
(incf b-idx +simd-size+)))))
result))
;; similar to matrix-multiply-simd but with cache blocking
#+use-simd
(defun matrix-multiply-simd-blocked (a b)
"Performs matrix multiplication of two arrays."
(declare (type (simple-array array-element 2) a b))
(assert (and (= (array-rank a) (array-rank b) 2)
(= (array-dimension a 1) (array-dimension b 0)))
(a b)
"Cannot multiply ~S by ~S." a b)
(let* ((m (array-dimension a 0))
(n (array-dimension b 1))
(p (array-dimension a 1))
(result (make-array (list m n) :element-type 'array-element
:initial-element 0.0 :adjustable nil :fill-pointer nil)))
(declare (array-index m n p))
(do ((iblock 0 (+ iblock +iblock-size+)))
((>= iblock m))
(let ((ilimit (min m (+ iblock +iblock-size+))))
(do ((jblock 0 (+ jblock +jblock-size+)))
((>= jblock n))
(let ((jlimit (min n (+ jblock +jblock-size+))))
(dotimes (k p)
(declare (array-index k))
(do ((i iblock (1+ i)))
((>= i ilimit))
(declare (array-index i))
(let ((vtmp (simd-vector (aref a i k)))
(tmp (aref a i k))
(result-idx (array-row-major-index result i 0))
(b-idx (array-row-major-index b k 0)))
(declare (simd-vector vtmp) (array-element tmp) (array-index result-idx b-idx))
(do ((j jblock (+ j +simd-size+)))
((> j (- jlimit +simd-size+))
;(do ((j j (1+ j)))
; ((>= j jlimit))
(loop repeat (- jlimit j) do
(incf (row-major-aref result result-idx) (* tmp (row-major-aref b b-idx)))
(incf result-idx)
(incf b-idx)))
(setf #1=(simd-row-major-aref result result-idx)
(simd-+ #1# (simd-* (simd-row-major-aref b b-idx) vtmp)))
(incf result-idx +simd-size+)
(incf b-idx +simd-size+)))))))))
result))
;; similar to matrix-multiply-unrolled except: use SIMD
#+use-simd
(defun matrix-multiply-simd-unrolled (a b)
"Performs matrix multiplication of two arrays."
(declare (type (simple-array array-element 2) a b))
(assert (and (= (array-rank a) (array-rank b) 2)
(= (array-dimension a 1) (array-dimension b 0)))
(a b)
"Cannot multiply ~S by ~S." a b)
(let* ((m (array-dimension a 0))
(n (array-dimension b 1))
(p (array-dimension a 1))
(result (make-array (list m n) :element-type 'array-element
:initial-element 0.0 :adjustable nil :fill-pointer nil)))
(declare (array-index m n p))
(dotimes (k p)
(declare (array-index k))
(dotimes (i m)
(declare (array-index i))
(let ((tmp (aref a i k))
(result-idx (array-row-major-index result i 0))
(b-idx (array-row-major-index b k 0)))
(declare (array-element tmp) (array-index result-idx b-idx))
(do ((j 0 (+ j (* 4 +simd-size+))))
((> j (- n (* 4 +simd-size+)))
(do ((jj j (1+ jj)))
((>= jj n))
(incf (row-major-aref result result-idx) (* tmp (row-major-aref b b-idx)))
(incf result-idx)
(incf b-idx)))
(setf #1=(simd-row-major-aref result result-idx)
(simd-+ #1# (simd-* (simd-row-major-aref b b-idx) tmp)))
(setf #2=(simd-row-major-aref result (+ +simd-size+ result-idx))
(simd-+ #2# (simd-* (simd-row-major-aref b (+ +simd-size+ b-idx)) tmp)))
(setf #3=(simd-row-major-aref result (+ (* 2 +simd-size+) result-idx))
(simd-+ #3# (simd-* (simd-row-major-aref b (+ (* 2 +simd-size+) b-idx)) tmp)))
(setf #4=(simd-row-major-aref result (+ (* 3 +simd-size+) result-idx))
(simd-+ #4# (simd-* (simd-row-major-aref b (+ (* 3 +simd-size+) b-idx)) tmp)))
(incf result-idx (* 4 +simd-size+))
(incf b-idx (* 4 +simd-size+))))))
result))
;; similar to matrix-multiply-unrolled-blocked except: use SIMD
#+use-simd
(defun matrix-multiply-simd-unrolled-blocked (a b)
"Performs matrix multiplication of two arrays."
(declare (type (simple-array array-element 2) a b))
(assert (and (= (array-rank a) (array-rank b) 2)
(= (array-dimension a 1) (array-dimension b 0)))
(a b)
"Cannot multiply ~S by ~S." a b)
(let* ((m (array-dimension a 0))
(n (array-dimension b 1))
(p (array-dimension a 1))
(result (make-array (list m n) :element-type 'array-element
:initial-element 0.0 :adjustable nil :fill-pointer nil)))
(declare (array-index m n p))
(do ((iblock 0 (+ iblock +iblock-size+)))
((>= iblock m))
(let ((ilimit (min m (+ iblock +iblock-size+))))
(do ((jblock 0 (+ jblock +jblock-size+)))
((>= jblock n))
(let ((jlimit (min n (+ jblock +jblock-size+))))
(dotimes (k p)
(declare (array-index k))
(do ((i iblock (1+ i)))
((>= i ilimit))
(declare (array-index i))
(let (;(vtmp (simd-vector (aref a i k))) ;; xx
(tmp (aref a i k))
(result-idx (array-row-major-index result i 0))
(b-idx (array-row-major-index b k 0)))
(declare ;(simd-vector vtmp)
(array-element tmp) (array-index result-idx b-idx jlimit))
(do ((j jblock (+ j (* 4 +simd-size+))))
((> j (- jlimit (* 4 +simd-size+)))
(do ((jj j (+ jj +simd-size+)))
((> jj (- jlimit +simd-size+))
;(do ((jjj jj (1+ jjj)))
; ((>= jjj jlimit))
(loop repeat (- jlimit jj) do
(incf (row-major-aref result result-idx) (* tmp (row-major-aref b b-idx)))
(incf result-idx)
(incf b-idx)))
(setf #10=(simd-row-major-aref result result-idx)
(simd-+ #10# (simd-* (simd-row-major-aref b b-idx) tmp)))
(incf result-idx +simd-size+)
(incf b-idx +simd-size+)))
(setf #1=(simd-row-major-aref result result-idx)
(simd-+ #1# (simd-* (simd-row-major-aref b b-idx) tmp)))
(setf #2=(simd-row-major-aref result (+ +simd-size+ result-idx))
(simd-+ #2# (simd-* (simd-row-major-aref b (+ +simd-size+ b-idx)) tmp)))
(setf #3=(simd-row-major-aref result (+ (* 2 +simd-size+) result-idx))
(simd-+ #3# (simd-* (simd-row-major-aref b (+ (* 2 +simd-size+) b-idx)) tmp)))
(setf #4=(simd-row-major-aref result (+ (* 3 +simd-size+) result-idx))
(simd-+ #4# (simd-* (simd-row-major-aref b (+ (* 3 +simd-size+) b-idx)) tmp)))
(incf result-idx (* 4 +simd-size+))
(incf b-idx (* 4 +simd-size+))))))))))
result))
;; try a third level of blocking which is not really needed and makes things slower
#+use-simd
(defun matrix-multiply-simd-unrolled-blocked2 (a b)
"Performs matrix multiplication of two arrays."
(declare (type (simple-array array-element 2) a b))
(assert (and (= (array-rank a) (array-rank b) 2)
(= (array-dimension a 1) (array-dimension b 0)))
(a b)
"Cannot multiply ~S by ~S." a b)
(let* ((m (array-dimension a 0))
(n (array-dimension b 1))
(p (array-dimension a 1))
(result (make-array (list m n) :element-type 'array-element
:initial-element 0.0 :adjustable nil :fill-pointer nil)))
(declare (array-index m n p))
(do ((kblock 0 (+ kblock +kblock-size+)))
((>= kblock p))
(do ((iblock 0 (+ iblock +iblock-size+)))
((>= iblock m))
(do ((jblock 0 (+ jblock +jblock-size+)))
((>= jblock n))
(do ((k kblock (1+ k)))
((>= k (min p (+ kblock +kblock-size+))))
(declare (array-index k))
(do ((i iblock (1+ i)))
((>= i (min m (+ iblock +iblock-size+))))
(declare (array-index i))
(let ((tmp (aref a i k))
(result-idx (array-row-major-index result i 0))
(b-idx (array-row-major-index b k 0)))
(declare (array-element tmp) (array-index result-idx b-idx))
(do ((j jblock (+ j (* 4 +simd-size+))))
((> j (min (- n (* 4 +simd-size+)) (+ jblock +jblock-size+)))
(do ((jj j (1+ jj)))
((>= jj (min (- n +simd-size+) (+ jblock +jblock-size+)))
(do ((jjj jj (1+ jjj)))
((>= jjj (min n (+ jblock +jblock-size+))))
(incf (row-major-aref result result-idx) (* tmp (row-major-aref b b-idx)))
(incf result-idx)
(incf b-idx)))
(setf #10=(simd-row-major-aref result result-idx)
(simd-+ #10# (simd-* (simd-row-major-aref b b-idx) tmp)))
(incf result-idx +simd-size+)
(incf b-idx +simd-size+)))
(setf #1=(simd-row-major-aref result result-idx)
(simd-+ #1# (simd-* (simd-row-major-aref b b-idx) tmp)))
(setf #2=(simd-row-major-aref result (+ +simd-size+ result-idx))
(simd-+ #2# (simd-* (simd-row-major-aref b (+ +simd-size+ b-idx)) tmp)))
(setf #3=(simd-row-major-aref result (+ (* 2 +simd-size+) result-idx))
(simd-+ #3# (simd-* (simd-row-major-aref b (+ (* 2 +simd-size+) b-idx)) tmp)))
(setf #4=(simd-row-major-aref result (+ (* 3 +simd-size+) result-idx))
(simd-+ #4# (simd-* (simd-row-major-aref b (+ (* 3 +simd-size+) b-idx)) tmp)))
(incf result-idx (* 4 +simd-size+))
(incf b-idx (* 4 +simd-size+)))))))))
result))
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; tests
;; comment the next line to run a basic test
#+(or)
(let ((x (matrix-multiply (make-array '(2 3) :element-type 'array-element
:initial-contents '((1.0 2.0 3.0) (4.0 5.0 6.0)))
(make-array '(3 2) :element-type 'array-element
:initial-contents '((5.0 6.0) (7.0 8.0) (9.0 10.0))))))
;(print x)
(unless (equalp x #2A((46.0 52.0) (109.0 124.0)))
(error "d'oh")))
(defun random-array (x y)
(let ((a (make-array (list x y) :element-type 'array-element :adjustable nil :fill-pointer nil)))
(dotimes (i x)
(dotimes (j y)
(setf (aref a i j) (- 10.0 (random 20.0)))))
a))
(defun equal-ish (a b)
(let ((x (array-dimension a 0))
(y (array-dimension a 1)))
(dotimes (i x)
(dotimes (j y)
(unless (= (aref a i j) (aref b i j))
(return-from equal-ish nil)))))
(return-from equal-ish t))
;; comment the next line to run tests
#+(or)
(dolist (f '(matrix-multiply
matrix-multiply-opt
matrix-multiply-unrolled
matrix-multiply-blocked
matrix-multiply-unrolled-blocked
#+use-simd matrix-multiply-simd
#+use-simd matrix-multiply-simd-unrolled
#+use-simd matrix-multiply-simd-blocked
#+use-simd matrix-multiply-simd-unrolled-blocked))
(dolist (x '(15 16 17 #|1535 1536 1537|#))
(dolist (y '(1535 1536 1537))
;(format t "testing ~a (~dx~d) x (~dx~d)~%" f x y y x)
#1=(let* ((a (random-array x y))
(b (random-array y x))
(ref (matrix-multiply a b))
(tst (funcall f a b)))
(unless (equal-ish ref tst)
(format t "ERROR: ~a (~dx~d) x (~dx~d) produces wrong result~%" f x y y x)))))
(dotimes (xx 33)
(dotimes (yy 65)
(let ((x (1+ xx))
(y (1+ yy)))
#1#))))
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; timed runs
(defun arry (dimspec init)
"Create a simple-array and set its contents to 'init'."
(make-array dimspec
:element-type 'array-element
:initial-element init
:adjustable nil
:fill-pointer nil))
(defun mmult (func m p &optional (n m))
(format t "~a (~dx~d) by (~dx~d)~%" func m p p n)
(let ((a (arry (list m p) 1.0))
(b (arry (list p n) 2.0)))
(time (funcall (symbol-function func) a b))))
(mmult 'matrix-multiply 1000 1000) ; 1.566s
;(mmult 'matrix-multiply-opt 1000 1000) ; 0.786s
;(mmult 'matrix-multiply-unrolled 1000 1000) ; 0.714s
;(mmult 'matrix-multiply-blocked 1000 1000) ; 0.775s
;(mmult 'matrix-multiply-unrolled-blocked 1000 1000) ; 0.728s
#+use-simd
(progn
;(mmult 'matrix-multiply-simd 1000 1000) ; 0.172s
;(mmult 'matrix-multiply-simd-unrolled 1000 1000) ; 0.156s
;(mmult 'matrix-multiply-simd-blocked 1000 1000) ; 0.180s
(mmult 'matrix-multiply-simd-unrolled-blocked 1000 1000) ; 0.125s
;;;(mmult 'matrix-multiply-simd-unrolled-blocked2 1000 1000)
)
;(mmult 'matrix-multiply 5000 2000) ; 84.902s
;(mmult 'matrix-multiply-opt 5000 2000) ; 46.415s
;(mmult 'matrix-multiply-unrolled 5000 2000) ; 40.123s
;(mmult 'matrix-multiply-blocked 5000 2000) ; 42.022s
;(mmult 'matrix-multiply-unrolled-blocked 5000 2000) ; 38.968s
#+use-simd
(progn
;(mmult 'matrix-multiply-simd 5000 2000) ; 16.019s
;(mmult 'matrix-multiply-simd-unrolled 5000 2000) ; 15.519s
;(mmult 'matrix-multiply-simd-blocked 5000 2000) ; 9.618s
(mmult 'matrix-multiply-simd-unrolled-blocked 5000 2000) ; 6.358s
;;;(mmult 'matrix-multiply-simd-unrolled-blocked2 5000 2000)
)
;(mmult 'matrix-multiply 2000 5000) ; 36.055
;(mmult 'matrix-multiply-opt 2000 5000) ; 17.820s
;(mmult 'matrix-multiply-unrolled 2000 5000) ; 16.162s
;(mmult 'matrix-multiply-blocked 2000 5000) ; 17.853s
;(mmult 'matrix-multiply-unrolled-blocked 2000 5000) ; 15.578s
#+use-simd
(progn
;(mmult 'matrix-multiply-simd 2000 5000) ; 6.409s
;(mmult 'matrix-multiply-simd-unrolled 2000 5000) ; 6.523s
;(mmult 'matrix-multiply-simd-blocked 2000 5000) ; 3.735s
;(mmult 'matrix-multiply-simd-unrolled-blocked 2000 5000) ; 2.966s
;;;(mmult 'matrix-multiply-simd-unrolled-blocked2 2000 5000)
)
;; time should increase with N^3
;(dolist (x '(100 200 500 1000 2000 5000 10000)) (mmult 'matrix-multiply-simd-unrolled-blocked x 1000))
;; time should increase with N
;(dolist (x '(100 200 500 1000 2000 5000 10000)) (mmult 'matrix-multiply-simd-unrolled-blocked 1000 x))
;#+(and sbcl :X86-64)
;(require :sb-sprof)
;
;#+(and sbcl :X86-64)
;(let ((a (arry '(3000 1000) 1.0))
; (b (arry '(1000 3000) 2.0)))
;
; (sb-sprof:with-profiling (:mode :cpu :loop nil :max-samples 1000 :report :flat :sample-interval 0.1)
; (matrix-multiply-unrolled-blocked a b)))
;(disassemble 'matrix-multiply-opt)
;(disassemble 'matrix-multiply-unrolled)
;(disassemble 'matrix-multiply-blocked)
;(disassemble 'matrix-multiply-unrolled-blocked)
;(disassemble 'matrix-multiply-simd-unrolled-blocked)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment