-
-
Save win0err/a613d6194c9d4fde7b821e64836faf42 to your computer and use it in GitHub Desktop.
Умножение матрицы на вектор, ассемблерная версия с SIMD
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import ( | |
"fmt" | |
"math" | |
"time" | |
) | |
// V4 is a 4-component single-precision vector (x, y, z, w); it occupies
// 16 bytes, which is exactly one 128-bit SSE register.
type V4 [4]float32

// M4 is a 4x4 single-precision matrix stored as 16 consecutive floats.
//
// multiply combines it with a vector v as
//
//	out[j] = v[0]*m[j] + v[1]*m[4+j] + v[2]*m[8+j] + v[3]*m[12+j]
//
// i.e. each group of four consecutive elements is scaled by one component
// of the vector.
type M4 [16]float32
const (
	// numVectors is the number of V4 elements processed per benchmark run
	// (8 Mi vectors, 16 bytes each, i.e. 128 MiB of data).
	numVectors = 8 * 1024 * 1024
)
func multiply(data []V4, m M4) | |
// benchmark calls f 100 times, timing each call individually, and prints the
// shortest observed run to standard output as whole milliseconds followed by
// "ms" and a newline. Only the minimum is reported; the other runs are
// discarded.
func benchmark(f func()) {
	const runs = 100

	// Start from the largest representable value so the first run always
	// becomes the initial best.
	best := math.MaxFloat64
	for i := 0; i < runs; i++ {
		start := time.Now()
		f()
		if elapsed := time.Since(start).Seconds(); elapsed < best {
			best = elapsed
		}
	}

	// best is in seconds; scale to milliseconds for display.
	fmt.Printf("%.0fms\n", best*1000)
}
func main() { | |
data := make([]V4, numVectors) | |
for i := range data { | |
data[i] = V4{1, 2, 3, 4} | |
} | |
m := M4{ | |
1, 2, 3, 4, | |
5, 6, 7, 8, | |
9, 10, 11, 12, | |
13, 14, 15, 16, | |
} | |
benchmark(func() { multiply(data, m) }) | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include "textflag.h" | |
// func multiply(data []V4, m M4)
//
// Replaces every vector in data, in place, with
//
//	data[i] = x*m[0:4] + y*m[4:8] + z*m[8:12] + w*m[12:16]
//
// where (x, y, z, w) is the previous value of data[i].
//
// Argument layout relative to FP (24-byte slice header followed by the
// 64-byte matrix, 88 bytes in total, no results):
//
//	+0   data slice ptr
//	+8   data slice len
//	+16  data slice cap
//	+24  m[0]  .. m[3]
//	+40  m[4]  .. m[7]
//	+56  m[8]  .. m[11]
//	+72  m[12] .. m[15]
//
// The result is stored with MOVNTPS, which requires a 16-byte aligned
// destination, so &data[0] must be 16-byte aligned.
// NOTE(review): the Go type [4]float32 only guarantees 4-byte alignment;
// confirm every caller passes suitably aligned memory.
TEXT ·multiply(SB), NOSPLIT, $0-88
	// CX = pointer to the current vector (starts at &data[0])
	MOVQ	data_base+0(FP), CX
	// SI = len(data)
	MOVQ	data_len+8(FP), SI
	// AX = index into data
	MOVQ	$0, AX
	// return early if the slice is empty
	CMPQ	AX, SI
	JE	END

	// Load the matrix into 128-bit XMM registers once, outside the loop.
	// X0 = [m[0], m[1], m[2], m[3]]
	MOVUPS	m_0+24(FP), X0
	// X1 = [m[4], m[5], m[6], m[7]]
	MOVUPS	m_4+40(FP), X1
	// X2 = [m[8], m[9], m[10], m[11]]
	MOVUPS	m_8+56(FP), X2
	// X3 = [m[12], m[13], m[14], m[15]]
	MOVUPS	m_12+72(FP), X3

LOOP:
	// Load each component of the vector into the low lane of its own
	// register.
	// X4 low lane = data[i][0] (x)
	MOVSS	0(CX), X4
	// X5 low lane = data[i][1] (y)
	MOVSS	4(CX), X5
	// X6 low lane = data[i][2] (z)
	MOVSS	8(CX), X6
	// X7 low lane = data[i][3] (w)
	MOVSS	12(CX), X7

	// Broadcast each vector component from the low lane to all four lanes.
	// X4 = [x, x, x, x]
	SHUFPS	$0, X4, X4
	// X5 = [y, y, y, y]
	SHUFPS	$0, X5, X5
	// X6 = [z, z, z, z]
	SHUFPS	$0, X6, X6
	// X7 = [w, w, w, w]
	SHUFPS	$0, X7, X7

	// X4 = [m[0], m[1], m[2], m[3]] .* [x, x, x, x]
	MULPS	X0, X4
	// X5 = [m[4], m[5], m[6], m[7]] .* [y, y, y, y]
	MULPS	X1, X5
	// X6 = [m[8], m[9], m[10], m[11]] .* [z, z, z, z]
	MULPS	X2, X6
	// X7 = [m[12], m[13], m[14], m[15]] .* [w, w, w, w]
	MULPS	X3, X7

	// Sum the four partial products into X4.
	// X4 = X4 + X5
	ADDPS	X5, X4
	// X4 = X4 + X6
	ADDPS	X6, X4
	// X4 = X4 + X7
	ADDPS	X7, X4

	// data[i] = X4, written with a non-temporal store
	MOVNTPS	X4, 0(CX)

	// advance the pointer to the next 16-byte vector
	ADDQ	$16, CX
	// i++
	INCQ	AX
	// keep looping while i < len(data)
	CMPQ	AX, SI
	JLT	LOOP

END:
	// Since we use non-temporal writes (MOVNTPS), make sure all writes are
	// visible before we leave the function. This is also reached on the
	// empty-slice path, where it has nothing to order.
	SFENCE
	RET
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
go build -o process-vectors-simd && ./process-vectors-simd
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment