Skip to content

Instantly share code, notes, and snippets.

@win0err
Last active February 18, 2018 18:40
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save win0err/5de6adb530c70d9f5da05078b272ddd8 to your computer and use it in GitHub Desktop.
Save win0err/5de6adb530c70d9f5da05078b272ddd8 to your computer and use it in GitHub Desktop.
Умножение матрицы на вектор, ассемблерная версия
package main
import (
"fmt"
"math"
"time"
)
// V4 is a vector of four float32 components.
type V4 [4]float32
// M4 holds sixteen float32 values; main writes its literal as four lines of
// four values, i.e. a 4x4 matrix stored as one flat array.
type M4 [16]float32
const (
// numVectors is the number of V4 elements main allocates and passes to
// multiply on every benchmark run (8 * 1024 * 1024 = 8,388,608).
numVectors = 8 * 1024 * 1024
)
// multiply overwrites every vector v in data, in place, with the product of
// v and m:
//
//	out[j] = v[0]*m[j] + v[1]*m[4+j] + v[2]*m[8+j] + v[3]*m[12+j]
//
// The declaration has no Go body; the implementation is the assembly routine
// TEXT ·multiply(SB). A zero-length data slice is a no-op.
func multiply(data []V4, m M4)
// benchmark calls f 100 times, timing each call with the wall clock, and
// prints the duration of the fastest call in whole milliseconds followed by
// "ms" and a newline.
func benchmark(f func()) {
	const runs = 100

	fastest := math.MaxFloat64
	for run := 0; run < runs; run++ {
		began := time.Now()
		f()
		// Keep only the minimum: it is the run least disturbed by noise.
		if secs := time.Since(began).Seconds(); secs < fastest {
			fastest = secs
		}
	}
	fmt.Printf("%.0fms\n", fastest*1000)
}
func main() {
data := make([]V4, numVectors)
for i := range data {
data[i] = V4{1, 2, 3, 4}
}
m := M4{
1, 2, 3, 4,
5, 6, 7, 8,
9, 10, 11, 12,
13, 14, 15, 16,
}
benchmark(func() { multiply(data, m) })
}
#include "textflag.h"
// func multiply(data []V4, m M4)
//
// Replaces every 4 x float32 vector v in data, in place, with
//
//	out[j] = v[0]*m[j] + v[1]*m[4+j] + v[2]*m[8+j] + v[3]*m[12+j]   (j = 0..3)
//
// Argument frame (88 bytes, no results):
//
//	data_base+0(FP)   pointer to the first vector
//	data_len+8(FP)    number of vectors
//	data_cap+16(FP)   unused
//	m_0+24(FP) .. m_15+84(FP)   the sixteen matrix elements, 4 bytes each
//
// Registers: CX = address of the current vector, SI = len(data),
// AX = loop index, X0-X3 = components of the input vector,
// X4-X7 = scratch for the four products of one output component.
//
// No stack frame is used ($0) and the routine calls nothing, so NOSPLIT
// is safe.
TEXT ·multiply(SB),NOSPLIT,$0-88
	// data ptr
	MOVQ data_base+0(FP), CX
	// data len
	MOVQ data_len+8(FP), SI
	// index into data
	MOVQ $0, AX
	// return early if zero length
	CMPQ AX, SI
	JE END
LOOP:
	// Load the whole input vector before any store, so that writing the
	// results back over the same memory cannot corrupt later components.
	MOVSS 0(CX), X0
	MOVSS 4(CX), X1
	MOVSS 8(CX), X2
	MOVSS 12(CX), X3
	// out[0] = v0*m[0] + v1*m[4] + v2*m[8] + v3*m[12]
	MOVSS m_0+24(FP), X4
	MOVSS m_4+40(FP), X5
	MOVSS m_8+56(FP), X6
	MOVSS m_12+72(FP), X7
	MULSS X0, X4
	MULSS X1, X5
	MULSS X2, X6
	MULSS X3, X7
	ADDSS X5, X4
	ADDSS X6, X4
	ADDSS X7, X4
	MOVSS X4, 0(CX)
	// out[1] = v0*m[1] + v1*m[5] + v2*m[9] + v3*m[13]
	MOVSS m_1+28(FP), X4
	MOVSS m_5+44(FP), X5
	MOVSS m_9+60(FP), X6
	MOVSS m_13+76(FP), X7
	MULSS X0, X4
	MULSS X1, X5
	MULSS X2, X6
	MULSS X3, X7
	ADDSS X5, X4
	ADDSS X6, X4
	ADDSS X7, X4
	MOVSS X4, 4(CX)
	// out[2] = v0*m[2] + v1*m[6] + v2*m[10] + v3*m[14]
	MOVSS m_2+32(FP), X4
	MOVSS m_6+48(FP), X5
	MOVSS m_10+64(FP), X6
	MOVSS m_14+80(FP), X7
	MULSS X0, X4
	MULSS X1, X5
	MULSS X2, X6
	MULSS X3, X7
	ADDSS X5, X4
	ADDSS X6, X4
	ADDSS X7, X4
	MOVSS X4, 8(CX)
	// out[3] = v0*m[3] + v1*m[7] + v2*m[11] + v3*m[15]
	MOVSS m_3+36(FP), X4
	MOVSS m_7+52(FP), X5
	MOVSS m_11+68(FP), X6
	MOVSS m_15+84(FP), X7
	MULSS X0, X4
	MULSS X1, X5
	MULSS X2, X6
	MULSS X3, X7
	ADDSS X5, X4
	ADDSS X6, X4
	ADDSS X7, X4
	MOVSS X4, 12(CX)
	// data++ (each vector is 16 bytes)
	ADDQ $16, CX
	// i++
	INCQ AX
	// loop while i < len(data)
	CMPQ AX, SI
	JLT LOOP
END:
	RET
go build -o process-vectors-plain && ./process-vectors-plain
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment