Create a gist now

Instantly share code, notes, and snippets.

Embed
ARM64 benchmark
BenchmarkAmove-4 20000000 102 ns/op 1881.62 MB/s
BenchmarkBmove-4 50000000 34.5 ns/op 5563.29 MB/s
package simd
// amove copies the first count bytes of src into dst. It is implemented in
// ARM64 assembly using NEON VLD1/VST1 multi-register loads and stores.
//
// The assembly reads only the slice base pointers and count; the slice
// lengths are not consulted, so the caller must make sure both slices hold
// at least count bytes.
//
//go:noescape
func amove(dst, src []byte, count uint64)
// bmove copies the first count bytes of src into dst. It is implemented in
// ARM64 assembly using general-purpose LDP/STP register-pair loads and stores.
//
// The assembly reads only the slice base pointers and count; the slice
// lengths are not consulted, so the caller must make sure both slices hold
// at least count bytes.
//
//go:noescape
func bmove(dst, src []byte, count uint64)
#include "textflag.h"
// func amove(dst, src []byte, count uint64)
//
// Copies exactly count bytes from src to dst using NEON VLD1/VST1.
//   R0 = dst pointer, R1 = src pointer, R2 = bytes remaining.
// The bulk of the copy runs in 256-byte passes; what is left is finished in
// 64-byte and then single-byte steps so that no byte beyond count is read or
// written (the slice lengths themselves are not checked here).
// Frame: no locals, 56 bytes of arguments (two 24-byte slice headers + uint64).
TEXT ·amove(SB),NOSPLIT,$0-56
	MOVD dst_base+0(FP), R0
	MOVD src_base+24(FP), R1
	MOVD count+48(FP), R2
	PRFM (R1), PLDL1KEEP
	// Skip the unrolled loop entirely when fewer than 256 bytes are requested.
	CMP $256, R2
	BLO amove_tail64
amove_loop256:
	VLD1.P 64(R1), [V0.B16, V1.B16, V2.B16, V3.B16]
	VLD1.P 64(R1), [V4.B16, V5.B16, V6.B16, V7.B16]
	VLD1.P 64(R1), [V8.B16, V9.B16, V10.B16, V11.B16]
	VLD1.P 64(R1), [V12.B16, V13.B16, V14.B16, V15.B16]
	VST1.P [V0.B16, V1.B16, V2.B16, V3.B16], 64(R0)
	VST1.P [V4.B16, V5.B16, V6.B16, V7.B16], 64(R0)
	VST1.P [V8.B16, V9.B16, V10.B16, V11.B16], 64(R0)
	VST1.P [V12.B16, V13.B16, V14.B16, V15.B16], 64(R0)
	SUB $256, R2, R2
	CMP $256, R2
	BHS amove_loop256
amove_tail64:
	// Remaining count is < 256: move whole 64-byte groups.
	CMP $64, R2
	BLO amove_tail1
	VLD1.P 64(R1), [V0.B16, V1.B16, V2.B16, V3.B16]
	VST1.P [V0.B16, V1.B16, V2.B16, V3.B16], 64(R0)
	SUB $64, R2, R2
	B amove_tail64
amove_tail1:
	// Remaining count is < 64: finish one byte at a time.
	CBZ R2, amove_done
	MOVBU.P 1(R1), R3
	MOVB.P R3, 1(R0)
	SUB $1, R2, R2
	B amove_tail1
amove_done:
	RET
// func bmove(dst, src []byte, count uint64)
//
// Copies exactly count bytes from src to dst using LDP/STP register pairs.
//   R0 = dst pointer, R1 = src pointer, R2 = bytes remaining.
// The main loop moves 192 bytes per pass as two 96-byte halves held in
// R4-R15, which keeps the routine clear of R18 (platform register) and
// R27 (assembler temporary). The remainder is finished in 16-byte and then
// single-byte steps so that no byte beyond count is read or written (the
// slice lengths themselves are not checked here).
// Overlapping dst and src are not supported.
// Frame: no locals, 56 bytes of arguments (two 24-byte slice headers + uint64).
TEXT ·bmove(SB),NOSPLIT,$0-56
	MOVD dst_base+0(FP), R0
	MOVD src_base+24(FP), R1
	MOVD count+48(FP), R2
	PRFM (R1), PLDL1KEEP
	// Skip the unrolled loop entirely when fewer than 192 bytes are requested.
	CMP $192, R2
	BLO bmove_tail16
bmove_loop192:
	// Bytes 0..95 of this pass.
	LDP (R1), (R4, R5)
	LDP 16(R1), (R6, R7)
	LDP 32(R1), (R8, R9)
	LDP 48(R1), (R10, R11)
	LDP 64(R1), (R12, R13)
	LDP 80(R1), (R14, R15)
	STP (R4, R5), (R0)
	STP (R6, R7), 16(R0)
	STP (R8, R9), 32(R0)
	STP (R10, R11), 48(R0)
	STP (R12, R13), 64(R0)
	STP (R14, R15), 80(R0)
	// Bytes 96..191 of this pass.
	LDP 96(R1), (R4, R5)
	LDP 112(R1), (R6, R7)
	LDP 128(R1), (R8, R9)
	LDP 144(R1), (R10, R11)
	LDP 160(R1), (R12, R13)
	LDP 176(R1), (R14, R15)
	STP (R4, R5), 96(R0)
	STP (R6, R7), 112(R0)
	STP (R8, R9), 128(R0)
	STP (R10, R11), 144(R0)
	STP (R12, R13), 160(R0)
	STP (R14, R15), 176(R0)
	// Advance both pointers by the full 192 bytes that were just moved, in
	// step with the amount subtracted from the remaining count.
	ADD $192, R1, R1
	ADD $192, R0, R0
	SUB $192, R2, R2
	CMP $192, R2
	BHS bmove_loop192
bmove_tail16:
	// Remaining count is < 192: move whole 16-byte pairs.
	CMP $16, R2
	BLO bmove_tail1
	LDP.P 16(R1), (R4, R5)
	STP.P (R4, R5), 16(R0)
	SUB $16, R2, R2
	B bmove_tail16
bmove_tail1:
	// Remaining count is < 16: finish one byte at a time.
	CBZ R2, bmove_done
	MOVBU.P 1(R1), R4
	MOVB.P R4, 1(R0)
	SUB $1, R2, R2
	B bmove_tail1
bmove_done:
	RET
package simd
import (
"crypto/rand"
"testing"
)
// bulk scales the buffers used by the tests and benchmarks in this file:
// every buffer is 64*bulk bytes long.
const bulk = 3
// BenchmarkAmove measures amove on a pair of 64*bulk-byte buffers and
// reports throughput via SetBytes.
func BenchmarkAmove(b *testing.B) {
	const size = 64 * bulk

	var (
		out = make([]byte, size)
		in  = make([]byte, size)
	)

	b.SetBytes(size)
	b.ResetTimer() // exclude buffer allocation from the measurement
	for left := b.N; left > 0; left-- {
		amove(out, in, size)
	}
}
// BenchmarkBmove measures bmove on a pair of 64*bulk-byte buffers and
// reports throughput via SetBytes.
func BenchmarkBmove(b *testing.B) {
	const size = 64 * bulk

	var (
		out = make([]byte, size)
		in  = make([]byte, size)
	)

	b.SetBytes(size)
	b.ResetTimer() // exclude buffer allocation from the measurement
	for left := b.N; left > 0; left-- {
		bmove(out, in, size)
	}
}
// TestAmove fills a 64*bulk-byte source with random data, copies it with
// amove, and verifies that the destination matches byte for byte.
func TestAmove(t *testing.T) {
	dst := make([]byte, 64*bulk)
	chk := make([]byte, 64*bulk)
	src := make([]byte, 64*bulk)
	if _, err := rand.Read(src); err != nil {
		t.Fatalf("rand.Read: %v", err)
	}
	// Compare against a snapshot taken before the call so the check does not
	// depend on src being left untouched.
	copy(chk, src)
	amove(dst, src, 64*bulk)
	for i, c := range chk {
		if dst[i] != c {
			// Report only the first mismatch.
			t.Fatalf("dst[%d]%x != src[%d]%x", i, dst[i], i, c)
		}
	}
}
// TestBmove fills a 64*bulk-byte source with random data, copies it with
// bmove, and verifies that the destination matches byte for byte.
func TestBmove(t *testing.T) {
	dst := make([]byte, 64*bulk)
	chk := make([]byte, 64*bulk)
	src := make([]byte, 64*bulk)
	if _, err := rand.Read(src); err != nil {
		t.Fatalf("rand.Read: %v", err)
	}
	// Compare against a snapshot taken before the call so the check does not
	// depend on src being left untouched.
	copy(chk, src)
	bmove(dst, src, 64*bulk)
	for i, c := range chk {
		if dst[i] != c {
			// Report only the first mismatch.
			t.Fatalf("dst[%d]%x != src[%d]%x", i, dst[i], i, c)
		}
	}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment