Skip to content

Instantly share code, notes, and snippets.

@mengzhuo
Created December 20, 2017 08:17
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mengzhuo/bb3769d42097eec6f3fce12895e441b9 to your computer and use it in GitHub Desktop.
Save mengzhuo/bb3769d42097eec6f3fce12895e441b9 to your computer and use it in GitHub Desktop.
ARM64 benchmark
│BenchmarkAmove-4 20000000 102 ns/op 1881.62 MB/s
│BenchmarkBmove-4 50000000 34.5 ns/op 5563.29 MB/s
package simd
// amove copies bytes from src to dst using the NEON (VLD1/VST1) routine
// written in this package's ARM64 assembly; count is the requested number of
// bytes. The declaration has no Go body.
//
// The assembly reads only the two base pointers and count: it never looks at
// len(dst) or len(src), so the caller is responsible for making both buffers
// large enough for everything the routine touches.
//
//go:noescape
func amove(dst, src []byte, count uint64)
// bmove copies bytes from src to dst using the general-purpose-register
// (LDP/STP) routine written in this package's ARM64 assembly; count is the
// requested number of bytes. The declaration has no Go body.
//
// The assembly reads only the two base pointers and count: it never looks at
// len(dst) or len(src), so the caller is responsible for making both buffers
// large enough for everything the routine touches.
//
//go:noescape
func bmove(dst, src []byte, count uint64)
#include "textflag.h"
// func amove(dst, src []byte, count uint64)
//
// Copies exactly count bytes from src to dst, front to back:
//   - 256 bytes per pass through sixteen NEON registers while >= 256 remain,
//   - then 64 bytes per pass while >= 64 remain,
//   - then one byte at a time for the final 0..63 bytes.
// Nothing is copied when count is 0. The slice lengths are not consulted;
// the caller must guarantee both buffers hold count bytes. Overlapping
// buffers are not supported.
//
// Frame: no locals, 56 bytes of arguments (two 24-byte slice headers + uint64).
TEXT ·amove(SB),NOSPLIT,$0-56
	MOVD	dst_base+0(FP), R0	// R0 = destination cursor
	MOVD	src_base+24(FP), R1	// R1 = source cursor
	MOVD	count+48(FP), R2	// R2 = bytes still to copy (unsigned)
	PRFM	(R1), PLDL1KEEP		// prefetch hint for the first source line
	CMP	$256, R2
	BLO	amove_tail64		// fewer than 256 bytes: skip the wide loop
amove_loop256:
	// Post-indexed loads/stores advance R1/R0 by 64 each.
	VLD1.P	64(R1), [V0.B16, V1.B16, V2.B16, V3.B16]
	VLD1.P	64(R1), [V4.B16, V5.B16, V6.B16, V7.B16]
	VLD1.P	64(R1), [V8.B16, V9.B16, V10.B16, V11.B16]
	VLD1.P	64(R1), [V12.B16, V13.B16, V14.B16, V15.B16]
	VST1.P	[V0.B16, V1.B16, V2.B16, V3.B16], 64(R0)
	VST1.P	[V4.B16, V5.B16, V6.B16, V7.B16], 64(R0)
	VST1.P	[V8.B16, V9.B16, V10.B16, V11.B16], 64(R0)
	VST1.P	[V12.B16, V13.B16, V14.B16, V15.B16], 64(R0)
	SUB	$256, R2, R2
	CMP	$256, R2
	BHS	amove_loop256		// unsigned compare: count is a uint64
amove_tail64:
	CMP	$64, R2
	BLO	amove_tail1
amove_loop64:
	VLD1.P	64(R1), [V0.B16, V1.B16, V2.B16, V3.B16]
	VST1.P	[V0.B16, V1.B16, V2.B16, V3.B16], 64(R0)
	SUB	$64, R2, R2
	CMP	$64, R2
	BHS	amove_loop64
amove_tail1:
	CBZ	R2, amove_done
amove_loop1:
	MOVBU.P	1(R1), R3
	MOVBU.P	R3, 1(R0)
	SUBS	$1, R2, R2
	BNE	amove_loop1
amove_done:
	RET
// func bmove(dst, src []byte, count uint64)
//
// Copies exactly count bytes from src to dst, front to back, using
// general-purpose register pairs:
//   - 64 bytes per pass (four LDP + four STP) while >= 64 remain,
//   - then one byte at a time for the final 0..63 bytes.
// Nothing is copied when count is 0. The slice lengths are not consulted;
// the caller must guarantee both buffers hold count bytes. Overlapping
// buffers are not supported.
//
// Only R0-R11 are used, which keeps the routine away from R18 (platform
// register) and R27 (reserved for the assembler/linker).
//
// Frame: no locals, 56 bytes of arguments (two 24-byte slice headers + uint64).
TEXT ·bmove(SB),NOSPLIT,$0-56
	MOVD	dst_base+0(FP), R0	// R0 = destination cursor
	MOVD	src_base+24(FP), R1	// R1 = source cursor
	MOVD	count+48(FP), R2	// R2 = bytes still to copy (unsigned)
	PRFM	(R1), PLDL1KEEP		// prefetch hint for the first source line
	CMP	$64, R2
	BLO	bmove_tail1		// fewer than 64 bytes: skip the block loop
bmove_loop64:
	LDP	(R1), (R4, R5)
	LDP	16(R1), (R6, R7)
	LDP	32(R1), (R8, R9)
	LDP	48(R1), (R10, R11)
	STP	(R4, R5), (R0)
	STP	(R6, R7), 16(R0)
	STP	(R8, R9), 32(R0)
	STP	(R10, R11), 48(R0)
	// Advance both cursors explicitly by the full block size so that
	// consecutive passes never overlap.
	ADD	$64, R1, R1
	ADD	$64, R0, R0
	SUB	$64, R2, R2
	CMP	$64, R2
	BHS	bmove_loop64		// unsigned compare: count is a uint64
bmove_tail1:
	CBZ	R2, bmove_done
bmove_loop1:
	MOVBU.P	1(R1), R3
	MOVBU.P	R3, 1(R0)
	SUBS	$1, R2, R2
	BNE	bmove_loop1
bmove_done:
	RET
package simd
import (
"crypto/rand"
"testing"
)
const bulk = 3
// BenchmarkAmove times repeated amove calls on a pair of 64*bulk-byte
// buffers and reports throughput based on that size.
func BenchmarkAmove(b *testing.B) {
	const size = 64 * bulk

	dst, src := make([]byte, size), make([]byte, size)

	b.SetBytes(size)
	b.ResetTimer() // keep the allocations above out of the measurement
	for remaining := b.N; remaining > 0; remaining-- {
		amove(dst, src, size)
	}
}
// BenchmarkBmove times repeated bmove calls on a pair of 64*bulk-byte
// buffers and reports throughput based on that size.
func BenchmarkBmove(b *testing.B) {
	const size = 64 * bulk

	dst, src := make([]byte, size), make([]byte, size)

	b.SetBytes(size)
	b.ResetTimer() // keep the allocations above out of the measurement
	for remaining := b.N; remaining > 0; remaining-- {
		bmove(dst, src, size)
	}
}
// TestAmove fills a 64*bulk-byte source with random data, runs amove, and
// checks that dst matches the data byte for byte.
func TestAmove(t *testing.T) {
	const size = 64 * bulk

	dst := make([]byte, size)
	chk := make([]byte, size)
	src := make([]byte, size)

	// Without random input the comparison below would be all zeros against
	// all zeros and could never fail, so a read error must abort the test.
	if _, err := rand.Read(src); err != nil {
		t.Fatalf("rand.Read: %v", err)
	}

	// Compare against a snapshot taken before the call rather than against
	// src itself, which has been handed to the routine under test.
	copy(chk, src)

	amove(dst, src, size)

	for i, want := range chk {
		if dst[i] != want {
			// Report only the first mismatch.
			t.Fatalf("dst[%d]%x != src[%d]%x", i, dst[i], i, want)
		}
	}
}
// TestBmove fills a 64*bulk-byte source with random data, runs bmove, and
// checks that dst matches the data byte for byte.
func TestBmove(t *testing.T) {
	const size = 64 * bulk

	dst := make([]byte, size)
	chk := make([]byte, size)
	src := make([]byte, size)

	// Without random input the comparison below would be all zeros against
	// all zeros and could never fail, so a read error must abort the test.
	if _, err := rand.Read(src); err != nil {
		t.Fatalf("rand.Read: %v", err)
	}

	// Compare against a snapshot taken before the call rather than against
	// src itself, which has been handed to the routine under test.
	copy(chk, src)

	bmove(dst, src, size)

	for i, want := range chk {
		if dst[i] != want {
			// Report only the first mismatch.
			t.Fatalf("dst[%d]%x != src[%d]%x", i, dst[i], i, want)
		}
	}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment