Skip to content

Instantly share code, notes, and snippets.

@donovanhide
Created December 15, 2012 17:21
Show Gist options
  • Save donovanhide/4297349 to your computer and use it in GitHub Desktop.
Save donovanhide/4297349 to your computer and use it in GitHub Desktop.
First attempt at fast summing of a slice of uint8 using SSE assembly
go test sum -v -benchtime=0.1s
=== RUN TestSmallSumBytes
--- PASS: TestSmallSumBytes (0.00 seconds)
=== RUN TestSumBytes
--- PASS: TestSumBytes (2.52 seconds)
=== RUN TestBenchmark
--- PASS: TestBenchmark (10.96 seconds)
sum_test.go:75: Benchmark Results
Length: 0 Fast: 5.42 ns/op Slow: 4.93 ns/op Improvement -9.03%
Length: 1 Fast: 7.56 ns/op Slow: 5.26 ns/op Improvement -30.39%
Length: 2 Fast: 7.81 ns/op Slow: 5.88 ns/op Improvement -24.71%
Length: 3 Fast: 7.62 ns/op Slow: 7.04 ns/op Improvement -7.57%
Length: 4 Fast: 7.69 ns/op Slow: 8.07 ns/op Improvement 4.88%
Length: 5 Fast: 8.02 ns/op Slow: 8.97 ns/op Improvement 11.92%
Length: 6 Fast: 7.68 ns/op Slow: 10.09 ns/op Improvement 31.37%
Length: 7 Fast: 9.74 ns/op Slow: 11.31 ns/op Improvement 16.12%
Length: 8 Fast: 7.84 ns/op Slow: 12.65 ns/op Improvement 61.35%
Length: 15 Fast: 7.70 ns/op Slow: 21.91 ns/op Improvement 184.54%
Length: 16 Fast: 8.02 ns/op Slow: 23.20 ns/op Improvement 189.05%
Length: 31 Fast: 9.33 ns/op Slow: 42.76 ns/op Improvement 358.28%
Length: 32 Fast: 7.91 ns/op Slow: 43.11 ns/op Improvement 444.86%
Length: 63 Fast: 12.29 ns/op Slow: 81.86 ns/op Improvement 566.01%
Length: 64 Fast: 10.50 ns/op Slow: 86.07 ns/op Improvement 719.70%
Length: 100 Fast: 15.70 ns/op Slow: 106.16 ns/op Improvement 576.10%
Length: 135 Fast: 18.97 ns/op Slow: 146.31 ns/op Improvement 671.24%
Length: 256 Fast: 38.62 ns/op Slow: 263.35 ns/op Improvement 581.92%
Length: 1024 Fast: 112.32 ns/op Slow: 1000.87 ns/op Improvement 791.06%
Length: 65536 Fast: 6490.84 ns/op Slow: 62188.00 ns/op Improvement 858.09%
PASS
ok sum 13.611s
DATA lut+0x00(SB)/8, $0x00000000000000FF
DATA lut+0x08(SB)/8, $0x0000000000000000
DATA lut+0x10(SB)/8, $0x000000000000FFFF
DATA lut+0x18(SB)/8, $0x0000000000000000
DATA lut+0x20(SB)/8, $0x0000000000FFFFFF
DATA lut+0x28(SB)/8, $0x0000000000000000
DATA lut+0x30(SB)/8, $0x00000000FFFFFFFF
DATA lut+0x38(SB)/8, $0x0000000000000000
DATA lut+0x40(SB)/8, $0x000000FFFFFFFFFF
DATA lut+0x48(SB)/8, $0x0000000000000000
DATA lut+0x50(SB)/8, $0x0000FFFFFFFFFFFF
DATA lut+0x58(SB)/8, $0x0000000000000000
DATA lut+0x60(SB)/8, $0x00FFFFFFFFFFFFFF
DATA lut+0x68(SB)/8, $0x0000000000000000
DATA lut+0x70(SB)/8, $0xFFFFFFFFFFFFFFFF
DATA lut+0x78(SB)/8, $0x0000000000000000
DATA lut+0x80(SB)/8, $0xFFFFFFFFFFFFFFFF
DATA lut+0x88(SB)/8, $0x00000000000000FF
DATA lut+0x90(SB)/8, $0xFFFFFFFFFFFFFFFF
DATA lut+0x98(SB)/8, $0x000000000000FFFF
DATA lut+0xA0(SB)/8, $0xFFFFFFFFFFFFFFFF
DATA lut+0xA8(SB)/8, $0x0000000000FFFFFF
DATA lut+0xB0(SB)/8, $0xFFFFFFFFFFFFFFFF
DATA lut+0xB8(SB)/8, $0x00000000FFFFFFFF
DATA lut+0xC0(SB)/8, $0xFFFFFFFFFFFFFFFF
DATA lut+0xC8(SB)/8, $0x000000FFFFFFFFFF
DATA lut+0xD0(SB)/8, $0xFFFFFFFFFFFFFFFF
DATA lut+0xD8(SB)/8, $0x0000FFFFFFFFFFFF
DATA lut+0xE0(SB)/8, $0xFFFFFFFFFFFFFFFF
DATA lut+0xE8(SB)/8, $0x00FFFFFFFFFFFFFF
DATA lut+0xF0(SB)/8, $0xFFFFFFFFFFFFFFFF
DATA lut+0xF8(SB)/8, $0xFFFFFFFFFFFFFFFF
GLOBL lut(SB), $0x100
// func FastSumUint8(x []uint8) uint64
TEXT ·FastSumUint8(SB),7,$0
MOVQ x+0(FP), SI // data pointer
MOVQ x+8(FP), CX // len(x)
XORQ AX, AX // result
PXOR X0,X0 // parallel total
CMPQ CX,$0
// Zero Length
JE done
// Less than 16 bytes
CMPQ CX,$16
JLE small
large:
PXOR X1, X1
PSADBW (SI),X1
PADDQ X1,X0
ADDQ $16, SI
SUBQ $16, CX
CMPQ CX, $16
JGE large
CMPQ CX,$0
JE total
small:
MOVOU (SI),X1
PXOR X3, X3
SUBQ $1,CX
ADDQ CX,CX
MOVOU lut(SB)(CX*8),X2
PAND X2,X1
PSADBW X1,X3
PADDQ X3,X0
total:
MOVQ X0,BX
MOVHLPS X0,X1
MOVQ X1,AX
ADDQ BX,AX
done:
MOVQ AX,ret+24(FP)
RET
package sum
import (
"fmt"
"math/rand"
"reflect"
"testing"
"testing/quick"
)
func FastSumUint8(x []uint8) uint64
func SumUint8(x []uint8) uint64 {
sum := uint64(0)
for _, v := range x {
sum += uint64(v)
}
return sum
}
type TestSlice []uint8
func buildTestSlice(length int, capacity int) TestSlice {
test := make(TestSlice, length, capacity)
for i := range test {
test[i] = uint8(rand.Uint32())
}
return test
}
func (s TestSlice) Generate(rand *rand.Rand, size int) reflect.Value {
length := rand.Int() % (1 << 16)
return reflect.ValueOf(buildTestSlice(length, length+rand.Int()%1024))
}
func TestSmallSumBytes(t *testing.T) {
slice := buildTestSlice(32, 32)
for i := range slice {
slow := SumUint8(slice[:i])
fast := FastSumUint8(slice[:i])
if fast != slow {
t.Errorf("%v %v Fast: %v Slow: %v", i, slice[:i], fast, slow)
}
}
}
func TestSumBytes(t *testing.T) {
config := &quick.Config{MaxCount: 1 << 10}
fast := func(x TestSlice) uint64 { return FastSumUint8(x) }
slow := func(x TestSlice) uint64 { return SumUint8(x) }
if err := quick.CheckEqual(fast, slow, config); err != nil {
t.Error(err)
}
}
func runBenchmark(f func(x []uint8) uint64, numbers []uint8, b *testing.B) {
b.ResetTimer()
for i := 0; i < b.N; i++ {
f(numbers)
}
}
func TestBenchmark(t *testing.T) {
benchmarks := []int{0, 1, 2, 3, 4, 5, 6, 7, 8, 15, 16, 31, 32, 63, 64, 100, 135, 256, 1 << 10, 1 << 16}
results := "Benchmark Results\n"
for _, n := range benchmarks {
slice := buildTestSlice(n, n)
fast := testing.Benchmark(func(b *testing.B) { runBenchmark(FastSumUint8, slice, b) })
slow := testing.Benchmark(func(b *testing.B) { runBenchmark(SumUint8, slice, b) })
fastNsPerOp := float64(fast.T.Nanoseconds()) / float64(fast.N)
slowNsPerOp := float64(slow.T.Nanoseconds()) / float64(slow.N)
results += fmt.Sprintf("Length: %8d Fast:%12.2f ns/op Slow:%12.2f ns/op Improvement %6.2f%%\n", n, fastNsPerOp, slowNsPerOp, ((1/(fastNsPerOp/slowNsPerOp))-1)*100)
}
t.Log(results)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment