Merovius/README.md

## README.md

      
    Raw
  

              README.md
            
          
    The post Go is not C, so there is not an extreme fast way to merge slices
alleges a performance problem with Go's model of zeroing memory on allocation,
in cases where it might not be needed. The methodology is

Run a benchmark that merges some slices, by makeing one of the appropriate
size and copying the individual slices over
Run a benchmark that zeros a slice of the appropriate slice
Subtract the two numbers and call it the "overhead of zeroing"

I have some trouble with that methodology. For one, it assumes that the zeroing
actually happens on every allocation. However, there are strategies the runtime
could employ that would mitigate that cost, while still only returning zeroed
memory. For example, it could zero memory when releasing it, instead of when
allocating it - and only request zeroed pages from the OS (or zero pages when
getting them from the OS). That would mean under a realistic workload, the cost
of zeroing is paid mostly or only in a background GC thread. And as far as I know,
the Go runtime actually does something like that.
So I set out to write a better benchmark. I added a function to the runtime, which
allocates a slice without zeroing it. I then made that available using //go:linkname.
I then ran the same benchmarks the post above runs, once using a regular make and
once using my modified version. These are the results on my machine (I ran the benchmark
a couple of times to make sure the numbers are stable):
goos: linux       
goarch: amd64
pkg: x
cpu: Intel(R) Core(TM) i7-5600U CPU @ 2.60GHz
Benchmark_MergeSlicesZero-4     	   58723	     20018 ns/op
Benchmark_MergeSlicesNoZero-4   	   70490	     16722 ns/op
PASS
ok  	x	2.740s

So it does seem there is a significant effect for this benchmark, even under my revised methodology.
The profiling graphs also show the function memclrNoHeapPointers using significant CPU time for the
version with plain make and it doesn't have that function for the version with the modified make.
So, while I set out to show the original post was wrong, this seems to confirm their findings. I'm still
not entirely convinced this problem is significant in a real workload (where other allocations happen
and the GC would have more opportunities to do work in the background). But at least my immediate criticism
of the posts methodology is invalidated.
I attached the runtime patch (against go1.17beta1, commit dc00dc6c6bf3b5554e37f60799aec092276ff807), as
well as the benchmark code I used and the pprof output graphs below, so you can verify the results yourself
and experiment further.

  
## main_test.go
package main

import (
	"reflect"
	"testing"
	"unsafe"
)

var (
	x = make([]byte, 10000)
	y = make([]byte, 20000)
	z = make([]byte, 50000)

	w = MergeSlicesZero(x, y, z)

	r [128][]byte

	byteType unsafe.Pointer
)

func init() {
	var x interface{} = byte(0)
	byteType = *(*unsafe.Pointer)(unsafe.Pointer(&x))
}

//go:linkname makesliceNoZero runtime.makesliceNoZero
func makesliceNoZero(et unsafe.Pointer, len, cap int) unsafe.Pointer

func makeNoZero(len, cap int) (s []byte) {
	data := makesliceNoZero(byteType, len, cap)
	*(*reflect.SliceHeader)(unsafe.Pointer(&s)) = reflect.SliceHeader{
		Data: uintptr(data),
		Len:  len,
		Cap:  cap,
	}
	return s
}

func MergeSlicesZero(data ...[]byte) []byte {
	type _ int
	n := 0
	for _, s := range data {
		n += len(s)
	}
	r := make([]byte, 0, n)
	for _, s := range data {
		r = append(r, s...)
	}
	return r
}

func MergeSlicesNoZero(data ...[]byte) []byte {
	type _ int
	n := 0
	for _, s := range data {
		n += len(s)
	}
	r := makeNoZero(0, n)
	for _, s := range data {
		r = append(r, s...)
	}
	return r
}

func Benchmark_MergeSlicesZero(b *testing.B) {
	for i := 0; i < b.N; i++ {
		r[i&127] = MergeSlicesZero(x, y, z)
	}
}

func Benchmark_MergeSlicesNoZero(b *testing.B) {
	for i := 0; i < b.N; i++ {
		r[i&127] = MergeSlicesNoZero(x, y, z)
	}
}

## makesliceNoZero.patch
diff --git a/src/runtime/slice.go b/src/runtime/slice.go
index f9d4154acf..43839d38ae 100644
--- a/src/runtime/slice.go
+++ b/src/runtime/slice.go
@@ -98,6 +98,24 @@ func makeslice(et *_type, len, cap int) unsafe.Pointer {
 	return mallocgc(mem, et, true)
 }

+func makesliceNoZero(et *_type, len, cap int) unsafe.Pointer {
+	mem, overflow := math.MulUintptr(et.size, uintptr(cap))
+	if overflow || mem > maxAlloc || len < 0 || len > cap {
+		// NOTE: Produce a 'len out of range' error instead of a
+		// 'cap out of range' error when someone does make([]T, bignumber).
+		// 'cap out of range' is true too, but since the cap is only being
+		// supplied implicitly, saying len is clearer.
+		// See golang.org/issue/4085.
+		mem, overflow := math.MulUintptr(et.size, uintptr(len))
+		if overflow || mem > maxAlloc || len < 0 {
+			panicmakeslicelen()
+		}
+		panicmakeslicecap()
+	}
+
+	return mallocgc(mem, et, false)
+}
+
 func makeslice64(et *_type, len64, cap64 int64) unsafe.Pointer {
 	len := int(len64)
 	if int64(len) != len64 {

## pprof_nozero.svg

      
Display the source blob

    
Display the rendered blob

    
    Raw
  

              pprof_nozero.svg
            
          
      Sorry, something went wrong. Reload?
      Sorry, we cannot display this file.
      Sorry, this file is invalid so it cannot be displayed.
      
          Viewer requires iframe.
      
    
## pprof_zreo.svg

      
Display the source blob

    
Display the rendered blob

    
    Raw
  

              pprof_zreo.svg
            
          
      Sorry, something went wrong. Reload?
      Sorry, we cannot display this file.
      Sorry, this file is invalid so it cannot be displayed.
      
          Viewer requires iframe.
	package main

	import (
	"reflect"
	"testing"
	"unsafe"
	)

	var (
	x = make([]byte, 10000)
	y = make([]byte, 20000)
	z = make([]byte, 50000)

	w = MergeSlicesZero(x, y, z)

	r [128][]byte

	byteType unsafe.Pointer
	)

	func init() {
	var x interface{} = byte(0)
	byteType = (unsafe.Pointer)(unsafe.Pointer(&x))
	}

	//go:linkname makesliceNoZero runtime.makesliceNoZero
	func makesliceNoZero(et unsafe.Pointer, len, cap int) unsafe.Pointer

	func makeNoZero(len, cap int) (s []byte) {
	data := makesliceNoZero(byteType, len, cap)
	(reflect.SliceHeader)(unsafe.Pointer(&s)) = reflect.SliceHeader{
	Data: uintptr(data),
	Len: len,
	Cap: cap,
	}
	return s
	}

	func MergeSlicesZero(data ...[]byte) []byte {
	type _ int
	n := 0
	for _, s := range data {
	n += len(s)
	}
	r := make([]byte, 0, n)
	for _, s := range data {
	r = append(r, s...)
	}
	return r
	}

	func MergeSlicesNoZero(data ...[]byte) []byte {
	type _ int
	n := 0
	for _, s := range data {
	n += len(s)
	}
	r := makeNoZero(0, n)
	for _, s := range data {
	r = append(r, s...)
	}
	return r
	}

	func Benchmark_MergeSlicesZero(b *testing.B) {
	for i := 0; i < b.N; i++ {
	r[i&127] = MergeSlicesZero(x, y, z)
	}
	}

	func Benchmark_MergeSlicesNoZero(b *testing.B) {
	for i := 0; i < b.N; i++ {
	r[i&127] = MergeSlicesNoZero(x, y, z)
	}
	}
	diff --git a/src/runtime/slice.go b/src/runtime/slice.go
	index f9d4154acf..43839d38ae 100644
	--- a/src/runtime/slice.go
	+++ b/src/runtime/slice.go
	@@ -98,6 +98,24 @@ func makeslice(et *_type, len, cap int) unsafe.Pointer {
	return mallocgc(mem, et, true)
	}

	+func makesliceNoZero(et *_type, len, cap int) unsafe.Pointer {
	+ mem, overflow := math.MulUintptr(et.size, uintptr(cap))
	+ if overflow \|\| mem > maxAlloc \|\| len < 0 \|\| len > cap {
	+ // NOTE: Produce a 'len out of range' error instead of a
	+ // 'cap out of range' error when someone does make([]T, bignumber).
	+ // 'cap out of range' is true too, but since the cap is only being
	+ // supplied implicitly, saying len is clearer.
	+ // See golang.org/issue/4085.
	+ mem, overflow := math.MulUintptr(et.size, uintptr(len))
	+ if overflow \|\| mem > maxAlloc \|\| len < 0 {
	+ panicmakeslicelen()
	+ }
	+ panicmakeslicecap()
	+ }
	+
	+ return mallocgc(mem, et, false)
	+}
	+
	func makeslice64(et *_type, len64, cap64 int64) unsafe.Pointer {
	len := int(len64)
	if int64(len) != len64 {