@sumerc
Last active September 28, 2022 10:11
pprof benchmark code v1
package main

import (
	"bytes"
	"fmt"
	"os"

	//"runtime"
	"runtime/pprof"
	"strconv"
	"sync"
	"time"

	pprofile "github.com/google/pprof/profile"
)
var wg sync.WaitGroup
/*
How I ran it:

	go build main2.go && sudo GOGC=off perf stat -d ./main2 20 profile

and

	go build main2.go && sudo GOGC=off perf stat -d ./main2 20 profileoff

then compared the elapsed times. Here is an example result:
pprof-overhead (main*) » go build main2.go && sudo GOGC=off perf stat -d ./main2 20 profileoff ~/Desktop/p/my-go-playground/pprof-overhead
Starting benchmark...
Done. avg: 397.335839ms, min:391.252319ms, max:407.334504ms
Performance counter stats for './main2 20 profileoff':
8,202.59 msec task-clock # 1.001 CPUs utilized
1,193 context-switches # 0.145 K/sec
5 cpu-migrations # 0.001 K/sec
3,870,412 page-faults # 0.472 M/sec
36,335,529,823 cycles # 4.430 GHz (50.06%)
97,220,434,963 instructions # 2.68 insn per cycle (62.58%)
24,922,002,510 branches # 3038.307 M/sec (62.58%)
8,874,605 branch-misses # 0.04% of all branches (62.58%)
31,584,655,224 L1-dcache-loads # 3850.569 M/sec (62.58%)
260,937,935 L1-dcache-load-misses # 0.83% of all L1-dcache hits (62.42%)
9,954,231 LLC-loads # 1.214 M/sec (49.90%)
4,206,466 LLC-load-misses # 42.26% of all LL-cache hits (49.90%)
8.196124365 seconds time elapsed
5.419365000 seconds user
2.789732000 seconds sys
pprof-overhead (main*) » go build main2.go && sudo GOGC=off perf stat -d ./main2 20 profile ~/Desktop/p/my-go-playground/pprof-overhead
cpu profiler is on
Starting benchmark...
Done. avg: 402.060052ms, min:393.520596ms, max:413.33551ms
Performance counter stats for './main2 20 profile':
8,347.27 msec task-clock # 0.981 CPUs utilized
1,682 context-switches # 0.202 K/sec
12 cpu-migrations # 0.001 K/sec
3,871,087 page-faults # 0.464 M/sec
36,663,980,903 cycles # 4.392 GHz (50.12%)
97,169,632,931 instructions # 2.65 insn per cycle (62.62%)
24,851,892,104 branches # 2977.247 M/sec (62.63%)
9,278,597 branch-misses # 0.04% of all branches (62.66%)
31,504,531,471 L1-dcache-loads # 3774.231 M/sec (62.66%)
523,097,299 L1-dcache-load-misses # 1.66% of all L1-dcache hits (62.33%)
19,507,961 LLC-loads # 2.337 M/sec (49.82%)
4,100,867 LLC-load-misses # 21.02% of all LL-cache hits (49.79%)
8.511315596 seconds time elapsed
5.498909000 seconds user
2.863598000 seconds sys
*/
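/*
A quick comparison of the two runs above (my own arithmetic on the numbers shown,
not part of the perf output): total elapsed time goes from ~8.196s to ~8.511s
(~3.8%), the per-iteration average from ~397.3ms to ~402.1ms (~1.2%), while
L1-dcache-load-misses roughly double (261M -> 523M) and LLC-loads roughly double
(10.0M -> 19.5M). That seems consistent with the cache-pollution idea in the
observations further down.
*/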
/*
> lscpu
Architecture: x86_64
CPU op-mode(s): 32-bit, 64-bit
Byte Order: Little Endian
CPU(s): 12
On-line CPU(s) list: 0-11
Thread(s) per core: 2
Core(s) per socket: 6
Socket(s): 1
NUMA node(s): 1
Vendor ID: GenuineIntel
CPU family: 6
Model: 158
Model name: Intel(R) Core(TM) i7-8700 CPU @ 3.20GHz
Stepping: 10
CPU MHz: 1556.894
CPU max MHz: 4600.0000
CPU min MHz: 800.0000
BogoMIPS: 6399.96
Virtualization: VT-x
L1d cache: 32K
L1i cache: 32K
L2 cache: 256K
L3 cache: 12288K
NUMA node0 CPU(s): 0-11
Flags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx pdpe1gb rdtscp lm constant_tsc art arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc cpuid aperfmperf pni pclmulqdq dtes64 monitor ds_cpl vmx smx est tm2 ssse3 sdbg fma cx16 xtpr pdcm pcid sse4_1 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand lahf_lm abm 3dnowprefetch cpuid_fault epb invpcid_single pti ssbd ibrs ibpb stibp tpr_shadow vnmi flexpriority ept vpid ept_ad fsgsbase tsc_adjust bmi1 hle avx2 smep bmi2 erms invpcid rtm mpx rdseed adx smap clflushopt intel_pt xsaveopt xsavec xgetbv1 xsaves dtherm ida arat pln pts hwp hwp_notify hwp_act_window hwp_epp md_clear flush_l1d arch_capabilities
*/
const nrow = 400
const ncol = 60
var x [][]float32
var y [][]float32
var out [][]float32
// Observations:
//   - Whether out is local to the function matters: when it is global it is, I guess,
//     already resident in the L2 cache.
//   - Increasing nrow/ncol beyond the L2 cache size mattered a bit; I saw a consistent ~2%.
//   - I get worse results with runtime.NumCPU() goroutines (more threads): the profiling
//     signal picks one of those threads more or less fairly and dirties its CPU cache.
//     I would expect the same to happen on a single thread with GOMAXPROCS, but maybe
//     GOMAXPROCS already invalidates the CPU cache because of concurrent GC runs, so I
//     think having more threads is the better setup. Try GOMAXPROCS with GOGC=off ...
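
// A possible follow-up experiment for the last point above (a sketch only, not
// something that was run here): limit execution to a single P, so that only one
// thread runs Go code at a time, either via the environment,
//
//	sudo GOMAXPROCS=1 GOGC=off perf stat -d ./main2 20 profile
//
// or programmatically at startup (this would need the "runtime" import re-enabled):
//
//	func init() {
//		runtime.GOMAXPROCS(1)
//	}
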
func initMatrix() [][]float32 {
	r := make([][]float32, nrow)
	for i := 0; i < nrow; i++ {
		r[i] = make([]float32, ncol)
		// for j := 0; j < ncol; j++ {
		// 	r[i][j] = float32(i + j)
		// }
	}
	return r
}
func combinationMatrix() {
	out := initMatrix()
	for i := 0; i < len(x); i++ {
		for j := 0; j < len(y[0]); j++ {
			//for k := 0; k < len(y); k++ {
			if out[i][j] == 0 { // increase branch misprediction
				out[i][j] += x[i][j] + y[i][j]
			}
			//}
		}
	}
}
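// recursiveTrampoline first descends n frames so that the actual work (and any
// profiling samples taken during it) runs under a deep call stack; presumably this
// is what the "make sure stack size is at max" note in burnCores refers to.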
func recursiveTrampoline(n int) {
	if n == 0 {
		for i := 0; i < 4000; i++ {
			combinationMatrix()
			//fib(100000)
		}
		//fib(1000000000)
		wg.Done()
		return
	}
	recursiveTrampoline(n - 1)
}
func burnCores() {
	// runtime.NumCPU()
	for k := 0; k < 1; k++ {
		wg.Add(1)
		go recursiveTrampoline(64) // make sure stack size is at max
		//go fib(10000000000)
	}
	wg.Wait()
}
func fib(n uint) uint {
	if n < 2 {
		return n
	}
	var a, b uint
	b = 1
	for n--; n > 0; n-- {
		a += b
		a, b = b, a
	}
	//fmt.Println(wg)
	//wg.Done()
	return b
}
// savePprofFromBuf parses the raw profile bytes collected in buf and writes them
// out as a pprof file.
func savePprofFromBuf(buf *bytes.Buffer, filename string) {
	prof, err := pprofile.Parse(buf)
	if err != nil {
		panic(err)
	}
	f, err := os.Create(filename)
	if err != nil {
		panic(err)
	}
	defer f.Close()
	// Write the profile to the file.
	if err := prof.Write(f); err != nil {
		panic(err)
	}
}
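
// The saved profile can then be inspected with the standard pprof tooling, e.g.:
//
//	go tool pprof -top p2.pb.gz
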
const nwarmup = 1
func main() {
	var buf bytes.Buffer
	var p bool

	//u, _ := strconv.ParseUint(os.Args[1], 10, 64)
	niter, _ := strconv.Atoi(os.Args[1])

	p = false
	if len(os.Args) > 2 && os.Args[2] == "profile" {
		fmt.Println("cpu profiler is on")
		//runtime.SetCPUProfileRate(10)
		if err := pprof.StartCPUProfile(&buf); err != nil {
			panic(err)
		}
		p = true
	}

	// init
	x = initMatrix()
	y = initMatrix()
	//out = initMatrix()

	// warmup - excluded from elapsed time
	// fmt.Println("Warming up...")
	// for i := 0; i < nwarmup; i++ {
	// 	burnCores()
	// }
	// fmt.Println("Warmup done.")

	fmt.Println("Starting benchmark...")
	var totelapsed, mine, maxe time.Duration
	for i := 0; i < niter; i++ {
		t0 := time.Now()
		burnCores()
		elapsed := time.Since(t0)
		if elapsed < mine || mine == 0 {
			mine = elapsed
		}
		if elapsed > maxe || maxe == 0 {
			maxe = elapsed
		}
		totelapsed += elapsed
	}
	fmt.Printf("Done. avg: %s, min:%s, max:%s\n", totelapsed/time.Duration(niter), mine, maxe)

	if p {
		pprof.StopCPUProfile()
		savePprofFromBuf(&buf, "p2.pb.gz")
	}
}