Skip to content

Instantly share code, notes, and snippets.

@sunhay
Last active December 30, 2021 02:56
Show Gist options
  • Save sunhay/9e946ff157ffe1d2fa6499049c284651 to your computer and use it in GitHub Desktop.
Cache-line bouncing (false sharing) in Go — benchmark comparing packed vs. padded mutexes.
MacBook Pro (13-inch, 2017)
Intel(R) Core(TM) i7-7567U CPU @ 3.50GHz
16 GB 2133 MHz LPDDR3
$ go test -bench=.
goos: darwin
goarch: amd64

Benchmark1ThreadNoCacheLineBouncing-4    	100000000	        15.2 ns/op
Benchmark1ThreadCacheLineBouncing-4      	100000000	        14.5 ns/op

Benchmark2ThreadsNoCacheLineBouncing-4   	100000000	        14.5 ns/op
Benchmark2ThreadsCacheLineBouncing-4     	30000000	        55.2 ns/op

Benchmark4ThreadsNoCacheLineBouncing-4   	50000000	        26.9 ns/op
Benchmark4ThreadsCacheLineBouncing-4     	10000000	       123 ns/op

Benchmark8ThreadsNoCacheLineBouncing-4   	30000000	        47.8 ns/op
Benchmark8ThreadsCacheLineBouncing-4     	10000000	       245 ns/op

PASS
ok  	scratchpad/golang/cache-bouncing	13.087s
package cbounce
import (
	"sync"
	"sync/atomic"
	"testing"
)
var sink uint64
// Cache line bouncing via false sharing:
// - False sharing occurs when threads on different processors modify variables that reside on the same cache line.
// - This invalidates the cache line and forces an update, which hurts performance.
// per https://software.intel.com/en-us/articles/avoiding-and-identifying-false-sharing-among-threads
// Finding out your cache line size
// Mac: $ sysctl hw.cachelinesize -> 64
// Linux: $ getconf LEVEL1_DCACHE_LINESIZE -> 64
type CacheBounce struct {
m []sync.Mutex
}
var cb CacheBounce
func benchmarkCachelineBouncing(b *testing.B, numThreads int) {
cb = CacheBounce{m: make([]sync.Mutex, numThreads)}
wg := sync.WaitGroup{}
b.ResetTimer()
for i := 0; i < numThreads; i++ {
wg.Add(1)
go func(i int, c uint64) {
for j := 0; j < b.N; j++ {
cb.m[i].Lock()
c++
cb.m[i].Unlock()
}
atomic.AddUint64(&sink, c) // To make sure the loops aren't being optimized out
wg.Done()
}(i, 0)
}
wg.Wait()
}
type NoCacheBounce struct {
m []PaddedMutex
}
type PaddedMutex struct {
sync.Mutex // 8 bytes
_ [7]uint64 // + 7 * 8 bytes
} // = 64 bytes
var ncb NoCacheBounce
func benchNoCacheLineBouncing(b *testing.B, numThreads int) {
ncb = NoCacheBounce{m: make([]PaddedMutex, numThreads)}
wg := sync.WaitGroup{}
b.ResetTimer()
for i := 0; i < numThreads; i++ {
wg.Add(1)
go func(i int, c uint64) {
for j := 0; j < b.N; j++ {
ncb.m[i].Lock()
c++
ncb.m[i].Unlock()
}
atomic.AddUint64(&sink, c) // To make sure the loops aren't being optimized out
wg.Done()
}(i, 0)
}
wg.Wait()
}
func Benchmark1ThreadNoCacheLineBouncing(b *testing.B) {
benchNoCacheLineBouncing(b, 1)
}
func Benchmark1ThreadCacheLineBouncing(b *testing.B) {
benchmarkCachelineBouncing(b, 1)
}
func Benchmark2ThreadsNoCacheLineBouncing(b *testing.B) {
benchNoCacheLineBouncing(b, 2)
}
func Benchmark2ThreadsCacheLineBouncing(b *testing.B) {
benchmarkCachelineBouncing(b, 2)
}
func Benchmark4ThreadsNoCacheLineBouncing(b *testing.B) {
benchNoCacheLineBouncing(b, 4)
}
func Benchmark4ThreadsCacheLineBouncing(b *testing.B) {
benchmarkCachelineBouncing(b, 4)
}
func Benchmark8ThreadsNoCacheLineBouncing(b *testing.B) {
benchNoCacheLineBouncing(b, 8)
}
func Benchmark8ThreadsCacheLineBouncing(b *testing.B) {
benchmarkCachelineBouncing(b, 8)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment