Skip to content

Instantly share code, notes, and snippets.

@sunhay
Last active December 30, 2021 02:56
Show Gist options
  • Save sunhay/9e946ff157ffe1d2fa6499049c284651 to your computer and use it in GitHub Desktop.
Cache-line bouncing (false sharing) in Go — benchmark comparing packed vs. padded mutexes.
MacBook Pro (13-inch, 2017)
Intel(R) Core(TM) i7-7567U CPU @ 3.50GHz
16 GB 2133 MHz LPDDR3
$ go test -bench=.
goos: darwin
goarch: amd64

Benchmark1ThreadNoCacheLineBouncing-4    	100000000	        15.2 ns/op
Benchmark1ThreadCacheLineBouncing-4      	100000000	        14.5 ns/op

Benchmark2ThreadsNoCacheLineBouncing-4   	100000000	        14.5 ns/op
Benchmark2ThreadsCacheLineBouncing-4     	30000000	        55.2 ns/op

Benchmark4ThreadsNoCacheLineBouncing-4   	50000000	        26.9 ns/op
Benchmark4ThreadsCacheLineBouncing-4     	10000000	       123 ns/op

Benchmark8ThreadsNoCacheLineBouncing-4   	30000000	        47.8 ns/op
Benchmark8ThreadsCacheLineBouncing-4     	10000000	       245 ns/op

PASS
ok  	scratchpad/golang/cache-bouncing	13.087s
package cbounce
import (
	"sync"
	"sync/atomic"
	"testing"
)
var sink uint64
// Cache line bouncing via false sharing:
// - False sharing occurs when threads on different processors modify variables that reside on the same cache line.
// - This invalidates the cache line and forces an update, which hurts performance.
// per https://software.intel.com/en-us/articles/avoiding-and-identifying-false-sharing-among-threads
// Finding out your cache line size
// Mac: $ sysctl hw.cachelinesize -> 64
// Linux: $ getconf LEVEL1_DCACHE_LINESIZE -> 64
type CacheBounce struct {
m []sync.Mutex
}
var cb CacheBounce
func benchmarkCachelineBouncing(b *testing.B, numThreads int) {
cb = CacheBounce{m: make([]sync.Mutex, numThreads)}
wg := sync.WaitGroup{}
b.ResetTimer()
for i := 0; i < numThreads; i++ {
wg.Add(1)
go func(i int, c uint64) {
for j := 0; j < b.N; j++ {
cb.m[i].Lock()
c++
cb.m[i].Unlock()
}
atomic.AddUint64(&sink, c) // To make sure the loops aren't being optimized out
wg.Done()
}(i, 0)
}
wg.Wait()
}
type NoCacheBounce struct {
m []PaddedMutex
}
type PaddedMutex struct {
sync.Mutex // 8 bytes
_ [7]uint64 // + 7 * 8 bytes
} // = 64 bytes
var ncb NoCacheBounce
func benchNoCacheLineBouncing(b *testing.B, numThreads int) {
ncb = NoCacheBounce{m: make([]PaddedMutex, numThreads)}
wg := sync.WaitGroup{}
b.ResetTimer()
for i := 0; i < numThreads; i++ {
wg.Add(1)
go func(i int, c uint64) {
for j := 0; j < b.N; j++ {
ncb.m[i].Lock()
c++
ncb.m[i].Unlock()
}
atomic.AddUint64(&sink, c) // To make sure the loops aren't being optimized out
wg.Done()
}(i, 0)
}
wg.Wait()
}
func Benchmark1ThreadNoCacheLineBouncing(b *testing.B) {
benchNoCacheLineBouncing(b, 1)
}
func Benchmark1ThreadCacheLineBouncing(b *testing.B) {
benchmarkCachelineBouncing(b, 1)
}
func Benchmark2ThreadsNoCacheLineBouncing(b *testing.B) {
benchNoCacheLineBouncing(b, 2)
}
func Benchmark2ThreadsCacheLineBouncing(b *testing.B) {
benchmarkCachelineBouncing(b, 2)
}
func Benchmark4ThreadsNoCacheLineBouncing(b *testing.B) {
benchNoCacheLineBouncing(b, 4)
}
func Benchmark4ThreadsCacheLineBouncing(b *testing.B) {
benchmarkCachelineBouncing(b, 4)
}
func Benchmark8ThreadsNoCacheLineBouncing(b *testing.B) {
benchNoCacheLineBouncing(b, 8)
}
func Benchmark8ThreadsCacheLineBouncing(b *testing.B) {
benchmarkCachelineBouncing(b, 8)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment