Skip to content

Instantly share code, notes, and snippets.

@awreece
Created April 15, 2012 22:20
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save awreece/2395102 to your computer and use it in GitHub Desktop.
Save awreece/2395102 to your computer and use it in GitHub Desktop.
Check Malloc for False Sharing
areece@areece-laptop:~/coding/gomalloc$ ./malloccheck --checkcache=true --procs=1
Malloc check ran with 1 procs for 1000000 iters of size 16
Cache line shared 0 (0.00%) times
Cache was reused 993536 (99.35%) times
areece@areece-laptop:~/coding/gomalloc$ ./malloccheck --checkcache=true --procs=2
Malloc check ran with 2 procs for 1000000 iters of size 16
Cache line shared 128186 (12.82%) times
Cache was reused 859014 (85.90%) times
areece@areece-laptop:~/coding/gomalloc$ ./malloccheck --checkcache=true --procs=3
Malloc check ran with 3 procs for 1000000 iters of size 16
Cache line shared 157340 (15.73%) times
Cache was reused 830371 (83.04%) times
areece@areece-laptop:~/coding/gomalloc$ ./malloccheck --checkcache=true --procs=4
Malloc check ran with 4 procs for 1000000 iters of size 16
Cache line shared 177130 (17.71%) times
Cache was reused 810390 (81.04%) times
for i in {1..4}; do
./malloccheck --checkcache=false --procs=$i --prof=out.prof;
go tool pprof --text malloccheck out.prof;
done
Total: 20 samples
8 40.0% 40.0% 20 100.0% main.do_thread
3 15.0% 55.0% 3 15.0% runtime.MCache_Free
2 10.0% 65.0% 12 60.0% runtime.makeslice
2 10.0% 75.0% 9 45.0% runtime.mallocgc
2 10.0% 85.0% 5 25.0% sweep
1 5.0% 90.0% 1 5.0% cachestats
1 5.0% 95.0% 10 50.0% makeslice1
1 5.0% 100.0% 1 5.0% runtime.MCache_Alloc
0 0.0% 100.0% 6 30.0% runtime.gc
0 0.0% 100.0% 20 100.0% schedunlock
Total: 23 samples
6 26.1% 26.1% 22 95.7% main.do_thread
4 17.4% 43.5% 13 56.5% runtime.mallocgc
3 13.0% 56.5% 3 13.0% scanblock
2 8.7% 65.2% 16 69.6% runtime.makeslice
1 4.3% 69.6% 14 60.9% makeslice1
1 4.3% 73.9% 2 8.7% runtime.MCache_Alloc
1 4.3% 78.3% 1 4.3% runtime.SizeToClass
1 4.3% 82.6% 1 4.3% runtime.casp
1 4.3% 87.0% 1 4.3% runtime.lock
1 4.3% 91.3% 2 8.7% runtime.markallocated
1 4.3% 95.7% 1 4.3% runtime.markspan
1 4.3% 100.0% 1 4.3% runtime.purgecachedstats
0 0.0% 100.0% 1 4.3% MCentral_Free
0 0.0% 100.0% 1 4.3% MCentral_Grow
0 0.0% 100.0% 1 4.3% ReleaseN
0 0.0% 100.0% 3 13.0% mark
0 0.0% 100.0% 1 4.3% runtime.MCache_Free
0 0.0% 100.0% 1 4.3% runtime.MCentral_AllocList
0 0.0% 100.0% 1 4.3% runtime.MCentral_FreeList
0 0.0% 100.0% 1 4.3% runtime.MHeap_Free
0 0.0% 100.0% 4 17.4% runtime.gc
0 0.0% 100.0% 22 95.7% schedunlock
0 0.0% 100.0% 1 4.3% sweep
Total: 23 samples
7 30.4% 30.4% 14 60.9% runtime.mallocgc
4 17.4% 47.8% 20 87.0% main.do_thread
2 8.7% 56.5% 16 69.6% runtime.makeslice
2 8.7% 65.2% 3 13.0% runtime.markallocated
1 4.3% 69.6% 2 8.7% MCentral_Free
1 4.3% 73.9% 14 60.9% makeslice1
1 4.3% 78.3% 3 13.0% runtime.MCache_Alloc
1 4.3% 82.6% 1 4.3% runtime.casp
1 4.3% 87.0% 1 4.3% runtime.markspan
1 4.3% 91.3% 1 4.3% runtime.memclr
1 4.3% 95.7% 1 4.3% runtime.unmarkspan
1 4.3% 100.0% 3 13.0% sweep
0 0.0% 100.0% 1 4.3% MCentral_Grow
0 0.0% 100.0% 2 8.7% ReleaseN
0 0.0% 100.0% 2 8.7% nextgandunlock
0 0.0% 100.0% 2 8.7% runtime.MCache_Free
0 0.0% 100.0% 1 4.3% runtime.MCentral_AllocList
0 0.0% 100.0% 2 8.7% runtime.MCentral_FreeList
0 0.0% 100.0% 1 4.3% runtime.clone
0 0.0% 100.0% 1 4.3% runtime.gc
0 0.0% 100.0% 2 8.7% runtime.gchelper
0 0.0% 100.0% 1 4.3% runtime.mcall
0 0.0% 100.0% 1 4.3% runtime.mstart
0 0.0% 100.0% 2 8.7% schedule
0 0.0% 100.0% 20 87.0% schedunlock
Total: 26 samples
6 23.1% 23.1% 22 84.6% main.do_thread
4 15.4% 38.5% 7 26.9% runtime.MCache_Free
4 15.4% 53.8% 4 15.4% runtime.casp
3 11.5% 65.4% 14 53.8% runtime.mallocgc
2 7.7% 73.1% 3 11.5% MCentral_Free
2 7.7% 80.8% 8 30.8% sweep
1 3.8% 84.6% 15 57.7% makeslice1
1 3.8% 88.5% 1 3.8% runtime.SizeToClass
1 3.8% 92.3% 16 61.5% runtime.makeslice
1 3.8% 96.2% 5 19.2% runtime.markallocated
1 3.8% 100.0% 1 3.8% runtime.xchg
0 0.0% 100.0% 3 11.5% ReleaseN
0 0.0% 100.0% 3 11.5% nextgandunlock
0 0.0% 100.0% 3 11.5% runtime.MCentral_FreeList
0 0.0% 100.0% 3 11.5% runtime.clone
0 0.0% 100.0% 5 19.2% runtime.gc
0 0.0% 100.0% 3 11.5% runtime.gchelper
0 0.0% 100.0% 1 3.8% runtime.lock
0 0.0% 100.0% 3 11.5% runtime.mstart
0 0.0% 100.0% 3 11.5% schedule
0 0.0% 100.0% 22 84.6% schedunlock
// malloc test for false sharing
package main
import (
"flag"
"fmt"
"log"
"os"
"runtime"
"runtime/pprof"
"sync"
"unsafe"
)
var (
allocSize int // Size of each memory allocation, in bytes. Flag: -size.
checkCache bool // Whether to run the cache simulator on each allocation. Flag.
logOut string // File to send log output to; empty disables logging. Flag: -log.
niters int // Total number of iterations, split across all threads. Flag: -iters.
nprocs int // Number of worker threads / GOMAXPROCS value. Flag: -procs.
profOut string // File to send pprof output to; empty disables profiling. Flag: -prof.
)
var emitlog bool // True iff logOut != "" and the log file was opened successfully.
var cLock sync.Mutex // Guards cache, reacquire, and move.
var cache = make(map[uintptr]int) // Map of cache line number to last tid to access it.
var reacquire int // Counter: times a thread re-got a line it already owned (cache reuse).
var move int // Counter: times a line changed owning thread (potential false sharing).
const cacheWidthBytes uint = 6 // log2 of the cache line size: 2^6 = 64-byte lines.
// init registers the command-line flags that configure the benchmark.
//
// BUG FIX: the cache-check flag was registered as "checkCache", but Go
// flag names are case-sensitive and every recorded invocation of this
// tool passes --checkcache (all lowercase), which would have failed with
// "flag provided but not defined". Register the lowercase name so the
// binary matches its documented usage.
func init() {
	flag.IntVar(&nprocs, "procs", 1, "Value to set GOMAXPROCS")
	flag.IntVar(&allocSize, "size", 16, "Size of block to allocate")
	flag.IntVar(&niters, "iters", 1000000, "Number of repetitions to make")
	flag.StringVar(&profOut, "prof", "", "Emit profiling data to file")
	flag.StringVar(&logOut, "log", "", "Emit log messages to file")
	flag.BoolVar(&checkCache, "checkcache", true, "Check for cache sharing")
}
// updateCache records that thread tid just received the allocation at ptr,
// bumping the reuse counter when tid already owned ptr's cache line and the
// sharing counter when the line previously belonged to a different thread.
func updateCache(tid int, ptr uintptr) {
	// Identify the cache line containing ptr.
	line := ptr >> cacheWidthBytes
	// Lock only around the map and counter updates; logging happens after
	// the unlock so we never hold the lock while formatting output.
	cLock.Lock()
	prevTid, seen := cache[line]
	cache[line] = tid
	switch {
	case seen && prevTid == tid:
		reacquire++
	case seen:
		move++
	}
	cLock.Unlock()
	if emitlog {
		switch {
		case !seen:
			log.Printf("%d] got %#x", tid, ptr)
		case prevTid != tid:
			log.Printf("%d] got %#x (previously %d)", tid, ptr, prevTid)
		default:
			log.Printf("%d] regot %#x", tid, ptr)
		}
	}
}
// ptr2uintptr returns the numeric address of the first element of the
// byte slice ptr. Panics if the slice is empty (index out of range).
func ptr2uintptr(ptr []byte) uintptr {
	first := unsafe.Pointer(&ptr[0])
	return uintptr(first)
}
// do_thread is the per-worker body. It pins itself to an OS thread, then
// performs niters allocations of allocSize bytes, optionally feeding each
// allocation's address to the cache simulator, and writes every byte of
// the block so the allocation is actually touched. Signals wg when done.
//
// Fix: `for i, _ := range b` is the non-idiomatic form (gofmt/vet style);
// the blank identifier is dropped.
func do_thread(tid int, niters int, wg *sync.WaitGroup) {
	runtime.LockOSThread()
	defer runtime.UnlockOSThread()
	for iter := 0; iter < niters; iter++ {
		b := make([]byte, allocSize)
		if checkCache {
			updateCache(tid, ptr2uintptr(b))
		}
		// Write a bunch to b so the block is really used.
		for i := range b {
			b[i] = byte(i)
		}
		// Intentionally drop the reference to b so it becomes garbage.
		b = nil
	}
	wg.Done()
}
func printCacheStats() {
fmt.Printf("Malloc check ran with %d procs for %d iters of size %d\n",
nprocs, niters, allocSize)
fmt.Printf("\tCache line shared %d (%.2f%%) times\n",
move, float32(move)/float32(niters)*100)
fmt.Printf("\tCache was reused %d (%.2f%%) times\n",
reacquire, float32(reacquire)/float32(niters)*100)
}
// main parses flags, optionally enables CPU profiling and file logging,
// runs nprocs workers splitting niters iterations between them, and
// finally prints the cache-simulator statistics.
func main() {
	flag.Parse()
	if profOut != "" {
		f, err := os.Create(profOut)
		if err != nil {
			log.Fatal(err)
		}
		// BUG FIX: StartCPUProfile returns an error (e.g. profiling
		// already active) that was previously ignored.
		if err := pprof.StartCPUProfile(f); err != nil {
			log.Fatal(err)
		}
		defer pprof.StopCPUProfile()
	}
	if logOut != "" {
		f, err := os.Create(logOut)
		if err != nil {
			log.Fatal(err)
		}
		emitlog = true
		log.SetOutput(f)
		log.SetFlags(log.Lmicroseconds)
	}
	runtime.GOMAXPROCS(nprocs)
	runtime.GC() // clean up garbage from init
	var wg sync.WaitGroup
	for i := 0; i < nprocs; i++ {
		wg.Add(1)
		// NOTE(review): integer division means up to nprocs-1 iterations
		// are dropped when nprocs does not divide niters, so the printed
		// percentages are computed against a slightly larger denominator.
		go do_thread(i, niters/nprocs, &wg)
	}
	wg.Wait()
	if checkCache {
		printCacheStats()
	}
}
for i in {1..4}; do
perf stat -e cache-misses -e migrations ./malloccheck --procs=$i --checkcache=false;
done
Performance counter stats for './malloccheck --procs=1 --checkcache=false':
13,281 cache-misses
8 migrations
0.214455000 seconds time elapsed
Performance counter stats for './malloccheck --procs=2 --checkcache=false':
15,124 cache-misses
41 migrations
0.175444727 seconds time elapsed
Performance counter stats for './malloccheck --procs=3 --checkcache=false':
18,581 cache-misses
70 migrations
0.164377330 seconds time elapsed
Performance counter stats for './malloccheck --procs=4 --checkcache=false':
19,848 cache-misses
74 migrations
0.159023927 seconds time elapsed
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment