Skip to content

Instantly share code, notes, and snippets.

@simonhf
Last active June 2, 2021 00:40
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save simonhf/351f91aae5366081b7742d25205f7534 to your computer and use it in GitHub Desktop.
Save simonhf/351f91aae5366081b7742d25205f7534 to your computer and use it in GitHub Desktop.
Golang profiling techniques

Learning about Golang go-routines, LWPs, async pre-emption, and timing

Create a script which is forced to execute non-cooperative go-routines one at a time because they are only running on a single LWP

  • Try to minimize the number of concurrent threads / LWP via GOMAXPROCS.
  • In theory, the go-routines should execute one after the other since they are non-cooperative?
  • Note: On a 16 core box, by default Golang spots the 16 cores available and GOMAXPROCS is set to 16.
$ cat blocking.go 
package main

import (
    "os"
    "fmt"
    "time"
    "strconv"
    "syscall"
    "runtime"
    "runtime/pprof"
    "runtime/trace"
)

var timeStart = time.Now()

func f(from string, score int) {
    t1 := time.Now()
    fmt.Printf("%f %6d=tid f(%s, %6d) // enter; .NumCPU()=%d .GOMAXPROCS(0)=%d .NumGoroutine()=%d\n", time.Now().Sub(timeStart).Seconds(), syscall.Gettid(), from, score, runtime.NumCPU(), runtime.GOMAXPROCS(0), runtime.NumGoroutine())
    for i := 1; i <= score; i++ {
        for z := 1; z <= score; z++ {
        }
        if 0 == (i % 20000) {
            fmt.Printf("%f %6d=tid f(%s, %6d) // i=%d\n", time.Now().Sub(timeStart).Seconds(), syscall.Gettid(), from, score, i)
        }
    }
    fmt.Printf("%f %6d=tid f(%s, %6d) // leave in %f; .NumCPU()=%d .GOMAXPROCS(0)=%d .NumGoroutine()=%d\n", time.Now().Sub(timeStart).Seconds(), syscall.Gettid(), from, score, time.Now().Sub(t1).Seconds(), runtime.NumCPU(), runtime.GOMAXPROCS(0), runtime.NumGoroutine())
}   

func main() {
    t1 := time.Now()
    fmt.Printf("%f %6d=tid main() // enter; .NumCPU()=%d .GOMAXPROCS(0)=%d .NumGoroutine()=%d\n", time.Now().Sub(timeStart).Seconds(), syscall.Gettid(), runtime.NumCPU(), runtime.GOMAXPROCS(0), runtime.NumGoroutine())

    if _, exists := os.LookupEnv("USE_PPROF"); exists {
        fmt.Printf("%f %6d=tid main() // enabling pprof\n", time.Now().Sub(timeStart).Seconds(), syscall.Gettid())
        file, _ := os.Create("pprof.out")
        defer file.Close()
        pprof.StartCPUProfile(file)
        defer pprof.StopCPUProfile()
    }

    if _, exists := os.LookupEnv("USE_TRACE"); exists {
        fmt.Printf("%f %6d=tid main() // enabling trace\n", time.Now().Sub(timeStart).Seconds(), syscall.Gettid())
        file, _ := os.Create("trace.out")
        defer file.Close()
        trace.Start(file)
        defer trace.Stop()
    }

    fmt.Printf("%f %6d=tid main() // .GOMAXPROCS(0)=%d\n", time.Now().Sub(timeStart).Seconds(), syscall.Gettid(), runtime.GOMAXPROCS(0))
    goMaxProcsWanted, _ := strconv.Atoi(os.Getenv("GOMAXPROCS_WANTED"))
    runtime.GOMAXPROCS(goMaxProcsWanted)
    fmt.Printf("%f %6d=tid main() // .GOMAXPROCS(0)=%d\n", time.Now().Sub(timeStart).Seconds(), syscall.Gettid(), runtime.GOMAXPROCS(0))
    
    fmt.Printf("%f %6d=tid main() // sleeping\n", time.Now().Sub(timeStart).Seconds(), syscall.Gettid())
    time.Sleep(3 * time.Second)
    fmt.Printf("%f %6d=tid main() // waking\n", time.Now().Sub(timeStart).Seconds(), syscall.Gettid())
    
    f("direct", 120000)
    go f("foo", 100000)
    go f("bar",  80000)
    go f("baz",  60000)

    go func(msg string) {
        fmt.Printf("%f %6d=tid func(%s)\n", time.Now().Sub(timeStart).Seconds(), syscall.Gettid(), msg)
    }("going without loop")

    fmt.Printf("%f %6d=tid main() // type in word + hit return\n", time.Now().Sub(timeStart).Seconds(), syscall.Gettid())
    var input string
    fmt.Scanln(&input)
    fmt.Printf("%f %6d=tid main() // input=%s\n", time.Now().Sub(timeStart).Seconds(), syscall.Gettid(), input)
    fmt.Printf("%f %6d=tid main() // leave in %f; .NumCPU()=%d .GOMAXPROCS(0)=%d .NumGoroutine()=%d\n", time.Now().Sub(timeStart).Seconds(), syscall.Gettid(), time.Now().Sub(t1).Seconds(), runtime.NumCPU(), runtime.GOMAXPROCS(0), runtime.NumGoroutine())
}

// based on https://stackoverflow.com/questions/46628053/how-go-runtime-checks-if-goroutine-is-blocked

Experiment 1: Run on Linux 16 core box with GOMAXPROCS=3

  • Set GOMAXPROCS explicitly to 3, meaning that max 3 go-routines can run concurrently.
$ go build blocking.go && perl -e 'sleep 15; printf qq[word!\n];' | GOMAXPROCS_WANTED=3 ./blocking 2>&1 | perl -lane 'use Time::HiRes; if($_ ne $last){ if($dups > 1){ printf qq[%f x%u duplicates\n], $last_ts, $dups; } printf qq[%f %s\n], Time::HiRes::time(), $_; $dups = 1; }else{ $last_ts = Time::HiRes::time(); $dups ++; } $last=$_;' & sleep 2 ; ps -eLF | egrep "([b]locking|[P]ID)" ; sleep 4 ; ps -eLF | egrep "([b]locking|[P]ID)" ; sleep 3 ; ps -eLF | egrep "([b]locking|[P]ID)"
1622239372.823516 0.000001   6269=tid main() // enter; .NumCPU()=16 .GOMAXPROCS(0)=16 .NumGoroutine()=1
1622239372.823551 0.000073   6269=tid main() // .GOMAXPROCS(0)=16
1622239372.823555 0.000100   6269=tid main() // .GOMAXPROCS(0)=3
1622239372.823564 0.000102   6269=tid main() // sleeping
UID        PID  PPID   LWP  C NLWP    SZ   RSS PSR STIME TTY          TIME CMD
simon_h+  6269  6186  6269  0    5 175850 7028   6 22:02 pts/1    00:00:00 ./blocking
simon_h+  6269  6186  6271  0    5 175850 7028   1 22:02 pts/1    00:00:00 ./blocking
simon_h+  6269  6186  6272  0    5 175850 7028   0 22:02 pts/1    00:00:00 ./blocking
simon_h+  6269  6186  6273  0    5 175850 7028   4 22:02 pts/1    00:00:00 ./blocking
simon_h+  6269  6186  6274  0    5 175850 7028  13 22:02 pts/1    00:00:00 ./blocking
1622239375.820075 3.000230   6272=tid main() // waking
1622239375.820116 3.000258   6272=tid f(direct, 120000) // enter; .NumCPU()=16 .GOMAXPROCS(0)=3 .NumGoroutine()=1
1622239376.546180 3.726323   6272=tid f(direct, 120000) // i=20000
1622239377.271699 4.451874   6272=tid f(direct, 120000) // i=40000
1622239377.983622 5.163752   6272=tid f(direct, 120000) // i=60000
1622239378.709423 5.889558   6272=tid f(direct, 120000) // i=80000
UID        PID  PPID   LWP  C NLWP    SZ   RSS PSR STIME TTY          TIME CMD
simon_h+  6269  6186  6269  0    5 175850 7028   6 22:02 pts/1    00:00:00 ./blocking
simon_h+  6269  6186  6271  0    5 175850 7028   5 22:02 pts/1    00:00:00 ./blocking
simon_h+  6269  6186  6272 50    5 175850 7028   0 22:02 pts/1    00:00:03 ./blocking
simon_h+  6269  6186  6273  0    5 175850 7028   4 22:02 pts/1    00:00:00 ./blocking
simon_h+  6269  6186  6274  0    5 175850 7028  13 22:02 pts/1    00:00:00 ./blocking
1622239379.432804 6.612951   6272=tid f(direct, 120000) // i=100000
1622239380.154543 7.334697   6272=tid f(direct, 120000) // i=120000
1622239380.154591 7.334737   6272=tid f(direct, 120000) // leave in 4.334479; .NumCPU()=16 .GOMAXPROCS(0)=3 .NumGoroutine()=1
1622239380.154597 7.334769   6272=tid main() // type in word + hit return
1622239380.154689 7.334821   6269=tid f(bar,  80000) // enter; .NumCPU()=16 .GOMAXPROCS(0)=3 .NumGoroutine()=5
1622239380.154722 7.334852   6274=tid f(foo, 100000) // enter; .NumCPU()=16 .GOMAXPROCS(0)=3 .NumGoroutine()=5
1622239380.156020 7.336200   6273=tid func(going without loop)
1622239380.156043 7.336230   6273=tid f(baz,  60000) // enter; .NumCPU()=16 .GOMAXPROCS(0)=3 .NumGoroutine()=4
1622239380.515295 7.695442   6273=tid f(baz,  60000) // i=20000
1622239380.631283 7.811423   6274=tid f(bar,  80000) // i=20000
1622239380.750385 7.930509   6269=tid f(foo, 100000) // i=20000
1622239380.880283 8.059341   6269=tid f(baz,  60000) // i=40000
1622239381.121222 8.301370   6273=tid f(bar,  80000) // i=40000
1622239381.240409 8.420556   6269=tid f(baz,  60000) // i=60000
1622239381.240453 8.420593   6269=tid f(baz,  60000) // leave in 1.084364; .NumCPU()=16 .GOMAXPROCS(0)=3 .NumGoroutine()=4
1622239381.357653 8.537805   6273=tid f(foo, 100000) // i=40000
1622239381.597033 8.777167   6273=tid f(bar,  80000) // i=60000
UID        PID  PPID   LWP  C NLWP    SZ   RSS PSR STIME TTY          TIME CMD
simon_h+  6269  6186  6269 12    5 175850 7028   6 22:02 pts/1    00:00:01 ./blocking
simon_h+  6269  6186  6271  0    5 175850 7028   9 22:02 pts/1    00:00:00 ./blocking
simon_h+  6269  6186  6272 48    5 175850 7028   0 22:02 pts/1    00:00:04 ./blocking
simon_h+  6269  6186  6273 19    5 175850 7028   7 22:02 pts/1    00:00:01 ./blocking
simon_h+  6269  6186  6274 19    5 175850 7028  13 22:02 pts/1    00:00:01 ./blocking
1622239381.965264 9.145415   6273=tid f(foo, 100000) // i=60000
1622239382.092083 9.272224   6273=tid f(bar,  80000) // i=80000
1622239382.092124 9.272259   6273=tid f(bar,  80000) // leave in 1.937439; .NumCPU()=16 .GOMAXPROCS(0)=3 .NumGoroutine()=3
1622239382.560368 9.740509   6274=tid f(foo, 100000) // i=80000
1622239383.169018 10.349182   6274=tid f(foo, 100000) // i=100000
1622239383.169062 10.349215   6274=tid f(foo, 100000) // leave in 3.014363; .NumCPU()=16 .GOMAXPROCS(0)=3 .NumGoroutine()=2
1622239387.819274 14.999441   6272=tid main() // input=word!
1622239387.819318 14.999461   6272=tid main() // leave in 14.999461; .NumCPU()=16 .GOMAXPROCS(0)=3 .NumGoroutine()=1
  • Observation: Even though we set GOMAXPROCS to 3, ps always reports 5 LWPs for the process. Why?

Experiment 2: Run on Linux 16 core box with GOMAXPROCS=1

  • Set GOMAXPROCS explicitly to 1, meaning that max 1 go-routine can run at a time.
  • Observation: Even though we set GOMAXPROCS to 3, the last ps shows 4x threads chewing CPU. Why?
$ go build blocking.go && perl -e 'sleep 15; printf qq[word!\n];' | GOMAXPROCS_WANTED=1 ./blocking 2>&1 | perl -lane 'use Time::HiRes; if($_ ne $last){ if($dups > 1){ printf qq[%f x%u duplicates\n], $last_ts, $dups; } printf qq[%f %s\n], Time::HiRes::time(), $_; $dups = 1; }else{ $last_ts = Time::HiRes::time(); $dups ++; } $last=$_;' & sleep 2 ; ps -eLF | egrep "([b]locking|[P]ID)" ; sleep 4 ; ps -eLF | egrep "([b]locking|[P]ID)" ; sleep 3 ; ps -eLF | egrep "([b]locking|[P]ID)"
1622239518.800515 0.000001   6356=tid main() // enter; .NumCPU()=16 .GOMAXPROCS(0)=16 .NumGoroutine()=1
1622239518.800548 0.000037   6356=tid main() // .GOMAXPROCS(0)=16
1622239518.800557 0.000072   6356=tid main() // .GOMAXPROCS(0)=1
1622239518.800585 0.000076   6356=tid main() // sleeping
UID        PID  PPID   LWP  C NLWP    SZ   RSS PSR STIME TTY          TIME CMD
simon_h+  6356  6283  6356  0    5 175850 5788   9 22:05 pts/1    00:00:00 ./blocking
simon_h+  6356  6283  6358  0    5 175850 5788  15 22:05 pts/1    00:00:00 ./blocking
simon_h+  6356  6283  6359  0    5 175850 5788  10 22:05 pts/1    00:00:00 ./blocking
simon_h+  6356  6283  6360  0    5 175850 5788  11 22:05 pts/1    00:00:00 ./blocking
simon_h+  6356  6283  6361  0    5 175850 5788   6 22:05 pts/1    00:00:00 ./blocking
1622239521.796662 3.000204   6359=tid main() // waking
1622239521.796703 3.000229   6359=tid f(direct, 120000) // enter; .NumCPU()=16 .GOMAXPROCS(0)=1 .NumGoroutine()=1
1622239522.521093 3.724637   6359=tid f(direct, 120000) // i=20000
1622239523.235430 4.438922   6359=tid f(direct, 120000) // i=40000
1622239523.958976 5.162500   6359=tid f(direct, 120000) // i=60000
1622239524.683917 5.887451   6359=tid f(direct, 120000) // i=80000
UID        PID  PPID   LWP  C NLWP    SZ   RSS PSR STIME TTY          TIME CMD
simon_h+  6356  6283  6356  0    5 175850 5788  11 22:05 pts/1    00:00:00 ./blocking
simon_h+  6356  6283  6358  0    5 175850 5788   6 22:05 pts/1    00:00:00 ./blocking
simon_h+  6356  6283  6359 49    5 175850 5788   9 22:05 pts/1    00:00:02 ./blocking
simon_h+  6356  6283  6360  0    5 175850 5788  11 22:05 pts/1    00:00:00 ./blocking
simon_h+  6356  6283  6361  0    5 175850 5788   6 22:05 pts/1    00:00:00 ./blocking
1622239525.407411 6.610946   6359=tid f(direct, 120000) // i=100000
1622239526.119309 7.322831   6359=tid f(direct, 120000) // i=120000
1622239526.119351 7.322869   6359=tid f(direct, 120000) // leave in 4.322641; .NumCPU()=16 .GOMAXPROCS(0)=1 .NumGoroutine()=1
1622239526.119359 7.322890   6359=tid main() // type in word + hit return
1622239526.130854 7.334395   6356=tid func(going without loop)
1622239526.136426 7.334472   6361=tid f(foo, 100000) // enter; .NumCPU()=16 .GOMAXPROCS(0)=1 .NumGoroutine()=5
1622239526.146608 7.334523   6361=tid f(bar,  80000) // enter; .NumCPU()=16 .GOMAXPROCS(0)=1 .NumGoroutine()=5
1622239526.146673 7.334536   6361=tid f(baz,  60000) // enter; .NumCPU()=16 .GOMAXPROCS(0)=1 .NumGoroutine()=5
1622239527.226219 8.429745   6361=tid f(baz,  60000) // i=20000
1622239527.571185 8.774705   6361=tid f(bar,  80000) // i=20000
UID        PID  PPID   LWP  C NLWP    SZ   RSS PSR STIME TTY          TIME CMD
simon_h+  6356  6283  6356  0    5 175850 5788   7 22:05 pts/1    00:00:00 ./blocking
simon_h+  6356  6283  6358  0    5 175850 5788  12 22:05 pts/1    00:00:00 ./blocking
simon_h+  6356  6283  6359 48    5 175850 5788   9 22:05 pts/1    00:00:04 ./blocking
simon_h+  6356  6283  6360  0    5 175850 5788  11 22:05 pts/1    00:00:00 ./blocking
simon_h+  6356  6283  6361 18    5 175850 5788   6 22:05 pts/1    00:00:01 ./blocking
1622239527.908274 9.111738   6361=tid f(foo, 100000) // i=20000
1622239528.311557 9.515085   6361=tid f(baz,  60000) // i=40000
1622239529.021321 10.224857   6361=tid f(bar,  80000) // i=40000
1622239529.398300 10.601836   6361=tid f(baz,  60000) // i=60000
1622239529.398340 10.601869   6361=tid f(baz,  60000) // leave in 3.267334; .NumCPU()=16 .GOMAXPROCS(0)=1 .NumGoroutine()=4
1622239529.608226 10.811770   6361=tid f(foo, 100000) // i=40000
1622239530.115676 11.319215   6361=tid f(bar,  80000) // i=60000
1622239530.796553 12.000074   6361=tid f(foo, 100000) // i=60000
1622239531.101278 12.304796   6361=tid f(bar,  80000) // i=80000
1622239531.101320 12.304834   6361=tid f(bar,  80000) // leave in 4.970311; .NumCPU()=16 .GOMAXPROCS(0)=1 .NumGoroutine()=3
1622239531.540955 12.744472   6361=tid f(foo, 100000) // i=80000
1622239532.152082 13.355598   6361=tid f(foo, 100000) // i=100000
1622239532.152125 13.355641   6361=tid f(foo, 100000) // leave in 6.021169; .NumCPU()=16 .GOMAXPROCS(0)=1 .NumGoroutine()=2
1622239533.796099 14.999647   6359=tid main() // input=word!
1622239533.796149 14.999667   6359=tid main() // leave in 14.999667; .NumCPU()=16 .GOMAXPROCS(0)=1 .NumGoroutine()=1
  • Observation: Even though we set GOMAXPROCS to 1, ps always reports 5 LWPs for the process. Why?
  • Observation: Even though we set GOMAXPROCS to 1, the last ps shows 2x threads chewing CPU. Why?
    • Note: Total CPU usage does not go above 100% though. Maybe Golang switches LWPs but does not use them concurrently?
  • Observation: How do the non-cooperative foo, bar, and baz run 'concurrently' if GOMAXPROCS is 1?
  • It turns out that Go version 1.14 introduced asynchronous preemption::
Goroutines are now asynchronously preemptible. As a result, loops without function calls no longer potentially deadlock the scheduler or significantly delay garbage collection. This is supported on all platforms except  `windows/arm`,  `darwin/arm`,  `js/wasm`, and  `plan9/*`.
  • The original Golang asynchronous preemption proposal is described here [1]. And talked about here [2].

[1] https://go.googlesource.com/proposal/+/master/design/24543-non-cooperative-preemption.md [2] https://stackoverflow.com/questions/64113394/at-which-point-a-goroutine-can-yield

Experiment 3: Run on Linux 16 core box with GOMAXPROCS=1 & strace

  • Set GOMAXPROCS explicitly to 1, meaning that max 1 go-routine can run at a time.

  • Golang uses SIGURG is used to preempt functions at various points in execution, and these signals can be viewed by running the above code with strace:

$ go build blocking.go && perl -e 'sleep 15; printf qq[word!\n];' | GOMAXPROCS_WANTED=1 strace -f -e 'trace=!write,gettid,getpid,futex,tgkill,arch_prctl,sigaltstack,rt_sigprocmask,rt_sigaction,rt_sigreturn,nanosleep,epoll_pwait' ./blocking 2>&1 | perl -lane 'use Time::HiRes; if($_ ne $last){ if($dups > 1){ printf qq[%f x%u duplicates\n], $last_ts, $dups; } printf qq[%f %s\n], Time::HiRes::time(), $_; $dups = 1; }else{ $last_ts = Time::HiRes::time(); $dups ++; } $last=$_;' & sleep 2 ; ps -eLF | egrep "([b]locking|[P]ID)" ; sleep 4 ; ps -eLF | egrep "([b]locking|[P]ID)" ; sleep 3 ; ps -eLF | egrep "([b]locking|[P]ID)"
1622239615.195494 execve("./blocking", ["./blocking"], 0x7ffcd316c658 /* 24 vars */) = 0
1622239615.195529 sched_getaffinity(0, 8192, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]) = 64
1622239615.195544 openat(AT_FDCWD, "/sys/kernel/mm/transparent_hugepage/hpage_pmd_size", O_RDONLY) = 3
1622239615.195555 read(3, "2097152\n", 20)                = 8
1622239615.195560 close(3)                                = 0
1622239615.195568 uname({sysname="Linux", nodename="benchnet-dev-002", ...}) = 0
1622239615.195575 mmap(NULL, 262144, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f6f462f8000
1622239615.195581 mmap(NULL, 131072, PROT_NONE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f6f462d8000
1622239615.195587 mmap(NULL, 1048576, PROT_NONE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f6f461d8000
1622239615.195593 mmap(NULL, 8388608, PROT_NONE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f6f459d8000
1622239615.195599 mmap(NULL, 67108864, PROT_NONE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f6f419d8000
1622239615.195605 mmap(NULL, 536870912, PROT_NONE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f6f219d8000
1622239615.195611 mmap(0xc000000000, 67108864, PROT_NONE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0xc000000000
1622239615.195617 mmap(0xc000000000, 67108864, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0xc000000000
1622239615.195624 mmap(NULL, 33554432, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f6f1f9d8000
1622239615.195633 mmap(NULL, 2165768, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f6f1f7c7000
1622239615.195642 mmap(0x7f6f462d8000, 131072, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0x7f6f462d8000
1622239615.195651 mmap(0x7f6f46258000, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0x7f6f46258000
1622239615.195660 mmap(0x7f6f45dde000, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0x7f6f45dde000
1622239615.195670 mmap(0x7f6f43a08000, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0x7f6f43a08000
1622239615.195680 mmap(0x7f6f31b58000, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0x7f6f31b58000
1622239615.195689 mmap(NULL, 1048576, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f6f1f6c7000
1622239615.195699 mmap(NULL, 65536, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f6f1f6b7000
1622239615.195708 mmap(NULL, 65536, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f6f1f6a7000
1622239615.195717 clone(strace: Process 6459 attached
1622239615.195725 child_stack=0xc000062000, flags=CLONE_VM|CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD|CLONE_SYSVSEM) = 6459
1622239615.195734 [pid  6458] clone(strace: Process 6460 attached
1622239615.195742 child_stack=0xc000064000, flags=CLONE_VM|CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD|CLONE_SYSVSEM) = 6460
1622239615.195750 [pid  6458] clone(child_stack=0xc00005e000, flags=CLONE_VM|CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD|CLONE_SYSVSEM) = 6461
1622239615.195759 strace: Process 6461 attached
1622239615.195770 [pid  6460] mmap(NULL, 262144, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f6f1f667000
1622239615.195778 [pid  6460] clone(strace: Process 6462 attached
1622239615.195786 child_stack=0xc000092000, flags=CLONE_VM|CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD|CLONE_SYSVSEM) = 6462
1622239615.195795 [pid  6458] clone(child_stack=0xc000060000, flags=CLONE_VM|CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD|CLONE_SYSVSEM) = 6463
1622239615.195802 strace: Process 6463 attached
1622239615.195897 [pid  6458] readlinkat(AT_FDCWD, "/proc/self/exe", "/home/simon_hardy_francis_dapper"..., 128) = 72
1622239615.195925 [pid  6458] fcntl(0, F_GETFL)           = 0 (flags O_RDONLY)
1622239615.195986 [pid  6458] fcntl(1, F_GETFL)           = 0x1 (flags O_WRONLY)
1622239615.196013 [pid  6458] fcntl(2, F_GETFL)           = 0x1 (flags O_WRONLY)
1622239615.196169 0.000001   6458=tid main() // enter; .NumCPU()=16 .GOMAXPROCS(0)=16 .NumGoroutine()=1
1622239615.196212 0.000090   6458=tid main() // .GOMAXPROCS(0)=16
1622239615.196291 [pid  6458] mmap(NULL, 262144, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0 <unfinished ...>
1622239615.196326 [pid  6459] sched_yield( <unfinished ...>
1622239615.196336 [pid  6458] <... mmap resumed> )        = 0x7f6f1f627000
1622239615.196347 [pid  6459] <... sched_yield resumed> ) = 0
1622239615.196523 0.000400   6458=tid main() // .GOMAXPROCS(0)=1
1622239615.196623 0.000501   6462=tid main() // sleeping
1622239615.196664 [pid  6462] epoll_create1(EPOLL_CLOEXEC) = 3
1622239615.196705 [pid  6462] pipe2([4, 5], O_NONBLOCK|O_CLOEXEC) = 0
1622239615.196757 [pid  6462] epoll_ctl(3, EPOLL_CTL_ADD, 4, {EPOLLIN, {u32=5962384, u64=5962384}}) = 0
UID        PID  PPID   LWP  C NLWP    SZ   RSS PSR STIME TTY          TIME CMD
simon_h+  6454  6370  6454  0    1  1565  3320  10 22:06 pts/1    00:00:00 strace -f -e trace=!write,gettid,getpid,futex,tgkill,arch_prctl,sigaltstack,rt_sigprocmask,rt_sigaction,rt_sigreturn,nanosleep,epoll_pwait ./blocking
simon_h+  6458  6454  6458  0    6 175850 5852   6 22:06 pts/1    00:00:00 ./blocking
simon_h+  6458  6454  6459  0    6 175850 5852   9 22:06 pts/1    00:00:00 ./blocking
simon_h+  6458  6454  6460  0    6 175850 5852   7 22:06 pts/1    00:00:00 ./blocking
simon_h+  6458  6454  6461  0    6 175850 5852   0 22:06 pts/1    00:00:00 ./blocking
simon_h+  6458  6454  6462  0    6 175850 5852   3 22:06 pts/1    00:00:00 ./blocking
simon_h+  6458  6454  6463  0    6 175850 5852   5 22:06 pts/1    00:00:00 ./blocking
1622239618.197036 3.000820   6458=tid main() // waking
1622239618.197123 3.000954   6463=tid f(direct, 120000) // enter; .NumCPU()=16 .GOMAXPROCS(0)=1 .NumGoroutine()=1
1622239618.208377 [pid  6463] --- SIGURG {si_signo=SIGURG, si_code=SI_TKILL, si_pid=6458, si_uid=1054159318} ---
1622239618.919412 x36 duplicates
1622239618.922672 3.726441   6463=tid f(direct, 120000) // i=20000
1622239618.939843 [pid  6463] --- SIGURG {si_signo=SIGURG, si_code=SI_TKILL, si_pid=6458, si_uid=1054159318} ---
1622239619.635284 x35 duplicates
1622239619.636707 4.440469   6463=tid f(direct, 120000) // i=40000
1622239619.655789 [pid  6463] --- SIGURG {si_signo=SIGURG, si_code=SI_TKILL, si_pid=6458, si_uid=1054159318} ---
1622239620.351033 x35 duplicates
1622239620.359982 5.163726   6463=tid f(direct, 120000) // i=60000
1622239620.371477 [pid  6463] --- SIGURG {si_signo=SIGURG, si_code=SI_TKILL, si_pid=6458, si_uid=1054159318} ---
1622239621.067251 x35 duplicates
1622239621.084177 5.887915   6463=tid f(direct, 120000) // i=80000
1622239621.087738 [pid  6463] --- SIGURG {si_signo=SIGURG, si_code=SI_TKILL, si_pid=6458, si_uid=1054159318} ---
UID        PID  PPID   LWP  C NLWP    SZ   RSS PSR STIME TTY          TIME CMD
simon_h+  6454  6370  6454  0    1  1565  3320   3 22:06 pts/1    00:00:00 strace -f -e trace=!write,gettid,getpid,futex,tgkill,arch_prctl,sigaltstack,rt_sigprocmask,rt_sigaction,rt_sigreturn,nanosleep,epoll_pwait ./blocking
simon_h+  6458  6454  6458  0    6 175850 5852   6 22:06 pts/1    00:00:00 ./blocking
simon_h+  6458  6454  6459  0    6 175850 5852   0 22:06 pts/1    00:00:00 ./blocking
simon_h+  6458  6454  6460  0    6 175850 5852   7 22:06 pts/1    00:00:00 ./blocking
simon_h+  6458  6454  6461  0    6 175850 5852   0 22:06 pts/1    00:00:00 ./blocking
simon_h+  6458  6454  6462  0    6 175850 5852   3 22:06 pts/1    00:00:00 ./blocking
simon_h+  6458  6454  6463 49    6 175850 5852   5 22:06 pts/1    00:00:02 ./blocking
1622239621.803211 x36 duplicates
1622239621.807145 6.610914   6463=tid f(direct, 120000) // i=100000
1622239621.823699 [pid  6463] --- SIGURG {si_signo=SIGURG, si_code=SI_TKILL, si_pid=6458, si_uid=1054159318} ---
1622239622.520434 x35 duplicates
1622239622.522924 7.326583   6463=tid f(direct, 120000) // i=120000
1622239622.522990 7.326868   6463=tid f(direct, 120000) // leave in 4.325926; .NumCPU()=16 .GOMAXPROCS(0)=1 .NumGoroutine()=1
1622239622.523041 7.326937   6463=tid main() // type in word + hit return
1622239622.540928 [pid  6463] read(0, 7.344752   6462=tid func(going without loop)
1622239622.540969 7.344825   6462=tid f(foo, 100000) // enter; .NumCPU()=16 .GOMAXPROCS(0)=1 .NumGoroutine()=4
1622239622.562307  <unfinished ...>
1622239622.562346 [pid  6462] --- SIGURG {si_signo=SIGURG, si_code=SI_TKILL, si_pid=6458, si_uid=1054159318} ---
1622239622.562451 7.366261   6462=tid f(bar,  80000) // enter; .NumCPU()=16 .GOMAXPROCS(0)=1 .NumGoroutine()=4
1622239622.577875 [pid  6462] --- SIGURG {si_signo=SIGURG, si_code=SI_TKILL, si_pid=6458, si_uid=1054159318} ---
1622239622.578027 7.381881   6462=tid f(baz,  60000) // enter; .NumCPU()=16 .GOMAXPROCS(0)=1 .NumGoroutine()=4
1622239622.598332 [pid  6462] --- SIGURG {si_signo=SIGURG, si_code=SI_TKILL, si_pid=6458, si_uid=1054159318} ---
1622239623.643098 x52 duplicates
1622239623.659040 8.462803   6462=tid f(baz,  60000) // i=20000
1622239623.663517 [pid  6462] --- SIGURG {si_signo=SIGURG, si_code=SI_TKILL, si_pid=6458, si_uid=1054159318} ---
1622239623.991278 x17 duplicates
1622239624.007362 8.811113   6462=tid f(bar,  80000) // i=20000
1622239624.011778 [pid  6462] --- SIGURG {si_signo=SIGURG, si_code=SI_TKILL, si_pid=6458, si_uid=1054159318} ---
UID        PID  PPID   LWP  C NLWP    SZ   RSS PSR STIME TTY          TIME CMD
simon_h+  6454  6370  6454  0    1  1565  3320   5 22:06 pts/1    00:00:00 strace -f -e trace=!write,gettid,getpid,futex,tgkill,arch_prctl,sigaltstack,rt_sigprocmask,rt_sigaction,rt_sigreturn,nanosleep,epoll_pwait ./blocking
simon_h+  6458  6454  6458  0    6 175850 5852   6 22:06 pts/1    00:00:00 ./blocking
simon_h+  6458  6454  6459  0    6 175850 5852   4 22:06 pts/1    00:00:00 ./blocking
simon_h+  6458  6454  6460  0    6 175850 5852   7 22:06 pts/1    00:00:00 ./blocking
simon_h+  6458  6454  6461  0    6 175850 5852   0 22:06 pts/1    00:00:00 ./blocking
simon_h+  6458  6454  6462 18    6 175850 5852   2 22:06 pts/1    00:00:01 ./blocking
simon_h+  6458  6454  6463 47    6 175850 5852   8 22:06 pts/1    00:00:04 ./blocking
1622239624.278122 x14 duplicates
1622239624.284524 9.088293   6462=tid f(foo, 100000) // i=20000
1622239624.298615 [pid  6462] --- SIGURG {si_signo=SIGURG, si_code=SI_TKILL, si_pid=6458, si_uid=1054159318} ---
1622239624.769219 x24 duplicates
1622239624.781013 9.584756   6462=tid f(baz,  60000) // i=40000
1622239624.789645 [pid  6462] --- SIGURG {si_signo=SIGURG, si_code=SI_TKILL, si_pid=6458, si_uid=1054159318} ---
1622239625.485786 x35 duplicates
1622239625.494236 10.297982   6462=tid f(bar,  80000) // i=40000
1622239625.506155 [pid  6462] --- SIGURG {si_signo=SIGURG, si_code=SI_TKILL, si_pid=6458, si_uid=1054159318} ---
1622239625.894760 x20 duplicates
1622239625.898915 10.702669   6462=tid f(baz,  60000) // i=60000
1622239625.898934 10.702797   6462=tid f(baz,  60000) // leave in 3.320944; .NumCPU()=16 .GOMAXPROCS(0)=1 .NumGoroutine()=4
1622239625.915290 [pid  6462] --- SIGURG {si_signo=SIGURG, si_code=SI_TKILL, si_pid=6458, si_uid=1054159318} ---
1622239625.976798 x4 duplicates
1622239625.978271 10.782071   6462=tid f(foo, 100000) // i=40000
1622239625.997289 [pid  6462] --- SIGURG {si_signo=SIGURG, si_code=SI_TKILL, si_pid=6458, si_uid=1054159318} ---
1622239626.570206 x29 duplicates
1622239626.588907 11.392650   6462=tid f(bar,  80000) // i=60000
1622239626.590635 [pid  6462] --- SIGURG {si_signo=SIGURG, si_code=SI_TKILL, si_pid=6458, si_uid=1054159318} ---
1622239627.163053 x29 duplicates
1622239627.173564 11.977313   6462=tid f(foo, 100000) // i=60000
1622239627.183497 [pid  6462] --- SIGURG {si_signo=SIGURG, si_code=SI_TKILL, si_pid=6458, si_uid=1054159318} ---
1622239627.552142 x19 duplicates
1622239627.562964 12.366691   6462=tid f(bar,  80000) // i=80000
1622239627.562975 12.366820   6462=tid f(bar,  80000) // leave in 5.000588; .NumCPU()=16 .GOMAXPROCS(0)=1 .NumGoroutine()=3
1622239627.582762 [pid  6462] --- SIGURG {si_signo=SIGURG, si_code=SI_TKILL, si_pid=6458, si_uid=1054159318} ---
1622239627.951767 x19 duplicates
1622239627.968117 12.771858   6462=tid f(foo, 100000) // i=80000
1622239627.972193 [pid  6462] --- SIGURG {si_signo=SIGURG, si_code=SI_TKILL, si_pid=6458, si_uid=1054159318} ---
1622239628.566548 x30 duplicates
1622239628.577700 13.381432   6462=tid f(foo, 100000) // i=100000
1622239628.577750 13.381596   6462=tid f(foo, 100000) // leave in 6.036797; .NumCPU()=16 .GOMAXPROCS(0)=1 .NumGoroutine()=2
1622239630.189976 [pid  6463] <... read resumed> "w", 1)  = 1
1622239630.190178 [pid  6463] read(0, "o", 1)             = 1
1622239630.190212 [pid  6463] read(0, "r", 1)             = 1
1622239630.190339 [pid  6462] read(0, "d", 1)             = 1
1622239630.190381 [pid  6462] read(0, "!", 1)             = 1
1622239630.190423 [pid  6462] read(0, "\n", 1)            = 1
1622239630.190500 14.994351   6462=tid main() // input=word!
1622239630.190565 14.994430   6462=tid main() // leave in 14.994457; .NumCPU()=16 .GOMAXPROCS(0)=1 .NumGoroutine()=1
1622239630.190665 [pid  6462] exit_group(0)               = ?
1622239630.190714 [pid  6463] +++ exited with 0 +++
1622239630.190763 [pid  6462] +++ exited with 0 +++
1622239630.190846 [pid  6461] +++ exited with 0 +++
1622239630.190853 [pid  6460] +++ exited with 0 +++
1622239630.190982 [pid  6459] +++ exited with 0 +++
1622239630.190991 +++ exited with 0 +++

Experiment 4: Run on Linux 16 core box with GOMAXPROCS=1 & strace & GODEBUG=asyncpreemptoff=1

  • Re-running with GODEBUG=asyncpreemptoff=1 [1] to switch off async preemption:
$ go build blocking.go && perl -e 'sleep 15; printf qq[word!\n];' | GODEBUG=asyncpreemptoff=1 GOMAXPROCS_WANTED=1 strace -f -e 'trace=!write,gettid,getpid,futex,tgkill,arch_prctl,sigaltstack,rt_sigprocmask,rt_sigaction,rt_sigreturn,nanosleep,epoll_pwait' ./blocking 2>&1 | perl -lane 'use Time::HiRes; if($_ ne $last){ if($dups > 1){ printf qq[%f x%u duplicates\n], $last_ts, $dups; } printf qq[%f %s\n], Time::HiRes::time(), $_; $dups = 1; }else{ $last_ts = Time::HiRes::time(); $dups ++; } $last=$_;' & sleep 2 ; ps -eLF | egrep "([b]locking|[P]ID)" ; sleep 4 ; ps -eLF | egrep "([b]locking|[P]ID)" ; sleep 3 ; ps -eLF | egrep "([b]locking|[P]ID)"
1622239745.443929 execve("./blocking", ["./blocking"], 0x7fffc7bd8f28 /* 25 vars */) = 0
1622239745.443965 sched_getaffinity(0, 8192, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]) = 64
1622239745.443970 openat(AT_FDCWD, "/sys/kernel/mm/transparent_hugepage/hpage_pmd_size", O_RDONLY) = 3
1622239745.443973 read(3, "2097152\n", 20)                = 8
1622239745.443977 close(3)                                = 0
1622239745.443986 uname({sysname="Linux", nodename="benchnet-dev-002", ...}) = 0
1622239745.443991 mmap(NULL, 262144, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7fa4eb02c000
1622239745.444003 mmap(NULL, 131072, PROT_NONE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7fa4eb00c000
1622239745.444034 mmap(NULL, 1048576, PROT_NONE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7fa4eaf0c000
1622239745.444044 mmap(NULL, 8388608, PROT_NONE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7fa4ea70c000
1622239745.444053 mmap(NULL, 67108864, PROT_NONE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7fa4e670c000
1622239745.444063 mmap(NULL, 536870912, PROT_NONE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7fa4c670c000
1622239745.444070 mmap(0xc000000000, 67108864, PROT_NONE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0xc000000000
1622239745.444078 mmap(0xc000000000, 67108864, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0xc000000000
1622239745.444087 mmap(NULL, 33554432, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7fa4c470c000
1622239745.444095 mmap(NULL, 2165768, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7fa4c44fb000
1622239745.444103 mmap(0x7fa4eb00c000, 131072, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0x7fa4eb00c000
1622239745.444113 mmap(0x7fa4eaf8c000, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0x7fa4eaf8c000
1622239745.444123 mmap(0x7fa4eab12000, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0x7fa4eab12000
1622239745.444133 mmap(0x7fa4e873c000, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0x7fa4e873c000
1622239745.444143 mmap(0x7fa4d688c000, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0x7fa4d688c000
1622239745.444152 mmap(NULL, 1048576, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7fa4c43fb000
1622239745.444162 mmap(NULL, 65536, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7fa4c43eb000
1622239745.444172 mmap(NULL, 65536, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7fa4c43db000
1622239745.444185 clone(strace: Process 6553 attached
1622239745.444200 child_stack=0xc000064000, flags=CLONE_VM|CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD|CLONE_SYSVSEM) = 6553
1622239745.444263 [pid  6552] clone(child_stack=0xc000066000, flags=CLONE_VM|CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD|CLONE_SYSVSEM) = 6554
1622239745.444276 strace: Process 6554 attached
1622239745.444382 [pid  6552] clone(strace: Process 6555 attached
1622239745.444404 child_stack=0xc000060000, flags=CLONE_VM|CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD|CLONE_SYSVSEM) = 6555
1622239745.444533 [pid  6554] mmap(NULL, 262144, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7fa4c439b000
1622239745.444637 [pid  6554] clone( <unfinished ...>
1622239745.444663 [pid  6552] readlinkat(AT_FDCWD, "/proc/self/exe",  <unfinished ...>
1622239745.444675 [pid  6554] <... clone resumed> child_stack=0xc000092000, flags=CLONE_VM|CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD|CLONE_SYSVSEM) = 6556
1622239745.444686 strace: Process 6556 attached
1622239745.444705 [pid  6552] <... readlinkat resumed> "/home/simon_hardy_francis_dapper"..., 128) = 72
1622239745.444726 [pid  6552] fcntl(0, F_GETFL)           = 0 (flags O_RDONLY)
1622239745.444757 [pid  6552] mmap(NULL, 262144, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7fa4c435b000
1622239745.444784 [pid  6552] fcntl(1, F_GETFL)           = 0x1 (flags O_WRONLY)
1622239745.444803 [pid  6552] fcntl(2, F_GETFL)           = 0x1 (flags O_WRONLY)
1622239745.444958 0.000001   6552=tid main() // enter; .NumCPU()=16 .GOMAXPROCS(0)=16 .NumGoroutine()=1
1622239745.444994 0.000079   6552=tid main() // .GOMAXPROCS(0)=16
1622239745.445106 0.000162   6552=tid main() // .GOMAXPROCS(0)=1
1622239745.445147 0.000234   6552=tid main() // sleeping
1622239745.445186 [pid  6552] epoll_create1(EPOLL_CLOEXEC) = 3
1622239745.445234 [pid  6552] pipe2([4, 5], O_NONBLOCK|O_CLOEXEC) = 0
1622239745.445254 [pid  6552] epoll_ctl(3, EPOLL_CTL_ADD, 4, {EPOLLIN, {u32=5962384, u64=5962384}}) = 0
UID        PID  PPID   LWP  C NLWP    SZ   RSS PSR STIME TTY          TIME CMD
simon_h+  6548  6475  6548  0    1  1565  3428   6 22:09 pts/1    00:00:00 strace -f -e trace=!write,gettid,getpid,futex,tgkill,arch_prctl,sigaltstack,rt_sigprocmask,rt_sigaction,rt_sigreturn,nanosleep,epoll_pwait ./blocking
simon_h+  6552  6548  6552  0    5 175850 7028   7 22:09 pts/1    00:00:00 ./blocking
simon_h+  6552  6548  6553  0    5 175850 7028   1 22:09 pts/1    00:00:00 ./blocking
simon_h+  6552  6548  6554  0    5 175850 7028   4 22:09 pts/1    00:00:00 ./blocking
simon_h+  6552  6548  6555  0    5 175850 7028  11 22:09 pts/1    00:00:00 ./blocking
simon_h+  6552  6548  6556  0    5 175850 7028   8 22:09 pts/1    00:00:00 ./blocking
1622239748.445595 3.000587   6554=tid main() // waking
1622239748.445705 3.000727   6556=tid f(direct, 120000) // enter; .NumCPU()=16 .GOMAXPROCS(0)=1 .NumGoroutine()=1
1622239749.171289 3.726181   6556=tid f(direct, 120000) // i=20000
1622239749.894626 4.449571   6556=tid f(direct, 120000) // i=40000
1622239750.620029 5.175008   6556=tid f(direct, 120000) // i=60000
1622239751.334014 5.888957   6556=tid f(direct, 120000) // i=80000
UID        PID  PPID   LWP  C NLWP    SZ   RSS PSR STIME TTY          TIME CMD
simon_h+  6548  6475  6548  0    1  1565  3428  14 22:09 pts/1    00:00:00 strace -f -e trace=!write,gettid,getpid,futex,tgkill,arch_prctl,sigaltstack,rt_sigprocmask,rt_sigaction,rt_sigreturn,nanosleep,epoll_pwait ./blocking
simon_h+  6552  6548  6552  0    5 175850 7028   7 22:09 pts/1    00:00:00 ./blocking
simon_h+  6552  6548  6553  0    5 175850 7028   1 22:09 pts/1    00:00:00 ./blocking
simon_h+  6552  6548  6554  0    5 175850 7028   4 22:09 pts/1    00:00:00 ./blocking
simon_h+  6552  6548  6555  0    5 175850 7028  11 22:09 pts/1    00:00:00 ./blocking
simon_h+  6552  6548  6556 49    5 175850 7028   8 22:09 pts/1    00:00:02 ./blocking
1622239752.058552 6.613503   6556=tid f(direct, 120000) // i=100000
1622239752.780142 7.335125   6556=tid f(direct, 120000) // i=120000
1622239752.780180 7.335231   6556=tid f(direct, 120000) // leave in 4.334531; .NumCPU()=16 .GOMAXPROCS(0)=1 .NumGoroutine()=1
1622239752.780315 7.335344   6556=tid main() // type in word + hit return
1622239752.799064 [pid  6556] read(0, 7.354051   6552=tid func(going without loop)
1622239752.799373 7.354321   6554=tid f(foo, 100000) // enter; .NumCPU()=16 .GOMAXPROCS(0)=1 .NumGoroutine()=5
1622239753.410394 7.354389   6554=tid f(bar,  80000) // enter; .NumCPU()=16 .GOMAXPROCS(0)=1 .NumGoroutine()=5
1622239753.885670 7.354435   6554=tid f(baz,  60000) // enter; .NumCPU()=16 .GOMAXPROCS(0)=1 .NumGoroutine()=5
1622239754.250720 8.805672   6554=tid f(foo, 100000) // i=20000
UID        PID  PPID   LWP  C NLWP    SZ   RSS PSR STIME TTY          TIME CMD
simon_h+  6548  6475  6548  0    1  1565  3428   8 22:09 pts/1    00:00:00 strace -f -e trace=!write,gettid,getpid,futex,tgkill,arch_prctl,sigaltstack,rt_sigprocmask,rt_sigaction,rt_sigreturn,nanosleep,epoll_pwait ./blocking
simon_h+  6552  6548  6552  0    5 175850 7028   7 22:09 pts/1    00:00:00 ./blocking
simon_h+  6552  6548  6553  0    5 175850 7028   5 22:09 pts/1    00:00:00 ./blocking
simon_h+  6552  6548  6554 18    5 175850 7028   4 22:09 pts/1    00:00:01 ./blocking
simon_h+  6552  6548  6555  0    5 175850 7028  11 22:09 pts/1    00:00:00 ./blocking
simon_h+  6552  6548  6556 48    5 175850 7028   8 22:09 pts/1    00:00:04 ./blocking
simon_hardy_francis_dapperlabs_c@benchnet-dev-002:~/20210526-golang-examples$ 1622239754.845221 9.400169   6554=tid f(bar,  80000) // i=20000
1622239755.331816 9.886732   6554=tid f(baz,  60000) // i=20000
1622239755.689428 10.244423   6554=tid f(foo, 100000) // i=40000
1622239756.291424 10.846358   6554=tid f(bar,  80000) // i=40000
1622239756.779627 11.334555   6554=tid f(baz,  60000) // i=40000
1622239757.145514 11.700461   6554=tid f(foo, 100000) // i=60000
1622239757.739752 12.294703   6554=tid f(bar,  80000) // i=60000
1622239758.225962 12.780897   6554=tid f(baz,  60000) // i=60000
1622239758.226009 12.781046   6554=tid f(baz,  60000) // leave in 5.426639; .NumCPU()=16 .GOMAXPROCS(0)=1 .NumGoroutine()=4
1622239758.226070 12.781111   6554=tid f(foo, 100000) // i=80000
1622239758.828281 13.383283   6554=tid f(bar,  80000) // i=80000
1622239758.828323 13.383381   6554=tid f(bar,  80000) // leave in 6.029004; .NumCPU()=16 .GOMAXPROCS(0)=1 .NumGoroutine()=3
1622239758.828331 13.383416   6554=tid f(foo, 100000) // i=100000
1622239758.828372 13.383450   6554=tid f(foo, 100000) // leave in 6.029141; .NumCPU()=16 .GOMAXPROCS(0)=1 .NumGoroutine()=2
1622239760.439469 "w", 1)             = 1
1622239760.439634 [pid  6556] read(0, "o", 1)             = 1
1622239760.439667 [pid  6556] read(0, "r", 1)             = 1
1622239760.439704 [pid  6556] read(0, "d", 1)             = 1
1622239760.439790 [pid  6556] read(0, "!", 1)             = 1
1622239760.439846 [pid  6556] read(0, "\n", 1)            = 1
1622239760.439933 14.994994   6556=tid main() // input=word!
1622239760.439995 14.995058   6556=tid main() // leave in 14.995086; .NumCPU()=16 .GOMAXPROCS(0)=1 .NumGoroutine()=1
1622239760.440053 [pid  6556] exit_group(0)               = ?
1622239760.440111 [pid  6556] +++ exited with 0 +++
1622239760.440121 [pid  6555] +++ exited with 0 +++
1622239760.440131 [pid  6554] +++ exited with 0 +++
1622239760.440340 [pid  6553] +++ exited with 0 +++
1622239760.440354 +++ exited with 0 +++
  • Observation: As expected, strace no longer shows the SIGURG signals used for async preemption.
  • Observation: Even without async preemption then go-routines still appear to context switch. Presumably this happens due to the fmt.Printf()?

[1] https://medium.com/a-journey-with-go/go-asynchronous-preemption-b5194227371c

Comparing function wall-clock times for foo, bar, & baz

# Experiment 1: Run on Linux 16 core box with `GOMAXPROCS=3`
1622239381.240453 8.420593   6269=tid f(baz,  60000) // leave in 1.084364; .NumCPU()=16 .GOMAXPROCS(0)=3 .NumGoroutine()=4
1622239382.092124 9.272259   6273=tid f(bar,  80000) // leave in 1.937439; .NumCPU()=16 .GOMAXPROCS(0)=3 .NumGoroutine()=3
1622239383.169062 10.349215   6274=tid f(foo, 100000) // leave in 3.014363; .NumCPU()=16 .GOMAXPROCS(0)=3 .NumGoroutine()=2

# Experiment 2: Run on Linux 16 core box with `GOMAXPROCS=1`
1622239529.398340 10.601869   6361=tid f(baz,  60000) // leave in 3.267334; .NumCPU()=16 .GOMAXPROCS(0)=1 .NumGoroutine()=4
1622239531.101320 12.304834   6361=tid f(bar,  80000) // leave in 4.970311; .NumCPU()=16 .GOMAXPROCS(0)=1 .NumGoroutine()=3
1622239532.152125 13.355641   6361=tid f(foo, 100000) // leave in 6.021169; .NumCPU()=16 .GOMAXPROCS(0)=1 .NumGoroutine()=2

# Experiment 3: Run on Linux 16 core box with `GOMAXPROCS=1` & `strace`
1622239625.898934 10.702797   6462=tid f(baz,  60000) // leave in 3.320944; .NumCPU()=16 .GOMAXPROCS(0)=1 .NumGoroutine()=4
1622239627.562975 12.366820   6462=tid f(bar,  80000) // leave in 5.000588; .NumCPU()=16 .GOMAXPROCS(0)=1 .NumGoroutine()=3
1622239628.577750 13.381596   6462=tid f(foo, 100000) // leave in 6.036797; .NumCPU()=16 .GOMAXPROCS(0)=1 .NumGoroutine()=2

# Experiment 4: Run on Linux 16 core box with `GOMAXPROCS=1` & `strace` & `GODEBUG=asyncpreemptoff=1`
1622239758.226009 12.781046   6554=tid f(baz,  60000) // leave in 5.426639; .NumCPU()=16 .GOMAXPROCS(0)=1 .NumGoroutine()=4
1622239758.828323 13.383381   6554=tid f(bar,  80000) // leave in 6.029004; .NumCPU()=16 .GOMAXPROCS(0)=1 .NumGoroutine()=3
1622239758.828372 13.383450   6554=tid f(foo, 100000) // leave in 6.029141; .NumCPU()=16 .GOMAXPROCS(0)=1 .NumGoroutine()=2
  • Observation: Experiments 2 & 3 have the same timings, and are the same experiments except for strace.
  • Observation: Experiement 1 shows the fastest execution time for each go-routine because they each have their own LWP to run on, with on switching to other go-routines or async preemption.
  • Observation: Experiment 1 shows the go-routines running in parallel, but their total time is approx 1 + 2 + 3 = 6 seconds, which is -- as expected -- the time of the longest Go-routine if GOMAXPROCS=1.
  • Observation: Likewise, Experiment 1 is -- as expected -- 3 seconds faster finishing the slowest go-routine due to running the go-routines in parallel on different LWPs.
  • Observation: Experiment 4 shows the most distorted / misleading wall-clock times -- only ~ half a second between the fastest and slowest go-routines -- and is not proportionate to the workload of each go-routine.
  • Observation: Experiments 2 & 3 show misleading wall-clock times but due to async preemption the times are at least proportionate to the workload of each go-routine.
  • Question: The experiments measure the elapsed wall-clock time which ends up different depending upon how much switching goes on between go-routines on a particular LWP. For experiments 2 to 4, how can the code above be modified to record additionally the wall-clock time spent for the particular function excluding time used in other functions (not sure if a facility like this exists for Golang?), e.g. the following desired output if the code changes existed:
# Experiment 3: Run on Linux 16 core box with `GOMAXPROCS=1` & `strace`
1622239625.898934 10.702797   6462=tid f(baz,  60000) // leave in 3.320944, 1.084364 used; ...
1622239627.562975 12.366820   6462=tid f(bar,  80000) // leave in 5.000588, 1.937439 used; 
1622239628.577750 13.381596   6462=tid f(foo, 100000) // leave in 6.036797, 3.014363 used; 

Googling for a solution on how to measure both times

  • Searching Google for "golang measure execution time" only seems to dish up link showing the total wall-clock time including all other go-routines, but not the wall-clock time excluding all other go-routines: [1] [2] [3] [4] [5] [6] [7] [8]
  • Not sure what to Google for... or if a solution exists.

[1] https://stackoverflow.com/questions/45766572/is-there-an-efficient-way-to-calculate-execution-time-in-golang/45766707 [2] https://gist.github.com/denisbrodbeck/6f5813681dfe049b1e5392f978738833 [3] https://flaviocopes.com/golang-measure-time/ [4] https://www.admfactory.com/how-to-measure-execution-time-in-golang/ [5] https://programming.guide/go/measure-execution-time.html [6] https://golangcode.com/function-execution-time/ [7] https://www.socketloop.com/tutorials/golang-measure-execution-time-for-a-function [8] https://medium.com/swlh/easy-guide-to-latency-measurement-in-golang-38c3297ebbd2 [9] https://forum.golangbridge.org/t/measure-execution-time-of-a-fast-operation/16313

Does the Golang tool pprof profiler help? No?

  • Re-run experiment 1 but enable the profiler resulting in pprof.out:
$ go build blocking.go && perl -e 'sleep 15; printf qq[word!\n];' | USE_PPROF=1 GOMAXPROCS_WANTED=3 ./blocking ; wc --bytes pprof.out
0.000001  23998=tid main() // enter; .NumCPU()=16 .GOMAXPROCS(0)=16 .NumGoroutine()=1
0.000052  23998=tid main() // enabling pprof
0.000559  23998=tid main() // .GOMAXPROCS(0)=16
0.000607  23998=tid main() // .GOMAXPROCS(0)=3
0.000613  23998=tid main() // sleeping
3.000816  23998=tid main() // waking
3.000859  23998=tid f(direct, 120000) // enter; .NumCPU()=16 .GOMAXPROCS(0)=3 .NumGoroutine()=2
3.713033  23998=tid f(direct, 120000) // i=20000
4.425492  23998=tid f(direct, 120000) // i=40000
5.136554  23998=tid f(direct, 120000) // i=60000
5.847794  23998=tid f(direct, 120000) // i=80000
6.559673  23998=tid f(direct, 120000) // i=100000
7.270959  23998=tid f(direct, 120000) // i=120000
7.271007  23998=tid f(direct, 120000) // leave in 4.270148; .NumCPU()=16 .GOMAXPROCS(0)=3 .NumGoroutine()=2
7.271036  23998=tid main() // type in word + hit return
7.271079  24000=tid f(bar,  80000) // enter; .NumCPU()=16 .GOMAXPROCS(0)=3 .NumGoroutine()=6
7.271113  24001=tid f(foo, 100000) // enter; .NumCPU()=16 .GOMAXPROCS(0)=3 .NumGoroutine()=6
7.277925  24003=tid func(going without loop)
7.277969  24003=tid f(baz,  60000) // enter; .NumCPU()=16 .GOMAXPROCS(0)=3 .NumGoroutine()=5
7.633914  24000=tid f(baz,  60000) // i=20000
7.745805  24000=tid f(bar,  80000) // i=20000
7.864593  24000=tid f(foo, 100000) // i=20000
7.990347  24003=tid f(baz,  60000) // i=40000
8.220914  24003=tid f(bar,  80000) // i=40000
8.346848  24003=tid f(baz,  60000) // i=60000
8.346876  24003=tid f(baz,  60000) // leave in 1.068908; .NumCPU()=16 .GOMAXPROCS(0)=3 .NumGoroutine()=5
8.458698  24000=tid f(foo, 100000) // i=40000
8.696054  24000=tid f(bar,  80000) // i=60000
9.052872  24000=tid f(foo, 100000) // i=60000
9.171187  24001=tid f(bar,  80000) // i=80000
9.171234  24001=tid f(bar,  80000) // leave in 1.900155; .NumCPU()=16 .GOMAXPROCS(0)=3 .NumGoroutine()=4
9.645728  24000=tid f(foo, 100000) // i=80000
10.238577  24000=tid f(foo, 100000) // i=100000
10.238622  24000=tid f(foo, 100000) // leave in 2.967510; .NumCPU()=16 .GOMAXPROCS(0)=3 .NumGoroutine()=3
15.000055  23998=tid main() // input=word!
15.000091  23998=tid main() // leave in 15.000090; .NumCPU()=16 .GOMAXPROCS(0)=3 .NumGoroutine()=2
450 pprof.out

$ go tool pprof -top pprof.out
File: blocking
Type: cpu
Duration: 15s, Total samples = 9.69s (64.60%)
Showing nodes accounting for 9.67s, 99.79% of 9.69s total
Dropped 1 node (cum <= 0.05s)
      flat  flat%   sum%        cum   cum%
     9.67s 99.79% 99.79%      9.69s   100%  main.f
         0     0% 99.79%      4.27s 44.07%  main.main
         0     0% 99.79%      4.27s 44.07%  runtime.main
  • Observation: The flat 9.67s reflects the total accumulated time for function f().

  • Re-run experiment 2 but enable the profiler resulting in pprof.out:

$ go build blocking.go && perl -e 'sleep 15; printf qq[word!\n];' | USE_PPROF=1 GOMAXPROCS_WANTED=1 ./blocking ; wc --bytes pprof.out
0.000018  23806=tid main() // enter; .NumCPU()=16 .GOMAXPROCS(0)=16 .NumGoroutine()=1
0.000220  23806=tid main() // enabling pprof
0.000342  23806=tid main() // .GOMAXPROCS(0)=16
0.000429  23806=tid main() // .GOMAXPROCS(0)=1
0.000437  23806=tid main() // sleeping
3.000628  23808=tid main() // waking
3.000666  23808=tid f(direct, 120000) // enter; .NumCPU()=16 .GOMAXPROCS(0)=1 .NumGoroutine()=2
3.713037  23808=tid f(direct, 120000) // i=20000
4.424750  23808=tid f(direct, 120000) // i=40000
5.137150  23808=tid f(direct, 120000) // i=60000
5.849831  23808=tid f(direct, 120000) // i=80000
6.561112  23808=tid f(direct, 120000) // i=100000
7.273245  23808=tid f(direct, 120000) // i=120000
7.273292  23808=tid f(direct, 120000) // leave in 4.272627; .NumCPU()=16 .GOMAXPROCS(0)=1 .NumGoroutine()=2
7.273310  23808=tid main() // type in word + hit return
7.286434  23809=tid func(going without loop)
7.286485  23809=tid f(foo, 100000) // enter; .NumCPU()=16 .GOMAXPROCS(0)=1 .NumGoroutine()=5
7.301474  23809=tid f(bar,  80000) // enter; .NumCPU()=16 .GOMAXPROCS(0)=1 .NumGoroutine()=5
7.321713  23809=tid f(baz,  60000) // enter; .NumCPU()=16 .GOMAXPROCS(0)=1 .NumGoroutine()=5
8.386264  23809=tid f(baz,  60000) // i=20000
8.706789  23809=tid f(bar,  80000) // i=20000
9.032683  23809=tid f(foo, 100000) // i=20000
9.490639  23809=tid f(baz,  60000) // i=40000
10.091414  23809=tid f(bar,  80000) // i=40000
10.554625  23809=tid f(baz,  60000) // i=60000
10.554671  23809=tid f(baz,  60000) // leave in 3.232958; .NumCPU()=16 .GOMAXPROCS(0)=1 .NumGoroutine()=5
10.708127  23809=tid f(foo, 100000) // i=40000
11.209971  23809=tid f(bar,  80000) // i=60000
11.908678  23809=tid f(foo, 100000) // i=60000
12.170510  23809=tid f(bar,  80000) // i=80000
12.170557  23809=tid f(bar,  80000) // leave in 4.869083; .NumCPU()=16 .GOMAXPROCS(0)=1 .NumGoroutine()=4
12.627328  23809=tid f(foo, 100000) // i=80000
13.220404  23809=tid f(foo, 100000) // i=100000
13.220445  23809=tid f(foo, 100000) // leave in 5.933961; .NumCPU()=16 .GOMAXPROCS(0)=1 .NumGoroutine()=3
14.998897  23808=tid main() // input=word!
14.998929  23808=tid main() // leave in 14.998911; .NumCPU()=16 .GOMAXPROCS(0)=1 .NumGoroutine()=2
590 pprof.out

$ go tool pprof -top pprof.out
File: blocking
Type: cpu
Duration: 15s, Total samples = 10.22s (68.14%)
Showing nodes accounting for 10.19s, 99.71% of 10.22s total
Dropped 5 nodes (cum <= 0.05s)
      flat  flat%   sum%        cum   cum%
    10.19s 99.71% 99.71%     10.20s 99.80%  main.f
         0     0% 99.71%      4.27s 41.78%  main.main
         0     0% 99.71%      4.27s 41.78%  runtime.main
  • Observation: The flat 10.19s reflects the total accumulated time for function f().

  • Observation: The 'flat' seconds reported by pprof excludes delays due to pre-emption, GC, etc.

  • Observation: The 1st & 2nd pprof 'flat' seconds are very similar -- 9.67s vs 10.19s -- and it's not possible to tell that the 2nd pprof run took 3 seconds longer due to go-routine / LWP contention.

Does the Golang tool trace execution tracer help? Maybe...

  • Re-run experiment 2 with the execution trace which resulting in trace.out:
$ go build blocking.go && perl -e 'sleep 15; printf qq[word!\n];' | USE_TRACE=1 GOMAXPROCS_WANTED=1 ./blocking ; wc --bytes trace.out
0.000020  24170=tid main() // enter; .NumCPU()=16 .GOMAXPROCS(0)=16 .NumGoroutine()=1
0.000469  24170=tid main() // enabling trace
0.000816  24170=tid main() // .GOMAXPROCS(0)=16
0.000963  24170=tid main() // .GOMAXPROCS(0)=1
0.000976  24170=tid main() // sleeping
3.001165  24173=tid main() // waking
3.001239  24173=tid f(direct, 120000) // enter; .NumCPU()=16 .GOMAXPROCS(0)=1 .NumGoroutine()=2
3.713804  24173=tid f(direct, 120000) // i=20000
4.427349  24173=tid f(direct, 120000) // i=40000
5.139586  24173=tid f(direct, 120000) // i=60000
5.852789  24173=tid f(direct, 120000) // i=80000
6.564084  24173=tid f(direct, 120000) // i=100000
7.277072  24173=tid f(direct, 120000) // i=120000
7.277126  24173=tid f(direct, 120000) // leave in 4.275888; .NumCPU()=16 .GOMAXPROCS(0)=1 .NumGoroutine()=2
7.277199  24173=tid main() // type in word + hit return
7.282649  24170=tid func(going without loop)
7.282725  24174=tid f(foo, 100000) // enter; .NumCPU()=16 .GOMAXPROCS(0)=1 .NumGoroutine()=6
7.282777  24174=tid f(bar,  80000) // enter; .NumCPU()=16 .GOMAXPROCS(0)=1 .NumGoroutine()=6
7.282788  24174=tid f(baz,  60000) // enter; .NumCPU()=16 .GOMAXPROCS(0)=1 .NumGoroutine()=6
8.411256  24174=tid f(baz,  60000) // i=20000
8.711731  24174=tid f(bar,  80000) // i=20000
8.987458  24174=tid f(foo, 100000) // i=20000
9.474273  24174=tid f(baz,  60000) // i=40000
10.176263  24174=tid f(bar,  80000) // i=40000
10.588592  24174=tid f(baz,  60000) // i=60000
10.588650  24174=tid f(baz,  60000) // leave in 3.305863; .NumCPU()=16 .GOMAXPROCS(0)=1 .NumGoroutine()=5
10.667806  24174=tid f(foo, 100000) // i=40000
11.258817  24174=tid f(bar,  80000) // i=60000
11.847173  24174=tid f(foo, 100000) // i=60000
12.198086  24174=tid f(bar,  80000) // i=80000
12.198140  24174=tid f(bar,  80000) // leave in 4.915364; .NumCPU()=16 .GOMAXPROCS(0)=1 .NumGoroutine()=4
12.621021  24174=tid f(foo, 100000) // i=80000
13.214052  24174=tid f(foo, 100000) // i=100000
13.214104  24174=tid f(foo, 100000) // leave in 5.931380; .NumCPU()=16 .GOMAXPROCS(0)=1 .NumGoroutine()=3
15.001921  24173=tid main() // input=word!
15.001955  24173=tid main() // leave in 15.001936; .NumCPU()=16 .GOMAXPROCS(0)=1 .NumGoroutine()=2
8988 trace.out

$ go tool trace trace.out
2021/05/28 15:46:00 Parsing trace...
2021/05/28 15:46:00 Splitting trace...
2021/05/28 15:46:00 Opening browser. Trace viewer is listening on http://127.0.0.1:60944
^C
  • Browser webpage opens. Click on Goroutine analysys, main.f N=3.
  • Text rendering of what the HTML page shows:
Goroutine  Total  Execution  Network  Sync   Blocking  Scheduler  GC         GC
                             wait     block  syscall   wait       sweeping   pause

[]         6017ms 3002ms      0ns      0ns    0ns      3014ms     0ns (0.0%) 0ns (0.0%)
[]         4937ms 1921ms      0ns      0ns    0ns      3016ms     0ns (0.0%) 0ns (0.0%)
[]         3265ms 1087ms      0ns      0ns    0ns      2178ms     0ns (0.0%) 0ns (0.0%)
  • Observation: The good news is that it shows all the timing info we're after :-)
  • Observation: The bad news is that trace.out is about 20 times bigger than pprof.out, and this huge size could be prohibitive for larger Golang programs running for longer?
  • Observation: The other bad news is that it's all graphical to access this info, e.g. [1].
  • Question: How to access the timing info without having to go HTML? Or without having to save the trace file which can get very / too large [2]?

[1] https://about.sourcegraph.com/go/an-introduction-to-go-tool-trace-rhys-hiltner/ [2] golang/go#21870

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment