Skip to content

Instantly share code, notes, and snippets.

@midom
Created July 2, 2020 17:11
Show Gist options
  • Star 4 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save midom/a9a965a4092e66ed7f6bbc4da41d3672 to your computer and use it in GitHub Desktop.
Save midom/a9a965a4092e66ed7f6bbc4da41d3672 to your computer and use it in GitHub Desktop.
stall detector!
#!/usr/bin/env bcc-py
#
# topwaits Show longest off-cpu waits per-stack
#
# Copyright 2019 Facebook, Inc.
# Copyright 2016 Netflix, Inc.
# Licensed under the Apache License, Version 2.0 (the "License")
#
# 13-Jan-2016 Brendan Gregg Wrote offcpu profiler
# 27-Nov-2019 Domas Mituzas Gutted most of profiling part and left stall detector
from __future__ import print_function
from bcc import BPF
from sys import stderr
from time import sleep, strftime
import argparse
import signal
import time
# Command-line interface.
#
# Note the deliberate naming swap: -p/--pid is stored as args.tgid and
# -t/--tid as args.pid, matching the task_struct field (tgid / pid) each one
# is compared against in the BPF program.
_U64_MAX = (1 << 64) - 1

parser = argparse.ArgumentParser(
    description="Find longest waiting stacks")
# -p and -t cannot be combined; argparse itself rejects the combination, so
# no manual check is needed after parsing.
thread_group = parser.add_mutually_exclusive_group()
thread_group.add_argument("-p", "--pid", metavar="PID", dest="tgid",
    help="trace this PID only", type=int)
thread_group.add_argument("-t", "--tid", metavar="TID", dest="pid",
    help="trace this TID only", type=int)
parser.add_argument("--stack-storage-size", default=65536,
    type=int,
    help="the number of unique stack traces that can be stored and "
         "displayed (default 65536)")
parser.add_argument("duration", nargs="?", default=99999999,
    type=int,
    help="duration of trace, in seconds")
parser.add_argument("-m", "--min-block-time", default=1,
    type=int,
    help="the amount of time in microseconds over which we " +
         "store traces (default 1)")
parser.add_argument("-M", "--max-block-time", default=_U64_MAX,
    type=int,
    help="the amount of time in microseconds under which we " +
         "store traces (default U64_MAX)")
parser.add_argument("--state", type=int,
    help="filter on this thread state bitmask (eg, 2 == TASK_UNINTERRUPTIBLE" +
         ") see include/linux/sched.h")
args = parser.parse_args()

# Reject values that would otherwise only fail later: a negative duration
# makes sleep() raise after the probe is already attached, and the block
# times are pasted into the C source as unsigned 64-bit (ULL) literals.
if args.duration < 0:
    parser.error("duration must not be negative")
if args.stack_storage_size <= 0:
    parser.error("--stack-storage-size must be positive")
if not 0 <= args.min_block_time <= args.max_block_time <= _U64_MAX:
    parser.error("block times must satisfy "
                 "0 <= min-block-time <= max-block-time <= U64_MAX")

# argparse already converted the value with type=int.
duration = args.duration
# BPF program, written in C and handed to bcc as text. The upper-case
# placeholders THREAD_FILTER, STATE_FILTER, STACK_STORAGE_SIZE,
# MINBLOCK_US_VALUE and MAXBLOCK_US_VALUE are filled in with str.replace()
# further down, before the text is compiled.
#
# oncpu() receives the previously running task as `prev`:
#   1. if `prev` passes both filters, the current bpf_ktime_get_ns() value is
#      stored in `start`, keyed by prev->pid;
#   2. for the thread that is now current, a stored start time (if any) is
#      removed and turned into a wait duration in microseconds;
#   3. durations within [MINBLOCK_US, MAXBLOCK_US] are recorded per
#      (tgid, user stack id, kernel stack id) key: `counts` accumulates the
#      durations, `maximums` keeps the longest one, and `timestamps` keeps the
#      bpf_ktime_get_ns() value taken when that longest wait was observed.
bpf_text = """
#include <uapi/linux/ptrace.h>
#include <linux/sched.h>
#define MINBLOCK_US MINBLOCK_US_VALUEULL
#define MAXBLOCK_US MAXBLOCK_US_VALUEULL
struct key_t {
u32 pid;
int user_stack_id;
int kernel_stack_id;
};
BPF_HASH(counts, struct key_t);
BPF_HASH(maximums, struct key_t);
BPF_HASH(timestamps, struct key_t);
BPF_HASH(start, u32);
BPF_STACK_TRACE(stack_traces, STACK_STORAGE_SIZE);
int oncpu(struct pt_regs *ctx, struct task_struct *prev) {
u32 pid = prev->pid;
u32 tgid = prev->tgid;
u64 ts, *tsp, *valp;
// record previous thread sleep time
ts = bpf_ktime_get_ns();
if ((THREAD_FILTER) && (STATE_FILTER)) {
start.update(&pid, &ts);
}
// get the current thread's start time
pid = bpf_get_current_pid_tgid();
tgid = bpf_get_current_pid_tgid() >> 32;
tsp = start.lookup(&pid);
if (tsp == 0) {
return 0; // missed start or filtered
}
// calculate current thread's delta time in nanos
u64 delta = ts - *tsp;
start.delete(&pid);
// nanos to micros
delta = delta / 1000;
if ((delta < MINBLOCK_US) || (delta > MAXBLOCK_US)) {
return 0;
}
// create map key
struct key_t key = {};
key.pid = tgid;
key.user_stack_id = stack_traces.get_stackid(ctx, BPF_F_USER_STACK);
key.kernel_stack_id = stack_traces.get_stackid(ctx, 0);
counts.increment(key, delta);
valp = maximums.lookup(&key);
if (valp == 0 || *valp < delta) {
maximums.update(&key, &delta);
timestamps.update(&key, &ts);
}
return 0;
}
"""
# Turn the command-line filters into C expressions. A filter that was not
# requested becomes the constant '1', i.e. always true.
if args.tgid is not None:
    thread_filter = 'tgid == %d' % args.tgid
elif args.pid is not None:
    thread_filter = 'pid == %d' % args.pid
else:
    thread_filter = '1'

if args.state is None:
    state_filter = '1'
elif args.state == 0:
    # state 0 cannot be tested as a bitmask, so compare for equality
    state_filter = 'prev->state == 0'
else:
    # these states are sometimes bitmask checked
    state_filter = 'prev->state & %d' % args.state

# Fill in the placeholders of the BPF source, in this fixed order.
substitutions = (
    ('THREAD_FILTER', thread_filter),
    ('STATE_FILTER', state_filter),
    ('STACK_STORAGE_SIZE', str(args.stack_storage_size)),
    ('MINBLOCK_US_VALUE', str(args.min_block_time)),
    ('MAXBLOCK_US_VALUE', str(args.max_block_time)),
)
for placeholder, c_text in substitutions:
    bpf_text = bpf_text.replace(placeholder, c_text)
# Compile the BPF program and run oncpu() on every finish_task_switch call.
b = BPF(text=bpf_text)
b.attach_kprobe(event="finish_task_switch", fn_name="oncpu")
matched = b.num_open_kprobes()
if matched == 0:
    print("error: 0 functions traced. Exiting.", file=stderr)
    # SystemExit instead of exit(): the latter is a helper injected by the
    # `site` module and is not guaranteed to exist in every interpreter setup.
    raise SystemExit(1)

# Collect data for the requested duration; Ctrl-C simply ends the wait early.
try:
    sleep(duration)
except KeyboardInterrupt:
    pass
# as cleanup can take many seconds, ignore any further Ctrl-C from here on
signal.signal(signal.SIGINT, signal.SIG_IGN)

# Handles to the maps filled by the BPF program.
counts = b.get_table("counts")
stack_traces = b.get_table("stack_traces")
maximums = b.get_table("maximums")
timestamps = b.get_table("timestamps")

# Approximate wall-clock time of boot: "now" minus the uptime in seconds (the
# first field of /proc/uptime). The report adds BPF timestamps to this base.
# NOTE(review): /proc/uptime includes time spent suspended while
# bpf_ktime_get_ns() is documented as not including it, so the resulting
# wall-clock values may be skewed on hosts that have been suspended - confirm
# whether that matters for the intended deployment.
with open("/proc/uptime") as uptime_file:
    uptime_seconds = float(uptime_file.read().split()[0])
basetime = time.time() - uptime_seconds
def memoize(f):
    """Return a callable cache for ``f``, keyed by its positional arguments.

    The returned object is a dict subclass: calling it with ``*args`` looks up
    the tuple ``args``; on a miss ``f(*args)`` is evaluated once, stored under
    that tuple and returned. Arguments must therefore be hashable, and keyword
    arguments are not supported.
    """

    class memodict(dict):
        """Dict that fills in missing entries by calling the wrapped function."""

        def __init__(self, func):
            # The cache starts out empty; only the wrapped function is kept.
            self.f = func

        def __missing__(self, key):
            # Invoked by dict lookup on a cache miss: compute, remember, return.
            result = self.f(*key)
            self[key] = result
            return result

        def __call__(self, *args):
            return self[args]

    cache = memodict(f)
    return cache
@memoize
def get_sym(addr, pid):
    """Return the symbol name for ``addr`` in process ``pid``.

    The name reported by ``b.sym`` is decoded to text and cut at the first
    "(" so that any parenthesised suffix is dropped. Results are cached per
    (addr, pid) pair by the ``memoize`` decorator.
    """
    raw_name = b.sym(addr, pid)
    text = raw_name.decode()
    name, _, _ = text.partition("(")
    return name
# Report: one line per (process, user stack, kernel stack) key with
#   accumulated wait (us), longest single wait (us), wall-clock time recorded
#   for the longest wait, kernel stack, user stack.
for k, v in counts.items():
    # A negative stack id means no stack was stored for this key; print an
    # empty stack in that case.
    if k.user_stack_id < 0:
        user_stack = ()
    else:
        user_stack = tuple(
            get_sym(addr, k.pid)
            for addr in stack_traces.walk(k.user_stack_id))
    if k.kernel_stack_id < 0:
        kernel_stack = ()
    else:
        # Decode kernel symbols as well, so both stacks are printed as text
        # rather than one as text and the other as raw bytes.
        kernel_stack = tuple(
            b.ksym(addr).decode()
            for addr in stack_traces.walk(k.kernel_stack_id))
    # Nanoseconds -> seconds with float division (plain "/" between integers
    # would drop the sub-second part under Python 2), then shift to wall clock.
    ts = timestamps[k].value / 1e9 + basetime
    print(v.value, maximums[k].value, ts, kernel_stack, user_stack)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment