#!/usr/bin/env bcc-py
#
# topwaits    Show longest off-cpu waits per-stack (stall detector).
#
# Copyright 2019 Facebook, Inc.
# Copyright 2016 Netflix, Inc.
# Licensed under the Apache License, Version 2.0 (the "License")
#
# 13-Jan-2016   Brendan Gregg   Wrote offcpu profiler
# 27-Nov-2019   Domas Mituzas   Gutted most of profiling part and left stall detector
from __future__ import print_function
from bcc import BPF
from sys import stderr
from time import sleep, strftime
import argparse
import signal
import time
# Command-line interface.  -p/-t are mutually exclusive; note the swapped
# dest names: -p (process) filters on tgid, -t (thread) filters on pid,
# matching kernel naming where pid == thread id and tgid == process id.
parser = argparse.ArgumentParser(
    description="Find longest waiting stacks")
thread_group = parser.add_mutually_exclusive_group()
thread_group.add_argument("-p", "--pid", metavar="PID", dest="tgid",
    help="trace this PID only", type=int)
thread_group.add_argument("-t", "--tid", metavar="TID", dest="pid",
    help="trace this TID only", type=int)
parser.add_argument("--stack-storage-size", default=65536,
    type=int,
    help="the number of unique stack traces that can be stored and "
         "displayed (default 65536)")
parser.add_argument("duration", nargs="?", default=99999999,
    type=int,
    help="duration of trace, in seconds")
parser.add_argument("-m", "--min-block-time", default=1,
    type=int,
    help="the amount of time in microseconds over which we " +
         "store traces (default 1)")
parser.add_argument("-M", "--max-block-time", default=(1 << 64) - 1,
    type=int,
    help="the amount of time in microseconds under which we " +
         "store traces (default U64_MAX)")
parser.add_argument("--state", type=int,
    help="filter on this thread state bitmask (eg, 2 == TASK_UNINTERRUPTIBLE" +
         ") see include/linux/sched.h")
args = parser.parse_args()
# Belt-and-braces: the mutually exclusive group already rejects -p with -t.
if args.pid and args.tgid:
    parser.error("specify only one of -p and -t")
duration = int(args.duration)
# define BPF program.  The ALL-CAPS placeholders (THREAD_FILTER,
# STATE_FILTER, STACK_STORAGE_SIZE, MINBLOCK_US_VALUE, MAXBLOCK_US_VALUE)
# are substituted with concrete values below before compilation.
bpf_text = """
#include <uapi/linux/ptrace.h>
#include <linux/sched.h>

#define MINBLOCK_US    MINBLOCK_US_VALUEULL
#define MAXBLOCK_US    MAXBLOCK_US_VALUEULL

struct key_t {
    u32 pid;
    int user_stack_id;
    int kernel_stack_id;
};
// counts: total off-cpu micros per stack; maximums: longest single wait;
// timestamps: when (ns since boot) the longest wait ended; start: per-tid
// sleep-begin timestamp.
BPF_HASH(counts, struct key_t);
BPF_HASH(maximums, struct key_t);
BPF_HASH(timestamps, struct key_t);
BPF_HASH(start, u32);
BPF_STACK_TRACE(stack_traces, STACK_STORAGE_SIZE);

// Runs at every context switch (kprobe on finish_task_switch).  Records
// when the outgoing thread goes to sleep, and computes how long the
// incoming thread was off-cpu.
int oncpu(struct pt_regs *ctx, struct task_struct *prev) {
    u32 pid = prev->pid;
    u32 tgid = prev->tgid;
    u64 ts, *tsp, *valp;

    // record previous thread sleep time
    ts = bpf_ktime_get_ns();
    if ((THREAD_FILTER) && (STATE_FILTER)) {
        start.update(&pid, &ts);
    }

    // get the current thread's start time
    pid = bpf_get_current_pid_tgid();
    tgid = bpf_get_current_pid_tgid() >> 32;
    tsp = start.lookup(&pid);
    if (tsp == 0) {
        return 0;        // missed start or filtered
    }

    // calculate current thread's delta time in nanos
    u64 delta = ts - *tsp;
    start.delete(&pid);

    // nanos to micros
    delta = delta / 1000;
    if ((delta < MINBLOCK_US) || (delta > MAXBLOCK_US)) {
        return 0;
    }

    // create map key
    struct key_t key = {};
    key.pid = tgid;
    key.user_stack_id = stack_traces.get_stackid(ctx, BPF_F_USER_STACK);
    key.kernel_stack_id = stack_traces.get_stackid(ctx, 0);

    counts.increment(key, delta);
    valp = maximums.lookup(&key);
    if (valp == 0 || *valp < delta) {
        maximums.update(&key, &delta);
        timestamps.update(&key, &ts);
    }
    return 0;
}
"""
# Build the THREAD_FILTER C expression: restrict to one process (-p),
# one thread (-t), or match everything ('1').
thread_filter = '1'
if args.tgid is not None:
    thread_filter = 'tgid == %d' % args.tgid
elif args.pid is not None:
    thread_filter = 'pid == %d' % args.pid

# Build the STATE_FILTER expression.  --state 0 means exactly TASK_RUNNING;
# any other value is tested as a bitmask, since these states are sometimes
# bitmask checked.
if args.state == 0:
    state_filter = 'prev->state == 0'
elif args.state:
    state_filter = 'prev->state & %d' % args.state
else:
    state_filter = '1'

# Substitute all placeholders into the BPF C source.
bpf_text = bpf_text.replace('THREAD_FILTER', thread_filter)
bpf_text = bpf_text.replace('STATE_FILTER', state_filter)
bpf_text = bpf_text.replace('STACK_STORAGE_SIZE', str(args.stack_storage_size))
bpf_text = bpf_text.replace('MINBLOCK_US_VALUE', str(args.min_block_time))
bpf_text = bpf_text.replace('MAXBLOCK_US_VALUE', str(args.max_block_time))
# initialize BPF: compile the program and instrument the scheduler's
# context-switch path so oncpu() fires on every switch.
b = BPF(text=bpf_text)
b.attach_kprobe(event="finish_task_switch", fn_name="oncpu")
matched = b.num_open_kprobes()
if matched == 0:
    print("error: 0 functions traced. Exiting.", file=stderr)
    exit(1)

# Trace for the requested duration; Ctrl-C ends the trace early.
try:
    sleep(duration)
except KeyboardInterrupt:
    # as cleanup can take many seconds, trap Ctrl-C:
    pass
signal.signal(signal.SIGINT, signal.SIG_IGN)

# Snapshot the BPF maps for user-space post-processing.
counts = b.get_table("counts")
stack_traces = b.get_table("stack_traces")
maximums = b.get_table("maximums")
timestamps = b.get_table("timestamps")

# Boot time in epoch seconds: BPF timestamps are nanoseconds since boot,
# so wall-clock time = ts_ns / 1e9 + basetime.
basetime = time.time() - float(open("/proc/uptime").read().split()[0])
def memoize(f):
    """Decorator: cache f's results keyed by its positional-argument tuple.

    Equivalent to an unbounded lru_cache, kept hand-rolled so the script
    stays dependency-light; used below so repeated addresses are only
    symbolized once.
    """
    class memodict(dict):
        def __init__(self, f):
            self.f = f

        def __call__(self, *args):
            # Cache hit returns directly; a miss falls into __missing__.
            return self[args]

        def __missing__(self, key):
            ret = self[key] = self.f(*key)
            return ret

    return memodict(f)
@memoize
def get_sym(addr, pid):
    """Resolve a user-space address to a symbol name for the given pid.

    Strips bcc's "symbol (module)" suffix, keeping just the symbol.
    Memoized because the same addresses recur across many stacks.
    """
    val = b.sym(addr, pid)
    return val.decode().split("(")[0]
# Emit one line per unique (pid, user stack, kernel stack) key:
# total off-cpu micros, longest single wait (micros), wall-clock time of
# that longest wait, kernel stack, user stack.  Negative stack ids signal
# collection failure and produce an empty stack.
for k, v in counts.items():
    user_stack = tuple([] if k.user_stack_id < 0 else \
        [get_sym(x, k.pid) for x in stack_traces.walk(k.user_stack_id)])
    kernel_stack = tuple([] if k.kernel_stack_id < 0 else \
        [b.ksym(x) for x in stack_traces.walk(k.kernel_stack_id)])
    # ns since boot -> epoch seconds; 1e9 forces float division so the
    # fractional second survives under Python 2 (bcc-py) as well.
    ts = timestamps[k].value / 1e9 + basetime
    print(v.value, maximums[k].value, ts, kernel_stack, user_stack)