Last active
May 26, 2023 09:55
-
-
Save brendangregg/c67039252268ec5e66ba to your computer and use it in GitHub Desktop.
chaintest
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
# | |
# chaintest Summarize off-CPU time by kernel stack + 2 waker stacks | |
# WORK IN PROGRESS. For Linux, uses BCC, eBPF. | |
# | |
# USAGE: chaintest [-h] [-u] [-p PID] [-i INTERVAL] [-T] [duration] | |
# | |
# PLEASE DO NOT RUN THIS IN PRODUCTION! This is a work in progress, intended to | |
# explore chain graphs on Linux, using eBPF capabilities from a particular | |
# kernel version (4.3ish). This tool will eventually get much better. | |
# | |
# The current implementation uses an unrolled loop for x86_64, and was written | |
# as a proof of concept. This implementation should be replaced in the future | |
# with an appropriate bpf_ call, when available. | |
# | |
# Off-cpu stack currently limited to a stack trace depth of 18 (maxtdepth), | |
# and waker stacks limited to 7. This is working around BPF_STACK_MAX (512). | |
# | |
# Copyright 2016 Brendan Gregg. | |
# Licensed under the Apache License, Version 2.0 (the "License") | |
# | |
# 25-Jan-2016 Brendan Gregg Created this. | |
from __future__ import print_function | |
from bcc import BPF | |
from time import sleep | |
import argparse | |
import signal | |
# arguments | |
examples = """examples: | |
./chaintest # trace off-CPU + waker stack time until Ctrl-C | |
./chaintest 5 # trace for 5 seconds only | |
./chaintest -f 5 # 5 seconds, and output in folded format | |
./chaintest -u # don't include kernel threads (user only) | |
./chaintest -p 185 # trace fo PID 185 only | |
""" | |
parser = argparse.ArgumentParser( | |
description="Summarize off-CPU time by kernel stack + 2 waker stacks", | |
formatter_class=argparse.RawDescriptionHelpFormatter, | |
epilog=examples) | |
parser.add_argument("-u", "--useronly", action="store_true", | |
help="user threads only (no kernel threads)") | |
parser.add_argument("-p", "--pid", | |
help="trace this PID only") | |
parser.add_argument("-v", "--verbose", action="store_true", | |
help="show raw addresses") | |
parser.add_argument("-f", "--folded", action="store_true", | |
help="output folded format") | |
parser.add_argument("duration", nargs="?", default=99999999, | |
help="duration of trace, in seconds") | |
args = parser.parse_args() | |
folded = args.folded | |
duration = int(args.duration) | |
debug = 0 | |
maxwdepth = 7 # and MAXWDEPTH | |
maxtdepth = 18 # and MAXTDEPTH | |
if args.pid and args.useronly: | |
print("ERROR: use either -p or -u.") | |
exit() | |
# signal handler | |
def signal_ignore(signal, frame): | |
print() | |
# define BPF program | |
bpf_text = """ | |
#include <uapi/linux/ptrace.h> | |
#include <linux/sched.h> | |
// carefully work around MAX_BPF_STACK limit of 512: | |
#define MAXWDEPTH 7 | |
#define MAXTDEPTH 18 | |
#define MINBLOCK_US 1 | |
struct key_t { | |
char waker0[TASK_COMM_LEN]; | |
char waker1[TASK_COMM_LEN]; | |
char target[TASK_COMM_LEN]; | |
u64 wret0[MAXWDEPTH]; | |
u64 wret1[MAXWDEPTH]; | |
u64 tret[MAXTDEPTH]; | |
}; | |
BPF_HASH(counts, struct key_t); | |
BPF_HASH(start, u32); | |
struct wokeby_t { | |
char name0[TASK_COMM_LEN]; | |
char name1[TASK_COMM_LEN]; | |
u64 ret0[MAXWDEPTH]; | |
u64 ret1[MAXWDEPTH]; | |
}; | |
BPF_HASH(wokeby, u32, struct wokeby_t); | |
static u64 get_frame(u64 *bp) { | |
if (*bp) { | |
// The following stack walker is x86_64 specific | |
u64 ret = 0; | |
if (bpf_probe_read(&ret, sizeof(ret), (void *)(*bp+8))) | |
return 0; | |
if (bpf_probe_read(bp, sizeof(*bp), (void *)*bp)) | |
*bp = 0; | |
if (ret < __START_KERNEL_map) | |
return 0; | |
return ret; | |
} | |
return 0; | |
} | |
int waker(struct pt_regs *ctx, struct task_struct *p) { | |
u32 mypid, pid = p->pid; | |
if (!(FILTER)) | |
return 0; | |
u64 bp = 0; | |
struct wokeby_t woke = {}, *mywokep; | |
bpf_get_current_comm(&woke.name0, sizeof(woke.name0)); | |
bp = ctx->bp; | |
// unrolled loop (MAXWDEPTH): | |
int depth = 0; // clang nukes depth | |
if (!(woke.ret0[depth++] = get_frame(&bp))) goto out; | |
if (!(woke.ret0[depth++] = get_frame(&bp))) goto out; | |
if (!(woke.ret0[depth++] = get_frame(&bp))) goto out; | |
if (!(woke.ret0[depth++] = get_frame(&bp))) goto out; | |
if (!(woke.ret0[depth++] = get_frame(&bp))) goto out; | |
if (!(woke.ret0[depth++] = get_frame(&bp))) goto out; | |
woke.ret0[depth++] = get_frame(&bp); | |
out: | |
mypid = bpf_get_current_pid_tgid(); | |
mywokep = wokeby.lookup(&mypid); | |
if (mywokep) { | |
// copy waker stack and process name: | |
__builtin_memcpy(&woke.ret1, mywokep->ret0, sizeof(woke.ret1)); | |
__builtin_memcpy(&woke.name1, mywokep->name0, TASK_COMM_LEN); | |
// XXX: can't delete yet | |
// wokeby.delete(&pid); | |
} | |
wokeby.update(&pid, &woke); | |
return 0; | |
} | |
int oncpu(struct pt_regs *ctx, struct task_struct *p) { | |
u32 pid; | |
u64 ts, *tsp; | |
// record previous thread sleep time | |
if (FILTER) { | |
pid = p->pid; | |
ts = bpf_ktime_get_ns(); | |
start.update(&pid, &ts); | |
} | |
// calculate current thread's delta time | |
pid = bpf_get_current_pid_tgid(); | |
tsp = start.lookup(&pid); | |
if (tsp == 0) | |
return 0; // missed start or filtered | |
u64 delta = bpf_ktime_get_ns() - *tsp; | |
start.delete(&pid); | |
delta = delta / 1000; | |
if (delta < MINBLOCK_US) | |
return 0; | |
// create map key | |
u64 zero = 0, *val, bp = 0; | |
struct key_t key = {}; | |
struct wokeby_t *woke; | |
bpf_get_current_comm(&key.target, sizeof(key.target)); | |
bp = ctx->bp; | |
// unrolled loop (MAXTDEPTH): | |
int depth = 0; | |
if (!(key.tret[depth++] = get_frame(&bp))) goto out; | |
if (!(key.tret[depth++] = get_frame(&bp))) goto out; | |
if (!(key.tret[depth++] = get_frame(&bp))) goto out; | |
if (!(key.tret[depth++] = get_frame(&bp))) goto out; | |
if (!(key.tret[depth++] = get_frame(&bp))) goto out; | |
if (!(key.tret[depth++] = get_frame(&bp))) goto out; | |
if (!(key.tret[depth++] = get_frame(&bp))) goto out; | |
if (!(key.tret[depth++] = get_frame(&bp))) goto out; | |
if (!(key.tret[depth++] = get_frame(&bp))) goto out; | |
if (!(key.tret[depth++] = get_frame(&bp))) goto out; // X | |
if (!(key.tret[depth++] = get_frame(&bp))) goto out; | |
if (!(key.tret[depth++] = get_frame(&bp))) goto out; | |
if (!(key.tret[depth++] = get_frame(&bp))) goto out; | |
if (!(key.tret[depth++] = get_frame(&bp))) goto out; | |
if (!(key.tret[depth++] = get_frame(&bp))) goto out; | |
if (!(key.tret[depth++] = get_frame(&bp))) goto out; | |
if (!(key.tret[depth++] = get_frame(&bp))) goto out; | |
key.tret[depth++] = get_frame(&bp); | |
out: | |
woke = wokeby.lookup(&pid); | |
if (woke) { | |
// copy 1st waker stack and process name: | |
__builtin_memcpy(&key.wret0, woke->ret0, sizeof(key.wret0)); | |
__builtin_memcpy(&key.waker0, woke->name0, TASK_COMM_LEN); | |
// copy 2nd waker stack and process name: | |
__builtin_memcpy(&key.wret1, woke->ret1, sizeof(key.wret1)); | |
__builtin_memcpy(&key.waker1, woke->name1, TASK_COMM_LEN); | |
// XXX: can't delete yet wokeby.delete(&pid); | |
} | |
val = counts.lookup_or_init(&key, &zero); | |
(*val) += delta; | |
return 0; | |
} | |
""" | |
if args.pid: | |
filter = 'pid == %s' % args.pid | |
elif args.useronly: | |
filter = '!(p->flags & PF_KTHREAD)' | |
else: | |
filter = '1' | |
bpf_text = bpf_text.replace('FILTER', filter) | |
if debug: | |
print(bpf_text) | |
# initialize BPF | |
b = BPF(text=bpf_text) | |
b.attach_kprobe(event="finish_task_switch", fn_name="oncpu") | |
b.attach_kprobe(event="try_to_wake_up", fn_name="waker") | |
matched = b.num_open_kprobes() | |
if matched == 0: | |
print("0 functions traced. Exiting.") | |
exit() | |
# header | |
if not folded: | |
print("Tracing off-CPU time (us) by kernel stack", end="") | |
if duration < 99999999: | |
print(" for %d secs." % duration) | |
else: | |
print("... Hit Ctrl-C to end.") | |
# output | |
while (1): | |
try: | |
sleep(duration) | |
except KeyboardInterrupt: | |
# as cleanup can take many seconds, trap Ctrl-C: | |
signal.signal(signal.SIGINT, signal_ignore) | |
if not folded: | |
print() | |
counts = b.get_table("counts") | |
for k, v in sorted(counts.items(), key=lambda counts: counts[1].value): | |
if folded: | |
# fold target stack | |
line = k.target + ";" | |
for i in reversed(range(0, maxtdepth)): | |
if k.tret[i] == 0: | |
continue | |
line = line + b.ksym(k.tret[i]) | |
if i != 0: | |
line = line + ";" | |
# add delimiter | |
line = line + ";-" | |
# fold waker stack | |
for i in range(0, maxwdepth): | |
line = line + ";" | |
if k.wret0[i] == 0: | |
break; | |
line = line + b.ksym(k.wret0[i]) | |
if i != 0: | |
line = line + ";" + k.waker0 | |
# add delimiter | |
line = line + ";-" | |
# fold waker stack | |
for i in range(0, maxwdepth): | |
line = line + ";" | |
if k.wret1[i] == 0: | |
break; | |
line = line + b.ksym(k.wret1[i]) | |
if i != 0: | |
line = line + ";" + k.waker1 | |
# print as a line | |
print("%s %d" % (line, v.value)) | |
else: | |
# print wakeup name then stack in reverse order | |
print(" %-16s %s" % ("waker1:", k.waker1)) | |
for i in reversed(range(0, maxwdepth)): | |
if k.wret1[i] == 0: | |
continue; | |
print(" %-16x %s" % (k.wret1[i], | |
b.ksym(k.wret1[i]))) | |
# print delimiter | |
print(" %-16s %s" % ("-", "-")) | |
# print wakeup name then stack in reverse order | |
print(" %-16s %s" % ("waker0:", k.waker0)) | |
for i in reversed(range(0, maxwdepth)): | |
if k.wret0[i] == 0: | |
continue; | |
print(" %-16x %s" % (k.wret0[i], | |
b.ksym(k.wret0[i]))) | |
# print delimiter | |
print(" %-16s %s" % ("-", "-")) | |
# print default multi-line stack output | |
for i in range(0, maxtdepth): | |
if k.tret[i] == 0: | |
break | |
print(" %-16x %s" % (k.tret[i], | |
b.ksym(k.tret[i]))) | |
print(" %-16s %s" % ("target:", k.target)) | |
print(" %d\n" % v.value) | |
counts.clear() | |
if not folded: | |
print("Detaching...") | |
exit() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment