Created
March 11, 2025 16:40
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
from __future__ import print_function | |
from bcc import BPF | |
import argparse | |
import time | |
import datetime | |
def _positive_int(text):
    """argparse ``type=`` callable: parse *text* as an integer that is >= 1.

    Raises argparse.ArgumentTypeError for zero or negative values so that
    argparse reports a normal usage error.
    """
    value = int(text)
    if value < 1:
        raise argparse.ArgumentTypeError(
            f'must be a positive integer, got {text!r}')
    return value


# Command-line options.  The parsed values are substituted into the BPF
# program text further down, so they are fixed for the lifetime of the run.
parser = argparse.ArgumentParser()
parser.add_argument('-t', '--thresh-ms', type=float, default=100.0,
                    help='runtime threshold in ms')
# nr_backtraces sizes the per-task backtrace array and is used as a modulus
# on both the BPF and the Python side, so it must be at least 1.
parser.add_argument('-b', '--nr_backtraces', type=_positive_int, default=64,
                    help='number of backtraces to capture')
parser.add_argument('-i', '--backtrace-interval-ms', type=float, default=10.0,
                    help='backtrace capture interval in ms')
parser.add_argument('-k', '--kthread-only', action='store_true',
                    help='only consider kernel threads')
parser.add_argument('-p', '--percpu-only', action='store_true',
                    help='only consider percpu threads')
args = parser.parse_args()
# Sizing constants.
# NR_BTS is not referenced elsewhere in the visible script: the per-task
# backtrace array is sized from the --nr_backtraces option instead.
NR_BTS = 64
# Number of entries in the BPF stack-trace table.
NR_STACKS = 32768
# Number of slots in the `results` array that the reporting loop polls.
NR_RESULTS = 128
# BPF program text (C, compiled by BCC).  The __UPPER_CASE__ tokens are
# placeholders that are replaced with concrete numbers before compilation.
#
# The program keeps one `struct running_task` per CPU describing the task
# currently tracked on that CPU, samples kernel backtraces for it from the
# scheduler tick, and on context switch copies the record into the `results`
# array when the task ran for longer than the configured threshold.
bpf_source = """
#include <linux/sched.h>

BPF_STACK_TRACE(stacks, __NR_STACKS__);

struct running_task {
    u64 running_at;             /* switch-in time in ns, 0 when not tracked */
    u64 bt_at;                  /* time of switch-in or of the last backtrace */
    u32 bt[__NR_BTS__];         /* stack ids, written round-robin */
    u32 bt_seq;                 /* backtraces taken since switch-in */
    u32 seq;                    /* result sequence number, set when reported */
    u32 pid;
    char comm[TASK_COMM_LEN];
    u64 ran_for;                /* measured runtime in ns */
};

/* Single-slot per-CPU array: the only valid key is 0. */
BPF_PERCPU_ARRAY(running_task, struct running_task, 1);
BPF_ARRAY(results, struct running_task, __NR_RESULTS__);
BPF_ARRAY(result_seq, unsigned long, 1);

RAW_TRACEPOINT_PROBE(sched_switch)
{
    // TP_PROTO(struct rq *rq, struct task_struct *prev, struct task_struct *next, struct rq_flags *rf)
    struct task_struct *prev = (void *)ctx->args[1];
    struct task_struct *next = (void *)ctx->args[2];
    u64 now = bpf_ktime_get_ns();
    struct running_task *t;
    u32 zero = 0;

    /*
     * The map has exactly one entry, so it must be looked up with key 0.
     * The lookup already returns the calling CPU's copy of that entry;
     * using the CPU number as the key fails on every CPU other than 0.
     */
    if (!(t = running_task.lookup(&zero)))
        return 0;

    if (t->running_at && prev->pid) {
        s64 dur = now - t->running_at;

        if (dur > __RUNTIME_THRESH_NS__) {
            u32 idx;
            unsigned long *rseq, seq;
            struct running_task *res;

            if (!(rseq = result_seq.lookup(&zero)))
                return 0;

            /* Sequence numbers start at 1 and pick the result slot. */
            seq = __sync_fetch_and_add(rseq, 1) + 1;
            idx = seq % __NR_RESULTS__;
            if (!(res = results.lookup(&idx)))
                return 0;

            __builtin_memcpy(res, t, sizeof(*res));
            res->pid = prev->pid;
            bpf_probe_read_kernel(res->comm, TASK_COMM_LEN, prev->comm);
            res->ran_for = dur;
            res->seq = seq;
            bpf_trace_printk("%s[%d] ran for %lluus", prev->comm, prev->pid, (now - t->running_at) / 1000);
        }
    }

    /* Stop tracking until we know the incoming task passes the filters. */
    t->running_at = 0;
    t->bt_at = 0;

    if (__KTHREAD_ONLY__ && !(next->flags & PF_KTHREAD))
        return 0;
    if (__PERCPU_ONLY__ && next->nr_cpus_allowed != 1)
        return 0;

    t->running_at = now;
    t->bt_at = now;
    t->bt_seq = 0;
    return 0;
}

void kprobe__sched_tick(struct pt_regs *ctx)
{
    u64 now = bpf_ktime_get_ns();
    struct running_task *t;
    u32 zero = 0, idx;

    /* Key 0 of the per-CPU array is this CPU's record (see sched_switch). */
    if (!(t = running_task.lookup(&zero)))
        return;

    /* Nothing tracked on this CPU, or the last backtrace is too recent. */
    if (!t->bt_at || now - t->bt_at < __BACKTRACE_INTERVAL_NS__)
        return;

    idx = t->bt_seq++ % __NR_BTS__;
    /* A failed get_stackid() is stored as-is (negative value in a u32). */
    t->bt[idx] = stacks.get_stackid(ctx, BPF_F_REUSE_STACKID);
    t->bt_at = now;
}
"""
# Replace each placeholder in the BPF program text with its concrete value.
# Millisecond options are converted to integer nanoseconds; the boolean
# options become 0 or 1.
_substitutions = {
    '__NR_STACKS__': NR_STACKS,
    '__NR_RESULTS__': NR_RESULTS,
    '__RUNTIME_THRESH_NS__': int(args.thresh_ms * 1000000),
    '__NR_BTS__': args.nr_backtraces,
    '__BACKTRACE_INTERVAL_NS__': int(args.backtrace_interval_ms * 1000000),
    '__KTHREAD_ONLY__': int(args.kthread_only),
    '__PERCPU_ONLY__': int(args.percpu_only),
}
for _placeholder, _value in _substitutions.items():
    bpf_source = bpf_source.replace(_placeholder, str(_value))

# Compile and load the program, then grab handles to the tables read below.
bpf = BPF(text=bpf_source)
stacks = bpf["stacks"]
results = bpf['results']

# Sequence number of the next result to report; the BPF side numbers
# results starting from 1.
next_seq = 1
def print_stack(stkid):
    """Print the kernel stack stored under *stkid*, one symbol per line.

    *stkid* is a value read from a result's ``bt`` array.  The BPF side
    stores the return value of get_stackid() unchecked, so a failed capture
    ends up as an id that is not present in the ``stacks`` table; that case
    is reported with a placeholder line instead of raising.
    """
    try:
        frames = stacks.walk(stkid)
    except KeyError:
        # No such entry in the stack table (failed or unavailable capture).
        print(' [stack id {} unavailable]'.format(stkid))
        return
    for addr in frames:
        sym = bpf.ksym(addr).decode('utf-8')
        print(' {}'.format(sym))
# Reporting loop: every 100ms, drain all results published since the last
# pass.  A slot holds a result we have not printed yet when its seq is at
# least next_seq; a larger seq means older results were overwritten, and we
# resume from the one found in the slot.
try:
    while True:
        time.sleep(0.1)
        while True:
            r = results[next_seq % NR_RESULTS]
            if r.seq < next_seq:
                break
            # Task names are arbitrary bytes; never let one abort the monitor.
            comm = r.comm.decode('utf-8', errors='replace')
            print(f'{comm}[{r.pid}] ran_for={r.ran_for/1000000:.2f}ms bt_seq={r.bt_seq}')
            # bt is filled round-robin: show at most the last nr_backtraces
            # captures, in the order they were taken.
            bt_start = max(r.bt_seq - args.nr_backtraces, 0)
            for bti in range(bt_start, r.bt_seq):
                print()
                print_stack(r.bt[bti % args.nr_backtraces])
            print()
            next_seq = r.seq + 1
except KeyboardInterrupt:
    # Ctrl-C is the normal way to stop the monitor; exit without a traceback.
    pass
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment