cpuunclaimed.py draft
#!/usr/bin/python
# @lint-avoid-python-3-compatibility-imports
#
# cpuunclaimed    Sample CPU run queues and calculate unclaimed idle CPU.
#                 For Linux, uses BCC, eBPF.
#
# This samples the length of the run queues and determines when there are idle
# CPUs, yet queued threads waiting their turn. It reports the amount of idle
# (yet unclaimed by waiting threads) CPU as a system-wide percentage.
#
# This situation can happen for a number of reasons:
# - An application has been bound to some, but not all, CPUs, and has runnable
#   threads that cannot migrate to other CPUs due to this configuration.
# - CPU affinity: an optimization that leaves threads on CPUs where the CPU
#   caches are warm, even if this means short periods of waiting while other
#   CPUs are idle. The wait period is tunable (see sysctl, kernel.sched*).
# - Scheduler bugs.
#
# An unclaimed idle of < 1% is likely to be CPU affinity, and not usually a
# cause for concern. By leaving the CPU idle, overall throughput of the system
# may be improved. This tool is best for identifying larger issues, > 2%, due
# to the coarseness of its 99 Hertz samples.
#
# This is an experimental tool that currently works by use of sampling to
# keep overheads low, and also binding of this program to CPU 0. See the code
# comments. If this identifies unclaimed CPU, you should double check it by
# tracing scheduler events (although that approach has much higher overhead).
# Check for future versions of this tool which may have improvements.
#
# USAGE: cpuunclaimed [-h] [-T] [interval] [count]
#
# REQUIRES: Linux 4.9+ (BPF_PROG_TYPE_PERF_EVENT support). Under tools/old is
# a version of this tool that may work on Linux 4.6 - 4.8.
#
# Copyright 2016 Netflix, Inc.
# Licensed under the Apache License, Version 2.0 (the "License")
#
# 14-Dec-2016   Brendan Gregg   Created this.

from __future__ import print_function
from bcc import BPF, PerfType, PerfSWConfig
from time import sleep, strftime
from ctypes import c_int
import argparse
import multiprocessing
from os import getpid, system

# arguments
examples = """examples:
    ./cpuunclaimed            # sample and calculate unclaimed idle CPU
    ./cpuunclaimed 1 10       # print 1 second summaries, 10 times
    ./cpuunclaimed -T 1       # 1s summaries and timestamps
"""
parser = argparse.ArgumentParser(
    description="Sample CPU run queues and calculate unclaimed idle CPU",
    formatter_class=argparse.RawDescriptionHelpFormatter,
    epilog=examples)
parser.add_argument("-T", "--timestamp", action="store_true",
    help="include timestamp on output")
parser.add_argument("interval", nargs="?", default=99999999,
    help="output interval, in seconds")
parser.add_argument("count", nargs="?", default=99999999,
    help="number of outputs")
args = parser.parse_args()
countdown = int(args.count)
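# Sample at twice the target rate: each sample alternates between a record
# cycle and a report cycle (see the BPF program below), so doubling keeps the
# effective recording rate at about 99 Hertz.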
frequency = 99 * 2      # 2 phases: recording, reporting
dobind = 1
debug = 0

# define BPF program
bpf_text = """
#include <uapi/linux/ptrace.h>
#include <linux/sched.h>

enum global_types {
    G_RUNNING = 1,
    G_QUEUE,
    G_TS,
    G_MAXGLOBAL
};

enum stat_types {
    S_POSITIVE = 1,
    S_RUNNING,
    S_IDLE,
    S_MAXSTAT
};

BPF_TABLE("array", int, u64, globals, G_MAXGLOBAL);
BPF_TABLE("array", int, u64, stats, S_MAXSTAT);

// Declare enough of cfs_rq to find nr_running, since we can't #import the
// header. This will need maintenance. It is from kernel/sched/sched.h:
struct cfs_rq_partial {
    struct load_weight load;
    unsigned int nr_running, h_nr_running;
};
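// (load is declared only so that nr_running lands at the same offset as in
// the kernel's struct cfs_rq.)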

int do_perf_event()
{
    int cpu = bpf_get_smp_processor_id();
    u64 now = bpf_ktime_get_ns();
    u64 *val = NULL;

    /*
     * Big Theory Statement
     *
     * This program determines if there are idle CPUs while others have
     * tasks waiting on their run queue. It updates summary counters that
     * are periodically read (every second) by user space. This is measured
     * on timed samples (eg, 99 Hertz) so that, while coarse, this is a low
     * overhead approach, as the maximum rate of events is 99 x CPU_count,
     * per second. Tracing scheduler events (queueing or context switching
     * events) can have a rate of millions per second, and the overhead of
     * this can become undesirable for production use.
     *
     * Constraints:
     *
     * A) The state of all CPUs must be compared at the same point in time.
     * B) Without using loops.
     * C) Samples may not fire during idle (due to no_hz when idle).
     * D) Samples may execute in any order.
     * E) Samples may execute at the same time.
     *
     * The samples usually arrive in CPU sequence order, from 0 to max CPU, as
     * the samples are initialized in CPU order from a Python loop (in
     * attach_perf_event()), which introduces a small delay from when one CPU
     * timer is initialized to the next. However, higher priority interrupts
     * can delay when a sampling interrupt fires, so that under heavy load the
     * samples can begin to arrive in any order. A delayed sample causes the
     * next sample to arrive earlier (if possible), to return to the expected
     * time offset.
     *
     * (I've been tempted to stretch the delay between initializing sampling
     * timers to accommodate such delays, which would solve (D) and (E) and
     * simplify the program, but would break (A).)
     *
     * It is assumed that there is no drift between the CPU timers, such that
     * they always fire around the same expected time. This matches my
     * observation on one test system. If there is a system where this
     * assumption is not true, this sampling approach will not work once the
     * drift becomes significant (perhaps the tool would work for short
     * durations only).
     *
     * The following implementation is experimental, and currently does not
     * solve (C) completely. It works by alternating between two phases:
     * a recording phase that increments global counters with run queue
     * state, and then a reporting phase that calculates queued-while-idle
     * and emits counters for user space. Phases alternate based on a stored
     * timestamp. It works like this:
     *
     * 1. Identify the first sample to arrive in the reporting cycle by
     *    looking for a jump in the saved timestamp by over 1.8 x sample
     *    period.
     * 1.1. Implement a tie-breaker so that, if multiple samples arrive
     *      at the exact same time, only one continues (see XXX).
     * 1.2. Set saved timestamp to now. See (2.).
     * 1.3. Read previous stored counters for running CPUs and queued threads,
     *      and calculate the number of threads that could have run.
     * 1.4. Emit values for user space.
     * 1.5. Return.
     * 2. Identify remaining samples in the reporting cycle as having a time
     *    jump of less than 0.8 x sample period, and return.
     * 3. Remaining samples are all now in the recording cycle (time is
     *    between 0.8 and 1.8 sample periods from the last report cycle).
     * 3.1. Fetch the CPU run queue length.
     * 3.2. If length is > 0, increment the running global statistic by 1.
     * 3.3. If length is > 1, increment the queued global statistic by the
     *      number of waiting threads.
     *
     * You are welcome to try to do better. Suggested workloads to test:
     *
     * - idle system                      # for testing constraint (C)
     * - taskset -c 1 multi-threaded-CPU-bound-program
     * - cd linux; make -j#CPUs
     * - cd linux; make -j#CPUs_x_8       # overload, for testing (D) and (E)
     *
     * With some bpf_trace_printk()s, you can inspect trace_pipe and see how
     * the logic is behaving, in particular, for missing events (C),
     * out-of-order events (D), and events at the same time (E).
     *
     * Perhaps we'll figure out a better tie-breaker routine (see XXX), and
     * this program will be improved. Perhaps there's a different approach,
     * one that doesn't need two phases, and can record data on every sample.
     * Or perhaps this BPF program should simply dump timestamped run queue
     * lengths to user space, and we'll do it all there, paying the extra
     * overhead.
     */
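    /*
     * Worked example of the phase thresholds (numbers assume the 198 Hz
     * sampling rate set by the Python code below): the sample period is
     * ~5.05 ms, so TRIGGER1 ~= 4.04 ms and TRIGGER2 ~= 9.09 ms. A sample
     * arriving more than 9.09 ms after the stored timestamp starts a report
     * cycle; one arriving within 4.04 ms is a trailing report-cycle sample
     * and is skipped; anything in between is a record-cycle sample.
     */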
    // cycles are keyed by the elapsed time since a stored timestamp
    int idx = G_TS;
    u64 *ts = globals.lookup(&idx);
    if (ts == NULL)
        return 0;   // shouldn't happen

    u64 old = *ts;
    if (old == 0) {
        // first hit
        *ts = now;
        return 0;
    } else {
        if (now > (old + TRIGGER2)) {
            /*
             * Report cycle
             */

            /*
             * Tie-breaker: only one thread should carry on from here.
             * XXX: We don't have __sync_val_compare_and_swap() or other
             * atomics, so I've yet to figure out a reliable way to do this.
             * As a temporary workaround, only allow CPU 0, although this
             * breaks constraint (C). As a small help to prevent CPU 0 from
             * powering down, this Python program is bound to CPU 0. See the
             * other XXX note.
             */
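            /*
             * (Hypothetical sketch only: if BPF did provide
             * __sync_val_compare_and_swap(), the tie-breaker could be
             * roughly "if (__sync_val_compare_and_swap(ts, old, now) != old)
             * return 0;", letting exactly one CPU win the race. It does not,
             * hence the CPU 0 workaround below.)
             */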
            if (cpu != 0)
                return 0;

            /*
             * Setup timestamp for the next cycle. Several other CPUs may be
             * accessing *ts around the same time we change it here. What
             * happens is they either read the:
             *
             * - old time: it's up to the "Tie-breaker" logic above to only
             *   allow one CPU to carry on and do the report cycle.
             * - new time: the TRIGGER1 test will identify this state.
             *
             * Both paths lead to the other CPUs doing nothing on this cycle,
             * like we want. Everyone is then setup correctly for the record
             * cycle on the next sample.
             */
            *ts = now;

            // fetch, cache, then reset globals.
            idx = G_RUNNING; u64 *runp = globals.lookup(&idx);
            idx = G_QUEUE; u64 *queuep = globals.lookup(&idx);
            if (runp == NULL || queuep == NULL)
                return 0;   // shouldn't happen
            u64 run = *runp;
            u64 queue = *queuep;
            (*runp) = 0;
            (*queuep) = 0;

            /*
             * Infer idle CPU count as the CPU count minus running. We can't
             * sample this directly, as some CPUs may power down and not fire
             * samples.
             */
            int idle = NRCPU - run;

            if (DEBUG) {
                bpf_trace_printk("report: run %d queue %d idle %d\\n", run,
                    queue, idle);
            }

            /*
             * This should only happen when a sample has been massively
             * delayed from its skip cycle and falls into the next record
             * cycle, overcounting. I've seen this a couple of times under
             * extreme load. For now, toss out such samples. Note that it will
             * be visible in the debug trace_pipe as an "idle -1" line.
             */
            if (run > NRCPU)
                return 0;

            /*
             * Calculate the number of threads that could have run as the
             * minimum of idle and queue, and emit this.
             */
            int i;
            if (idle > 0 && queue > 0) {
                if (queue > idle)
                    i = idle;
                else
                    i = queue;
                idx = S_POSITIVE;
                val = stats.lookup(&idx);
                if (val) (*val) += i;
            }

            // emit running & idle counts for calculating CPU utilization.
            idx = S_RUNNING;
            val = stats.lookup(&idx);
            if (val) (*val) += run;
            idx = S_IDLE;
            val = stats.lookup(&idx);
            if (val) (*val) += idle;

            return 0;
        }
    }

    if (now < (old + TRIGGER1)) {
        // trailing samples in report cycle
        if (DEBUG)
            bpf_trace_printk("skip\\n");
        return 0;
    }

    /*
     * Record cycle
     */

    /*
     * Fetch the run queue length from task->se.cfs_rq->nr_running. This is an
     * unstable interface and may need maintenance. Perhaps a future version
     * of BPF will support task_rq(p) or something similar as a more reliable
     * interface.
     */
    unsigned int len = 0;
    struct task_struct *task = NULL;
    struct cfs_rq_partial *my_q = NULL;
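    /*
     * Two-step read: first fetch the cfs_rq pointer from the current task's
     * sched entity, then fetch nr_running from that cfs_rq. Both reads use
     * bpf_probe_read() since they dereference kernel memory.
     */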
    task = (struct task_struct *)bpf_get_current_task();
    bpf_probe_read(&my_q, sizeof(my_q), &task->se.cfs_rq);
    bpf_probe_read(&len, sizeof(len), &my_q->nr_running);

    if (DEBUG) {
        bpf_trace_printk("record: len %d, delta %d\\n", len,
            (now - (old + TRIGGER1)));
    }

    /*
     * Maintain two global counters:
     *
     * - G_RUNNING: number of CPUs not idle
     * - G_QUEUE: total number of queued threads not running
     *
     * Identify running CPUs as len > 0, and queued tasks as len > 1.
     * A len == 0, or a missing sample, means an idle CPU.
     */
    if (len > 0) {
        idx = G_RUNNING;
        val = globals.lookup(&idx);
        if (val) __sync_fetch_and_add((long *)val, 1);
    }
    if (len > 1) {
        idx = G_QUEUE;
        val = globals.lookup(&idx);
        if (val) __sync_fetch_and_add((long *)val, len - 1);
    }

    return 0;
}
"""

# code substitutions
bpf_text = bpf_text.replace('NRCPU', str(multiprocessing.cpu_count()))
bpf_text = bpf_text.replace('TRIGGER1', str(
    int(0.8 * (1000000000 / frequency))))
bpf_text = bpf_text.replace('TRIGGER2', str(
    int(1.8 * (1000000000 / frequency))))
if debug:
    print(bpf_text)
    bpf_text = bpf_text.replace('DEBUG', '1')
bpf_text = bpf_text.replace('DEBUG', '0')

# bind to CPU 0 (temporary; see earlier XXX note)
if dobind:
    ret = system("taskset -cp 0 %d >/dev/null" % getpid())
    if ret != 0:
        print("WARNING: unable to run \"taskset\" to bind to CPU 0. " +
            "This program may not work properly. " +
            "Install taskset (util-linux)? Continuing anyway...")

# initialize BPF & perf_events
b = BPF(text=bpf_text)
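# attach_perf_event() creates one sampling perf event per CPU, in CPU order
# (see the Big Theory Statement), each firing do_perf_event() at sample_freq.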
b.attach_perf_event(ev_type=PerfType.SOFTWARE,
    ev_config=PerfSWConfig.TASK_CLOCK, fn_name="do_perf_event",
    sample_period=0, sample_freq=frequency)

print("Sampling run queues... Hit Ctrl-C to end.")

# stat indexes
S_POSITIVE = c_int(1)
S_RUNNING = c_int(2)
S_IDLE = c_int(3)
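# (these values must match enum stat_types in the BPF program above)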

# output
exiting = 0 if args.interval else 1
while (1):
    try:
        sleep(int(args.interval))
    except KeyboardInterrupt:
        exiting = 1

    # fetch stats on queued when idle
    positive = b["stats"][S_POSITIVE].value
    running = b["stats"][S_RUNNING].value
    idle = b["stats"][S_IDLE].value
    total = running + idle
    if debug:
        print("DEBUG: positive %d running %d idle %d total %d" % (positive,
            running, idle, total))

    if args.timestamp:
        print("%-8s " % strftime("%H:%M:%S"), end="")
    unclaimed = util = 0
    if total:
        unclaimed = float(positive) / total
        util = float(running) / total

    print("%%CPU %6.2f%%, unclaimed idle %0.2f%%" % (100 * util,
        100 * unclaimed))

    b["stats"].clear()

    countdown -= 1
    if exiting or countdown == 0:
        exit()