cpuunclaimed.py draft
#!/usr/bin/python
# @lint-avoid-python-3-compatibility-imports
#
# cpuunclaimed    Sample CPU run queues and calculate unclaimed idle CPU.
#                 For Linux, uses BCC, eBPF.
#
# This samples the length of the run queues and determines when there are idle
# CPUs, yet queued threads waiting their turn. It reports the amount of idle
# (yet unclaimed by waiting threads) CPU as a system-wide percentage.
#
# This situation can happen for a number of reasons:
# - An application has been bound to some, but not all, CPUs, and has runnable
# threads that cannot migrate to other CPUs due to this configuration.
# - CPU affinity: an optimization that leaves threads on CPUs where the CPU
# caches are warm, even if this means short periods of waiting while other
# CPUs are idle. The wait period is tunable (see sysctl, kernel.sched*).
# - Scheduler bugs.
#
# An unclaimed idle of < 1% is likely to be CPU affinity, and not usually a
# cause for concern. By leaving the CPU idle, overall throughput of the system
# may be improved. This tool is best for identifying larger issues, > 2%, due
# to the coarseness of its 99 Hertz samples.
#
# This is an experimental tool that currently works by use of sampling to
# keep overheads low, and also binding of this program to CPU 0. See the code
# comments. If this identifies unclaimed CPU, you should double check it by
# tracing scheduler events (although that approach has much higher overhead).
# Check for future versions of this tool which may have improvements.
#
# USAGE: cpuunclaimed [-h] [-T] [interval] [count]
#
# REQUIRES: Linux 4.9+ (BPF_PROG_TYPE_PERF_EVENT support). Under tools/old is
# a version of this tool that may work on Linux 4.6 - 4.8.
#
# Copyright 2016 Netflix, Inc.
# Licensed under the Apache License, Version 2.0 (the "License")
#
# 14-Dec-2016 Brendan Gregg Created this.
from __future__ import print_function
from bcc import BPF, PerfType, PerfSWConfig
from time import sleep, strftime
from ctypes import c_int
import argparse
import multiprocessing
from os import getpid, system
# arguments
examples = """examples:
./cpuunclaimed # summarize run queue length as a histogram
./cpuunclaimed 1 10 # print 1 second summaries, 10 times
./cpuunclaimed -T 1 # 1s summaries and timestamps
"""
parser = argparse.ArgumentParser(
description="Summarize scheduler run queue length as a histogram",
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog=examples)
parser.add_argument("-T", "--timestamp", action="store_true",
help="include timestamp on output")
parser.add_argument("interval", nargs="?", default=99999999,
help="output interval, in seconds")
parser.add_argument("count", nargs="?", default=99999999,
help="number of outputs")
args = parser.parse_args()
countdown = int(args.count)
frequency = 99 * 2 # 2 phases: recording, reporting
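# Note (my reading of the BPF logic below): samples alternate between a
# recording phase and a reporting phase, so sampling at 99 * 2 Hz yields
# roughly 99 recording samples per CPU and 99 report cycles per second.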
dobind = 1
debug = 0
# define BPF program
bpf_text = """
#include <uapi/linux/ptrace.h>
#include <linux/sched.h>
enum global_types {
G_RUNNING = 1,
G_QUEUE,
G_TS,
G_MAXGLOBAL
};
enum stat_types {
S_POSITIVE = 1,
S_RUNNING,
S_IDLE,
S_MAXSTAT
};
BPF_TABLE("array", int, u64, globals, G_MAXGLOBAL);
BPF_TABLE("array", int, u64, stats, S_MAXSTAT);
// Declare enough of cfs_rq to find nr_running, since we can't #import the
// header. This will need maintenance. It is from kernel/sched/sched.h:
struct cfs_rq_partial {
struct load_weight load;
unsigned int nr_running, h_nr_running;
};
int do_perf_event()
{
int cpu = bpf_get_smp_processor_id();
u64 now = bpf_ktime_get_ns();
u64 *val = NULL;
/*
* Big Theory Statement
*
* This program determines if there are idle CPUs while others have
* tasks waiting on their run queue. It updates summary counters that
* are periodically read (every second) by user space. This is measured on
* timed samples (eg 99 Hertz) so that, while coarse, this is a low
* overhead approach, as the maximum rate of events is 99 x CPU_count,
* per second. Tracing scheduler events (queueing or context switching
* events) can have a rate of millions per second, and the overhead of this
* can become undesirable for production use.
*
* Constraints:
*
* A) The state of all CPUs must be compared at the same point in time.
* B) Without using loops.
* C) Samples may not fire during idle (due to no_hz when idle).
* D) Samples may execute in any order.
* E) Samples may execute at the same time.
*
* The samples usually arrive in CPU sequence order, from 0 to max CPU, as
* the samples are initialized in CPU order from a Python loop (in
* attach_perf_event()), which introduces a small delay from when one CPU
* timer is initialized to the next. However, higher priority interrupts
* can delay when a sampling interrupt fires, so that under heavy load the
* samples can begin to arrive in any order. A delayed sample causes the
* next sample to arrive earlier (if possible), to return to the expected
* time offset.
*
* (I've been tempted to stretch the delay between initializing sampling
* timers to accommodate such delays, which would solve (D) and (E) and
* simplify the program, but would break (A)).
*
* It is assumed that there is no drift between the CPU timers, such that
* they always fire around the same expected time. This matches my
* observation on one test system. If there is a system where this
* assumption is not true, this sampling approach will not work once the
* drift becomes significant (perhaps the tool would work for short
* durations only).
*
* The following implementation is experimental, and currently does not
* solve (C) completely. It works by alternating between two phases,
* a recording phase that increments globals counters with run queue
* state, and then a reporting phase that calculates queued when idle
* and emits counters for user space. States alternate based on a stored
* timestamp. It works like this:
*
* 1. Identify the first sample to arrive in the reporting cycle by
* looking for a jump in the saved timestamp by over 1.8 x sample
* period.
* 1.1. Implement a tie-breaker so that, if multiple samples arrive
* at the exact same time, only one continues (see XXX).
* 1.2. Set saved timestamp to now. See (2.).
* 1.3. Read previous stored counters for running CPUs and queued threads,
* and calculate the number of threads that could have run.
* 1.4. Emit values for user space.
* 1.5. Return.
* 2. Identify remaining threads in the reporting cycle as having a time
* jump of less than 0.8 x sample period, and return.
* 3. Remaining threads are all now in the recording cycle (time is between
* 0.8 and 1.8 sample periods from last report cycle).
* 3.1. Fetch the CPU run queue length.
* 3.2. If length is > 0, increment running global statistic by 1.
* 3.3. If length is > 1, increment queued global statistic by the number
* of waiting threads.
*
* You are welcome to try to do better. Suggested workloads to test:
*
* - idle system # for testing constraint (C)
* - taskset -c 1 multi-threaded-CPU-bound-program
* - cd linux; make -j#CPUs
* - cd linux; make -j#CPUs_x_8 # overload, for testing (D) and (E)
*
* With some bpf_trace_printk()s, you can inspect trace_pipe and see how
* the logic is behaving, in particular, for missing events (C),
* out-of-order events (D), and events at the same time (E).
*
* Perhaps we'll figure out a better tie-breaker routine (see XXX), and
* this program will be improved. Perhaps there's a different approach,
* one that doesn't need two phases, and can record data on every sample.
* Or perhaps this BPF program should simply dump timestamped run queue
* lengths to user-space, and we'll do it all there, paying the extra
* overhead.
*/
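/*
* Added summary of the trigger windows used below (derived from the steps
* above; T is the sample period, 1e9 / frequency nanoseconds):
*
*   delta = now - *ts
*   delta > TRIGGER2 (1.8 x T)     -> first sample of a report cycle
*   delta < TRIGGER1 (0.8 x T)     -> trailing sample in a report cycle: skip
*   otherwise (0.8 x T to 1.8 x T) -> record cycle: sample the run queue
*/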
// cycles are keyed by the elapsed time since a stored timestamp
int idx = G_TS;
u64 *ts = globals.lookup(&idx);
if (ts == NULL)
return 0; // shouldn't happen
u64 old = *ts;
if (old == 0) {
// first hit
*ts = now;
return 0;
} else {
if (now > (old + TRIGGER2)) {
/*
* Report cycle
*/
/*
* Tie-breaker: only one thread should carry on from here.
* XXX: We don't have __sync_val_compare_and_swap() or other
* atomics, so I've yet to figure out a reliable way to do this.
* As a temporary workaround, only allow CPU 0, although this
* breaks constraint (C). As a small help to prevent CPU 0 from
* powering down, this Python program is bound to CPU 0. See the
* other XXX note.
*/
if (cpu != 0)
return 0;
/*
* Set up the timestamp for the next cycle. Several other CPUs may be
* accessing *ts around the same time we change it here. What
* happens is they either read the:
*
* - old time: it's up to the "Tie-breaker" logic above to only
* allow one CPU to carry on and do the report cycle.
* - new time: the TRIGGER1 test will identify this state.
*
* Both paths lead to the other CPUs doing nothing on this cycle,
* like we want. Everyone is then set up correctly for the record
* cycle on the next sample.
*/
*ts = now;
// fetch, cache, then reset globals.
idx = G_RUNNING; u64 *runp = globals.lookup(&idx);
idx = G_QUEUE; u64 *queuep = globals.lookup(&idx);
if (runp == NULL || queuep == NULL)
return 0; // shouldn't happen
u64 run = *runp;
u64 queue = *queuep;
(*runp) = 0;
(*queuep) = 0;
/*
* Infer idle CPU count as the CPU count minus running. We can't
* sample this directly, as some CPUs may power down and not fire
* samples.
*/
int idle = NRCPU - run;
if (DEBUG) {
bpf_trace_printk("report: run %d queue %d idle %d\\n", run,
queue, idle);
}
/*
* This should only happen when a sample has been massively
* delayed from its skip cycle and falls into the next record
* cycle, overcounting. I've seen this a couple of times under
* extreme load. For now, toss out samples. Note that it will
* be visible in the debug trace_pipe as an "idle -1" line.
*/
if (run > NRCPU)
return 0;
/*
* Calculate the number of threads that could have run as the
* minimum of idle and queue, and emit this.
*/
int i;
if (idle > 0 && queue > 0) {
if (queue > idle)
i = idle;
else
i = queue;
idx = S_POSITIVE;
val = stats.lookup(&idx);
if (val) (*val) += i;
}
// emit running & idle counts for calculating CPU utilization.
idx = S_RUNNING;
val = stats.lookup(&idx);
if (val) (*val) += run;
idx = S_IDLE;
val = stats.lookup(&idx);
if (val) (*val) += idle;
return 0;
}
}
if (now < (old + TRIGGER1)) {
// trailing samples in report cycle
if (DEBUG)
bpf_trace_printk("skip\\n");
return 0;
}
/*
* Record cycle
*/
/*
* Fetch the run queue length from task->se.cfs_rq->nr_running. This is an
* unstable interface and may need maintenance. Perhaps a future version
* of BPF will support task_rq(p) or something similar as a more reliable
* interface.
*/
unsigned int len = 0;
struct task_struct *task = NULL;
struct cfs_rq_partial *my_q = NULL;
task = (struct task_struct *)bpf_get_current_task();
bpf_probe_read(&my_q, sizeof(my_q), &task->se.cfs_rq);
bpf_probe_read(&len, sizeof(len), &my_q->nr_running);
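// Added note: if either bpf_probe_read() above fails (e.g., my_q is NULL),
// len remains 0 and this CPU is simply treated as idle for this sample.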
if (DEBUG) {
bpf_trace_printk("record: len %d, delta %d\\n", len,
(now - (old + TRIGGER1)));
}
/*
* Maintain two global counters:
*
* - G_RUNNING: number of CPUs not idle
* - G_QUEUE: total number of queued threads not running
*
* Identify running CPUs as len > 0, and queued tasks as len > 1.
* A len == 0, or a missing sample, are idle CPUs.
*/
if (len > 0) {
idx = G_RUNNING;
val = globals.lookup(&idx);
if (val) __sync_fetch_and_add((long *)val, 1);
}
if (len > 1) {
idx = G_QUEUE;
val = globals.lookup(&idx);
if (val) __sync_fetch_and_add((long *)val, len - 1);
}
return 0;
}
"""
# code substitutions
bpf_text = bpf_text.replace('NRCPU', str(multiprocessing.cpu_count()))
bpf_text = bpf_text.replace('TRIGGER1', str(
int(0.8 * (1000000000 / frequency))))
bpf_text = bpf_text.replace('TRIGGER2', str(
int(1.8 * (1000000000 / frequency))))
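# Worked numbers for the defaults (illustration only): with frequency = 198,
# the sample period is 1e9 / 198 ~= 5,050,505 ns (~5.05 ms), so TRIGGER1
# becomes 4,040,404 ns (~4.0 ms) and TRIGGER2 becomes 9,090,909 ns (~9.1 ms).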
if debug:
    print(bpf_text)
    bpf_text = bpf_text.replace('DEBUG', '1')
bpf_text = bpf_text.replace('DEBUG', '0')
# bind to CPU 0 (temporary; see earlier XXX note)
if dobind:
    ret = system("taskset -cp 0 %d >/dev/null" % getpid())
    if ret != 0:
        print("WARNING: unable to run \"taskset\" to bind to CPU 0. " +
            "This program may not work properly. " +
            "Install taskset (util-linux)? Continuing anyway...")
# initialize BPF & perf_events
b = BPF(text=bpf_text)
b.attach_perf_event(ev_type=PerfType.SOFTWARE,
ev_config=PerfSWConfig.TASK_CLOCK, fn_name="do_perf_event",
sample_period=0, sample_freq=frequency)
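# The handler is attached to a software perf event on each online CPU, firing
# at up to sample_freq (198 Hz) per CPU; as noted in constraint (C) above,
# samples may not fire on idle CPUs.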
print("Sampling run queues... Hit Ctrl-C to end.")
# stat indexes
S_POSITIVE = c_int(1)
S_RUNNING = c_int(2)
S_IDLE = c_int(3)
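# These indexes mirror enum stat_types in the BPF program and must stay in sync
# with it (S_POSITIVE = 1, S_RUNNING = 2, S_IDLE = 3).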
# output
exiting = 0 if args.interval else 1
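# How the percentages below are formed: each report cycle adds run + idle ==
# NRCPU to the stats, so over an interval total ~= NRCPU x report_count, and
# unclaimed = positive / total. Hypothetical illustration (numbers are mine,
# not measured): 8 CPUs reporting at ~99 Hz for 1 second gives total ~= 792;
# if positive == 40, unclaimed idle ~= 5.05%.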
while (1):
    try:
        sleep(int(args.interval))
    except KeyboardInterrupt:
        exiting = 1
    # fetch stats on queued when idle
    positive = b["stats"][S_POSITIVE].value
    running = b["stats"][S_RUNNING].value
    idle = b["stats"][S_IDLE].value
    total = running + idle
    if debug:
        print("DEBUG: positive %d running %d idle %d total %d" % (positive,
            running, idle, total))
    if args.timestamp:
        print("%-8s " % strftime("%H:%M:%S"), end="")
    unclaimed = util = 0
    if total:
        unclaimed = float(positive) / total
        util = float(running) / total
    print("%%CPU %6.2f%%, unclaimed idle %0.2f%%" % (100 * util,
        100 * unclaimed))
    b["stats"].clear()
    countdown -= 1
    if exiting or countdown == 0:
        exit()