Skip to content

Instantly share code, notes, and snippets.

@ethercflow
Created May 11, 2020 07:36
Show Gist options
  • Save ethercflow/16b661c6219beb67e1550bec4ff207e8 to your computer and use it in GitHub Desktop.
Save ethercflow/16b661c6219beb67e1550bec4ff207e8 to your computer and use it in GitHub Desktop.
#!/usr/bin/python
# @lint-avoid-python-3-compatibility-imports
from __future__ import print_function
from bcc import ArgString, BPF
from time import sleep
bpf_text = """
#include <uapi/linux/ptrace.h>
#include <linux/sched.h>
struct data {
u64 ts;
u32 prev_pid;
u32 next_pid;
int stack_id;
char prev_comm[TASK_COMM_LEN];
char next_comm[TASK_COMM_LEN];
};
BPF_PERF_OUTPUT(events);
BPF_STACK_TRACE(stack_traces, 16384);
static inline bool h_strcmp(const char *comm)
{
char filter[] = "cop_normal";
int len = sizeof(filter) - 1;
int i = 0;
for (; i < len; ++i) {
if (comm[i] != filter[i]) {
bpf_trace_printk("not equal, %d\\n", i);
return false;
}
}
return true;
}
TRACEPOINT_PROBE(sched, sched_switch) {
struct data data = {};
// if (args->prev_state & (TASK_REPORT_MAX - 1))
// return 0;
if (!(args->prev_state & TASK_REPORT_MAX))
return 0;
if (!h_strcmp(args->prev_comm))
return 0;
data.ts = bpf_ktime_get_ns();
data.prev_pid = args->prev_pid;
data.next_pid = args->next_pid;
data.stack_id = stack_traces.get_stackid(args, 0);
bpf_probe_read(&data.prev_comm, TASK_COMM_LEN, args->prev_comm);
bpf_probe_read(&data.next_comm, TASK_COMM_LEN, args->next_comm);
events.perf_submit(args, &data, sizeof(data));
return 0;
}
"""
b = BPF(text=bpf_text)
stack_traces = b.get_table("stack_traces")
# process event
def print_event(cpu, data, size):
event = b["events"].event(data)
print("%-24s %-14.14s %-6s -> %-14.14s %-6s" % (
event.ts,
event.prev_comm.decode("utf-8", "replace"),
event.prev_pid,
event.next_comm.decode("utf-8", "replace"),
event.next_pid,
))
for addr in stack_traces.walk(event.stack_id):
sym = b.ksym(addr, show_offset=True)
print("\t%s" % sym)
print("")
b["events"].open_perf_buffer(print_event, page_cnt=64)
while (1):
try:
sleep(1)
b.perf_buffer_poll()
except KeyboardInterrupt:
exit()
@ethercflow
Copy link
Author

工具用途:
trace 所有 cop_normal* 线程因被动抢占而让出 cpu 的事件 (注意,这里没有记录因 cpu 饱和导致的被动抢占,若需要包含此情况,则需开启 42、43行,并注释掉 45、46行),并记录时间戳,发生被动抢占时的 kernel stack,和后继线程信息。输出如下:

# timestamp (ns)       # prev_comm    # prev_pid    # next_comm  # next_pid
4255988076720371         cop_normal1      15153  -> kworker/22:3   25240
	__schedule+0x5d6
	__schedule+0x5d6
	preempt_schedule_common+0xe
	_cond_resched+0x22
	__kmalloc_node_track_caller+0x187
	__kmalloc_reserve.isra.58+0x46
	__alloc_skb+0xdf
	alloc_skb_with_frags+0x71
	sock_alloc_send_pskb+0x306
	unix_dgram_sendmsg+0x206
	sock_sendmsg+0xd5
	____sys_sendmsg+0x393
	___sys_sendmsg+0x9b
	__sys_sendmsg+0x97
	__x64_sys_sendmsg+0x28
	do_syscall_64+0x89
	entry_SYSCALL_64_after_hwframe+0x44

时间戳解析 (注意小数点位置):

~ date -d "1970-01-01 UTC `echo "$(date +%s)-$(cat /proc/uptime|cut -f 1 -d' ')+4255988.077460943"|bc ` seconds"
Mon May 11 03:55:34 EDT 2020

部署方式:
nohup ./ivcsw.py &

注意事项:

  1. 开销:
    虽然 bpf 本身开销很小,但调度器事件属于高频事件,累加起来会有一定的性能影响,之前在内部 benchmark 平台测试过 runqlat,会产生大约 2% 的性能会退 (实际损耗和负载有关系),因此在客户那边运行时,需要先关注一段时间业务指标。

  2. 第 85 行 sleep 1 是为了降低此工具带来的 context switch 开销,可酌情调整;

  3. 此工具会产生一定 IO,因此为避免干扰 TiKV,最好放在另外一个盘上;

  4. 产生的文件大小与事件频率和运行时长有关,需要关注下,避免爆盘;

  5. 若出现事件丢失情况,需酌情增大 22 行或 82 行的 page size;

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment