Created
September 18, 2018 08:47
-
-
Save rincebrain/978532d8da427c39642c11a680dc180e to your computer and use it in GitHub Desktop.
zfswrites.py hackjob
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
# @lint-avoid-python-3-compatibility-imports | |
# | |
# zfsslower Trace slow ZFS operations. | |
# For Linux, uses BCC, eBPF. | |
# | |
# USAGE: zfsslower [-h] [-j] [-p PID] [min_ms] | |
# | |
# This script traces common ZFS file operations: reads, writes, opens, and | |
# syncs. It measures the time spent in these operations, and prints details | |
# for each that exceeded a threshold. | |
# | |
# WARNING: This adds low-overhead instrumentation to these ZFS operations, | |
# including reads and writes from the file system cache. Such reads and writes | |
# can be very frequent (depending on the workload; eg, 1M/sec), at which | |
# point the overhead of this tool (even if it prints no "slower" events) can | |
# begin to become significant. | |
# | |
# This works by using kernel dynamic tracing of the ZPL interface, and will | |
# need updates to match any changes to this interface. | |
# | |
# By default, a minimum millisecond threshold of 10 is used. | |
# | |
# Copyright 2016 Netflix, Inc. | |
# Licensed under the Apache License, Version 2.0 (the "License") | |
# | |
# 14-Feb-2016 Brendan Gregg Created this. | |
# 16-Oct-2016 Dina Goldshtein -p to filter by process ID. | |
from __future__ import print_function | |
from bcc import BPF | |
import argparse | |
from time import strftime | |
import ctypes as ct | |
# arguments | |
examples = """examples: | |
./zfsslower # trace operations slower than 10 ms (default) | |
./zfsslower 1 # trace operations slower than 1 ms | |
./zfsslower -j 1 # ... 1 ms, parsable output (csv) | |
./zfsslower 0 # trace all operations (warning: verbose) | |
./zfsslower -p 185 # trace PID 185 only | |
""" | |
parser = argparse.ArgumentParser( | |
description="Trace common ZFS file operations slower than a threshold", | |
formatter_class=argparse.RawDescriptionHelpFormatter, | |
epilog=examples) | |
parser.add_argument("-j", "--csv", action="store_true", | |
help="just print fields: comma-separated values") | |
parser.add_argument("-p", "--pid", | |
help="trace this PID only") | |
parser.add_argument("min_ms", nargs="?", default='10', | |
help="minimum I/O duration to trace, in ms (default 10)") | |
parser.add_argument("--ebpf", action="store_true", | |
help=argparse.SUPPRESS) | |
args = parser.parse_args() | |
min_ms = int(args.min_ms) | |
pid = args.pid | |
csv = args.csv | |
debug = 0 | |
# define BPF program | |
bpf_text = """ | |
#include <uapi/linux/ptrace.h> | |
#include <linux/fs.h> | |
#include <linux/sched.h> | |
#include <linux/mount.h> | |
#include <linux/dcache.h> | |
// XXX: switch these to char's when supported | |
#define TRACE_READ 0 | |
#define TRACE_WRITE 1 | |
#define TRACE_OPEN 2 | |
#define TRACE_FSYNC 3 | |
struct val_t { | |
u64 ts; | |
u64 offset; | |
struct file *fp; | |
}; | |
struct data_t { | |
// XXX: switch some to u32's when supported | |
u64 ts_us; | |
u64 type; | |
u64 size; | |
u64 offset; | |
u64 delta_us; | |
u64 pid; | |
char task[TASK_COMM_LEN]; | |
char dataset[DNAME_INLINE_LEN]; | |
char file[DNAME_INLINE_LEN]; | |
}; | |
BPF_HASH(entryinfo, u64, struct val_t); | |
BPF_PERF_OUTPUT(events); | |
// | |
// Store timestamp and size on entry | |
// | |
// zpl_read(), zpl_write(): | |
int trace_rw_entry(struct pt_regs *ctx, struct file *filp, char __user *buf, | |
size_t len, loff_t *ppos) | |
{ | |
u64 id = bpf_get_current_pid_tgid(); | |
u32 pid = id >> 32; // PID is higher part | |
if (FILTER_PID) | |
return 0; | |
// store filep and timestamp by id | |
struct val_t val = {}; | |
val.ts = bpf_ktime_get_ns(); | |
val.fp = filp; | |
val.offset = *ppos; | |
if (val.fp) | |
entryinfo.update(&id, &val); | |
return 0; | |
} | |
// zpl_open(): | |
int trace_open_entry(struct pt_regs *ctx, struct inode *inode, | |
struct file *filp) | |
{ | |
u64 id = bpf_get_current_pid_tgid(); | |
u32 pid = id >> 32; // PID is higher part | |
if (FILTER_PID) | |
return 0; | |
// store filep and timestamp by id | |
struct val_t val = {}; | |
val.ts = bpf_ktime_get_ns(); | |
val.fp = filp; | |
val.offset = 0; | |
if (val.fp) | |
entryinfo.update(&id, &val); | |
return 0; | |
} | |
// zpl_fsync(): | |
int trace_fsync_entry(struct pt_regs *ctx, struct file *filp) | |
{ | |
u64 id = bpf_get_current_pid_tgid(); | |
u32 pid = id >> 32; // PID is higher part | |
if (FILTER_PID) | |
return 0; | |
// store filp and timestamp by id | |
struct val_t val = {}; | |
val.ts = bpf_ktime_get_ns(); | |
val.fp = filp; | |
val.offset = 0; | |
if (val.fp) | |
entryinfo.update(&id, &val); | |
return 0; | |
} | |
// | |
// Output | |
// | |
static int trace_return(struct pt_regs *ctx, int type) | |
{ | |
struct val_t *valp; | |
u64 id = bpf_get_current_pid_tgid(); | |
u32 pid = id >> 32; // PID is higher part | |
valp = entryinfo.lookup(&id); | |
if (valp == 0) { | |
// missed tracing issue or filtered | |
return 0; | |
} | |
// calculate delta | |
u64 ts = bpf_ktime_get_ns(); | |
u64 delta_us = (ts - valp->ts) / 1000; | |
entryinfo.delete(&id); | |
if (FILTER_US) | |
return 0; | |
// populate output struct | |
u32 size = PT_REGS_RC(ctx); | |
struct data_t data = {.type = type, .size = size, .delta_us = delta_us, | |
.pid = pid}; | |
data.ts_us = ts / 1000; | |
data.offset = valp->offset; | |
bpf_get_current_comm(&data.task, sizeof(data.task)); | |
// workaround (rewriter should handle file to d_name in one step): | |
struct dentry *de = NULL; | |
struct path *mnt_path = NULL; | |
struct vfsmount *wrk = NULL; | |
struct qstr mnt = {}; | |
struct qstr qs = {}; | |
bpf_probe_read(&wrk, sizeof(wrk), &valp->fp->f_path.mnt); | |
// bpf_probe_read(&de_wrk, sizeof(de_wrk), &wrk->mnt_root); | |
// bpf_probe_read(&mnt_path, sizeof(mnt_path), &mntget(wrk)); | |
bpf_probe_read(&de, sizeof(de), &valp->fp->f_path.dentry); | |
bpf_probe_read(&qs, sizeof(qs), (void *)&de->d_name); | |
if (qs.len == 0) | |
return 0; | |
if (mnt.len != 0) | |
bpf_probe_read(&data.dataset, sizeof(data.dataset), (void *)mnt.name); | |
bpf_probe_read(&data.file, sizeof(data.file), (void *)qs.name); | |
// output | |
events.perf_submit(ctx, &data, sizeof(data)); | |
return 0; | |
} | |
int trace_read_return(struct pt_regs *ctx) | |
{ | |
return trace_return(ctx, TRACE_READ); | |
} | |
int trace_write_return(struct pt_regs *ctx) | |
{ | |
return trace_return(ctx, TRACE_WRITE); | |
} | |
int trace_open_return(struct pt_regs *ctx) | |
{ | |
return trace_return(ctx, TRACE_OPEN); | |
} | |
int trace_fsync_return(struct pt_regs *ctx) | |
{ | |
return trace_return(ctx, TRACE_FSYNC); | |
} | |
""" | |
if min_ms == 0: | |
bpf_text = bpf_text.replace('FILTER_US', '0') | |
else: | |
bpf_text = bpf_text.replace('FILTER_US', | |
'delta_us <= %s' % str(min_ms * 1000)) | |
if args.pid: | |
bpf_text = bpf_text.replace('FILTER_PID', 'pid != %s' % pid) | |
else: | |
bpf_text = bpf_text.replace('FILTER_PID', '0') | |
if debug or args.ebpf: | |
print(bpf_text) | |
if args.ebpf: | |
exit() | |
# kernel->user event data: struct data_t | |
DNAME_INLINE_LEN = 32 # linux/dcache.h | |
TASK_COMM_LEN = 16 # linux/sched.h | |
class Data(ct.Structure): | |
_fields_ = [ | |
("ts_us", ct.c_ulonglong), | |
("type", ct.c_ulonglong), | |
("size", ct.c_ulonglong), | |
("offset", ct.c_ulonglong), | |
("delta_us", ct.c_ulonglong), | |
("pid", ct.c_ulonglong), | |
("task", ct.c_char * TASK_COMM_LEN), | |
("dataset", ct.c_char * DNAME_INLINE_LEN), | |
("file", ct.c_char * DNAME_INLINE_LEN) | |
] | |
# process event | |
def print_event(cpu, data, size): | |
event = ct.cast(data, ct.POINTER(Data)).contents | |
type = 'R' | |
if event.type == 1: | |
type = 'W' | |
elif event.type == 2: | |
type = 'O' | |
elif event.type == 3: | |
type = 'S' | |
if (csv): | |
print("%d,%s,%d,%s,%d,%d,%d,%s" % ( | |
event.ts_us, event.task, event.pid, type, event.size, | |
event.offset, event.delta_us, event.file)) | |
return | |
print("%-8s %-14.14s %-6s %1s %-7s %-8d %7.2f %s %s" % (strftime("%H:%M:%S"), | |
event.task, event.pid, type, event.size, event.offset / 1024, | |
float(event.delta_us) / 1000, event.dataset, event.file)) | |
# initialize BPF | |
b = BPF(text=bpf_text) | |
# common file functions | |
if BPF.get_kprobe_functions('zpl_iter'): | |
b.attach_kprobe(event="zpl_iter_read", fn_name="trace_rw_entry") | |
b.attach_kprobe(event="zpl_iter_write", fn_name="trace_rw_entry") | |
elif BPF.get_kprobe_functions('zpl_aio'): | |
b.attach_kprobe(event="zpl_aio_read", fn_name="trace_rw_entry") | |
b.attach_kprobe(event="zpl_aio_write", fn_name="trace_rw_entry") | |
else: | |
b.attach_kprobe(event="zpl_read", fn_name="trace_rw_entry") | |
b.attach_kprobe(event="zpl_write", fn_name="trace_rw_entry") | |
b.attach_kprobe(event="zpl_open", fn_name="trace_open_entry") | |
b.attach_kprobe(event="zpl_fsync", fn_name="trace_fsync_entry") | |
if BPF.get_kprobe_functions('zpl_iter'): | |
b.attach_kretprobe(event="zpl_iter_read", fn_name="trace_read_return") | |
b.attach_kretprobe(event="zpl_iter_write", fn_name="trace_write_return") | |
elif BPF.get_kprobe_functions('zpl_aio'): | |
b.attach_kretprobe(event="zpl_aio_read", fn_name="trace_read_return") | |
b.attach_kretprobe(event="zpl_aio_write", fn_name="trace_write_return") | |
else: | |
b.attach_kretprobe(event="zpl_read", fn_name="trace_read_return") | |
b.attach_kretprobe(event="zpl_write", fn_name="trace_write_return") | |
b.attach_kretprobe(event="zpl_open", fn_name="trace_open_return") | |
b.attach_kretprobe(event="zpl_fsync", fn_name="trace_fsync_return") | |
# header | |
if (csv): | |
print("ENDTIME_us,TASK,PID,TYPE,BYTES,OFFSET_b,LATENCY_us,DSET,FILE") | |
else: | |
if min_ms == 0: | |
print("Tracing ZFS operations") | |
else: | |
print("Tracing ZFS operations slower than %d ms" % min_ms) | |
print("%-8s %-14s %-6s %1s %-7s %-8s %7s %s %s" % ("TIME", "COMM", "PID", "T", | |
"BYTES", "OFF_KB", "LAT(ms)", "DSET", "FILENAME")) | |
# read events | |
b["events"].open_perf_buffer(print_event, page_cnt=64) | |
while 1: | |
b.kprobe_poll() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment