Skip to content

Instantly share code, notes, and snippets.

@rmcgibbo
Created February 16, 2021 23:33
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save rmcgibbo/65b42ee8b8c84831ced81c101ad66633 to your computer and use it in GitHub Desktop.
Save rmcgibbo/65b42ee8b8c84831ced81c101ad66633 to your computer and use it in GitHub Desktop.
Detect OOM and ENOSPC errors with BPF
#! /usr/bin/env nix-shell
#! nix-shell -i python3 -p python3 -p linuxPackages.bcc
#
# Copyright 2021 Robert T. McGibbon
# Licensed under the Apache License, Version 2.0 (the "License")
# Based on https://github.com/iovisor/bcc/blob/master/tools/oomkill.py
# which is Copyright 2016 Netflix, Inc.
# Licensed under the Apache License, Version 2.0 (the "License")
#
import sys
import json
from bcc import BPF
import time
# Syscalls whose return value is inspected for ENOSPC. Every name is resolved
# to a kernel symbol at startup, and names with no matching symbol on the
# running kernel are skipped, so listing a syscall that does not exist on
# some architectures is harmless.
SYSCALLS = [
    # Flushing / preallocation
    "fsync",
    "fallocate",
    # Data writes. "pwrite64" is the kernel's name for pwrite(2); without it
    # positional writes that fail with ENOSPC would go unreported.
    "write",
    "pwrite64",
    "writev",
    "pwritev",
    "pwritev2",
    # Directory and node creation
    "mkdir",
    "mkdirat",
    "mknod",
    "mknodat",
    # File creation via open
    "open",
    "openat",
    "openat2",
    # Renames
    "rename",
    "renameat",
    "renameat2",
]
# BPF C program handed to bcc. It publishes `struct data_t` records on the
# "events" perf buffer from two probes:
#
#   * kprobe__oom_kill_process -- etype 0. Records the current task
#     (fpid/fcomm) and the task in oc->chosen (tpid/tcomm).
#   * trace_return -- etype 1. Submitted only when the probed function's
#     return value is -28, which the consumer reports as ENOSPC. Only
#     fpid/fcomm are filled in.
#
# Both pids are thread-group ids: fpid is the upper 32 bits of
# bpf_get_current_pid_tgid(), and tpid is read from p->tgid so that it
# identifies the killed process in the same way, not one of its threads.
bpf_text = """
#include <uapi/linux/ptrace.h>
#include <linux/oom.h>
struct data_t {
short etype;
u32 fpid;
u32 tpid;
char fcomm[TASK_COMM_LEN];
char tcomm[TASK_COMM_LEN];
};
BPF_PERF_OUTPUT(events);
void kprobe__oom_kill_process(struct pt_regs *ctx, struct oom_control *oc, const char *message)
{
struct task_struct *p = oc->chosen;
struct data_t data = {};
data.etype = 0;
data.fpid = (bpf_get_current_pid_tgid() >> 32);
data.tpid = p->tgid;
bpf_get_current_comm(&data.fcomm, sizeof(data.fcomm));
bpf_probe_read_kernel(&data.tcomm, sizeof(data.tcomm), p->comm);
events.perf_submit(ctx, &data, sizeof(data));
}
int trace_return(struct pt_regs *args) {
int ret = PT_REGS_RC(args);
int errno = ret >= 0 ? 0 : -ret;
if (errno == 28) {
struct data_t data = {};
data.etype = 1;
data.fpid = (bpf_get_current_pid_tgid() >> 32);
bpf_get_current_comm(&data.fcomm, sizeof(data.fcomm));
events.perf_submit(args, &data, sizeof(data));
}
return 0;
}
"""
# process event
def print_event(cpu, data, size):
    """Write one perf-buffer record to stdout as a single line of JSON.

    ``data`` is decoded with ``b["events"].event``; ``cpu`` and ``size`` are
    accepted but not used. A record with ``etype`` 0 is reported as an
    "OOM Kill" (triggering and killed pid/process), and ``etype`` 1 as
    "ENOSPC" (pid/process). Any other ``etype`` raises ``RuntimeError``.
    Stdout is flushed after the line is written.
    """
    evt = b["events"].event(data)
    kind = evt.etype

    # Reject unknown record types before producing any output.
    if kind not in (0, 1):
        raise RuntimeError()

    def text(raw):
        # Command names are raw bytes; undecodable bytes are replaced.
        return raw.decode('utf-8', 'replace')

    if kind == 0:
        record = {
            "time": time.time(),
            "event": "OOM Kill",
            "payload": {
                "triggered_pid": evt.fpid,
                "triggered_process": text(evt.fcomm),
                "killed_pid": evt.tpid,
                "killed_process": text(evt.tcomm),
            },
        }
    else:
        record = {
            "time": time.time(),
            "event": "ENOSPC",
            "payload": {
                "pid": evt.fpid,
                "process": text(evt.fcomm),
            },
        }

    print(json.dumps(record), flush=True)
# Compile and load the BPF program. `b` must stay a module-level name because
# print_event() looks up the "events" table through it.
b = BPF(text=bpf_text)

# Hook the return of every listed syscall. A syscall is attached only when
# ksymname() does not report -1 for its resolved function name, so names
# missing on this kernel are skipped instead of failing the attach.
for syscall in SYSCALLS:
    syscall_fnname = b.get_syscall_fnname(syscall)
    if BPF.ksymname(syscall_fnname) != -1:
        b.attach_kretprobe(event=syscall_fnname, fn_name="trace_return")

# Announce readiness so that consumers of the JSON stream know probes are live.
print(json.dumps({"time": time.time(), "event": "started"}))
sys.stdout.flush()

b["events"].open_perf_buffer(print_event)

# Poll until interrupted. sys.exit() is used rather than the site-provided
# exit() helper, which is meant for interactive sessions and is not defined
# when the interpreter runs without the site module.
try:
    while True:
        b.perf_buffer_poll()
except KeyboardInterrupt:
    sys.exit()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment