Skip to content

Instantly share code, notes, and snippets.

@iamahuman
Last active January 1, 2022 16:59
Show Gist options
  • Save iamahuman/ecd1ec772f8e5ec01ed8adabec6bd794 to your computer and use it in GitHub Desktop.
openat() tracer
#define _GNU_SOURCE
#define UNW_LOCAL_ONLY
#include <stddef.h>
#include <string.h>
#include <stdlib.h>
#include <stdio.h>
#include <limits.h>
#include <unistd.h>
#include <errno.h>
#include <signal.h>
#include <linux/audit.h>
#include <linux/filter.h>
#include <linux/seccomp.h>
#include <linux/futex.h>
#include <sys/time.h>
#include <sys/prctl.h>
#include <sys/uio.h>
#include <sys/syscall.h>
#include <sys/mman.h>
#include <ucontext.h>
#include <libunwind.h>
#include <pthread.h>
#include <assert.h>
/* Number of elements in a statically-sized array (do not use on pointers). */
#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof(*(arr)))
/* Recover a pointer to the enclosing struct from a pointer to one of its members. */
#define container_of(ptr, type, member) ((type *)((unsigned char *)(ptr) - offsetof(type, member)))
/* With UNW_LOCAL_ONLY, a signal handler's ucontext_t is handed directly to
 * unw_init_local2(); this only works if the two types share a layout. */
_Static_assert(__builtin_types_compatible_p(unw_context_t, ucontext_t),
"unw_context_t is not equivalent to ucontext_t");
/* Write "<prefix><message>\n" to stderr in one writev() call and abort.
 * If the "dbg" environment variable is set, trap into a debugger first. */
static void abort_with_error(const char *prefix, size_t prefix_len, const char *message)
{
	struct iovec out[3];
	out[0].iov_base = (void *)prefix;
	out[0].iov_len = prefix_len;
	out[1].iov_base = (void *)message;
	out[1].iov_len = strlen(message);
	out[2].iov_base = (void *)"\n";
	out[2].iov_len = 1;
	if (getenv("dbg"))
		__asm__ __volatile__("int $3" ::: "memory");
	writev(STDERR_FILENO, out, 3);
	abort();
}
/* Convenience wrapper deriving the prefix length with strlen(). */
#define FAIL(prefix, message) abort_with_error((prefix), strlen(prefix), (message))
/* Map a nibble (0..15) to its lowercase hexadecimal character. */
static char hexdigit(unsigned int value)
{
	if (value > 9)
		return (char)('a' + (value - 10));
	return (char)('0' + value);
}
/* Print one backtrace line to stderr as "\t<name>+0x<offset>\n", or just
 * "\t0x<offset>\n" when no symbol name is available.  Uses only a stack
 * buffer and writev(), so it is safe to call from the SIGSYS handler. */
static void print_backtrace_entry(const char *name, unsigned long offset)
{
char buffer[32], *ptr, *endptr;
unsigned long num;
struct iovec vecs[3] = {
{ .iov_base = (void *)"\t", .iov_len = 1 },
};
/* Build "[+]0x<hex>\n" right-to-left at the end of buffer. */
ptr = endptr = buffer + ARRAY_SIZE(buffer);
*--ptr = '\n';
num = offset;
do *--ptr = hexdigit(num & 0xf);
while ((num >>= 4) != 0);
ptr -= 2;
memcpy(ptr, "0x", 2);
if (name) {
*--ptr = '+'; /* separator only when a symbol name precedes the offset */
vecs[1].iov_base = (void *)name;
vecs[1].iov_len = strlen(name);
}
/* vecs[1] stays zero-length (from the initializer) when name == NULL. */
vecs[2].iov_base = (void *)ptr;
vecs[2].iov_len = endptr - ptr;
writev(STDERR_FILENO, vecs, ARRAY_SIZE(vecs));
}
/* Per-thread flag marking "already inside handle_sigsys" to stop recursion. */
static pthread_key_t nested_signal_key;
/* Byte offsets of each general-purpose register inside ucontext_t's
 * mcontext, as literal constants usable from the inline-asm stubs below.
 * Each one is pinned by a _Static_assert against the real glibc layout,
 * so a libc layout change fails at compile time instead of at run time. */
#define MCTX_REG_R8 40
_Static_assert(offsetof(ucontext_t, uc_mcontext.gregs[REG_R8 ]) == MCTX_REG_R8);
#define MCTX_REG_R9 48
_Static_assert(offsetof(ucontext_t, uc_mcontext.gregs[REG_R9 ]) == MCTX_REG_R9);
#define MCTX_REG_R10 56
_Static_assert(offsetof(ucontext_t, uc_mcontext.gregs[REG_R10]) == MCTX_REG_R10);
#define MCTX_REG_R11 64
_Static_assert(offsetof(ucontext_t, uc_mcontext.gregs[REG_R11]) == MCTX_REG_R11);
#define MCTX_REG_R12 72
_Static_assert(offsetof(ucontext_t, uc_mcontext.gregs[REG_R12]) == MCTX_REG_R12);
#define MCTX_REG_R13 80
_Static_assert(offsetof(ucontext_t, uc_mcontext.gregs[REG_R13]) == MCTX_REG_R13);
#define MCTX_REG_R14 88
_Static_assert(offsetof(ucontext_t, uc_mcontext.gregs[REG_R14]) == MCTX_REG_R14);
#define MCTX_REG_R15 96
_Static_assert(offsetof(ucontext_t, uc_mcontext.gregs[REG_R15]) == MCTX_REG_R15);
#define MCTX_REG_RDI 104
_Static_assert(offsetof(ucontext_t, uc_mcontext.gregs[REG_RDI]) == MCTX_REG_RDI);
#define MCTX_REG_RSI 112
_Static_assert(offsetof(ucontext_t, uc_mcontext.gregs[REG_RSI]) == MCTX_REG_RSI);
#define MCTX_REG_RBP 120
_Static_assert(offsetof(ucontext_t, uc_mcontext.gregs[REG_RBP]) == MCTX_REG_RBP);
#define MCTX_REG_RBX 128
_Static_assert(offsetof(ucontext_t, uc_mcontext.gregs[REG_RBX]) == MCTX_REG_RBX);
#define MCTX_REG_RDX 136
_Static_assert(offsetof(ucontext_t, uc_mcontext.gregs[REG_RDX]) == MCTX_REG_RDX);
#define MCTX_REG_RAX 144
_Static_assert(offsetof(ucontext_t, uc_mcontext.gregs[REG_RAX]) == MCTX_REG_RAX);
#define MCTX_REG_RCX 152
_Static_assert(offsetof(ucontext_t, uc_mcontext.gregs[REG_RCX]) == MCTX_REG_RCX);
#define MCTX_REG_RSP 160
_Static_assert(offsetof(ucontext_t, uc_mcontext.gregs[REG_RSP]) == MCTX_REG_RSP);
#define MCTX_REG_RIP 168
_Static_assert(offsetof(ucontext_t, uc_mcontext.gregs[REG_RIP]) == MCTX_REG_RIP);
#define STRINGIFY(x) #x
#if defined(__GCC_HAVE_DWARF2_CFI_ASM) || defined(__clang__)
#define ASM_CFI(str) str
#else
#define ASM_CFI(str)
#endif
#define FOREACH6_FWD(fn, oi, r0, o0, r1, o1, r2, o2, r3, o3, r4, o4, r5, o5) fn(r0, o0) fn(r1, o1) fn(r2, o2) fn(r3, o3) fn(r4, o4) fn(r5, o5)
#define FOREACH6_REV(fn, oi, r0, o0, r1, o1, r2, o2, r3, o3, r4, o4, r5, o5) fn(r5, o4) fn(r4, o3) fn(r3, o2) fn(r2, o1) fn(r1, o0) fn(r0, oi)
#define APPLY_CALLEE_SAVE(foreach, fn) foreach(fn, "8", "r15", "16", "r14", "24", "r13", "32", "r12", "40", "rbp", "48", "rbx", "56")
#define FOR_EACH_REG_PART1(f) \
f(MCTX_REG_R8 , "%r8" ) f(MCTX_REG_R9 , "%r9" ) f(MCTX_REG_R10, "%r10") \
f(MCTX_REG_R12, "%r12") f(MCTX_REG_R13, "%r13") f(MCTX_REG_R14, "%r14") \
f(MCTX_REG_R15, "%r15")
#define FOR_EACH_REG_PART2(f) f(MCTX_REG_RDI, "%rdi")
#define FOR_EACH_REG_PART3(f) \
f(MCTX_REG_RSI, "%rsi") f(MCTX_REG_RBP, "%rbp") \
f(MCTX_REG_RBX, "%rbx") f(MCTX_REG_RDX, "%rdx")
#define UCTX_LOAD_REG(x, y) "movq " STRINGIFY(x) "(%rdi), " y "\n\t"
#define UCTX_STORE_REG(x, y) "movq " y ", " STRINGIFY(x) "(%rcx)\n\t"
#define PUSH_REG(reg, offset) "pushq %" reg "\n\t" ASM_CFI(".cfi_def_cfa_offset " offset "\n\t.cfi_offset " reg ", -" offset "\n\t")
#define PUSH_REG_EH_ONLY(reg, offset) ASM_CFI(".cfi_offset " reg ", -" offset "\n\t")
#define POP_REG(reg, offset) "popq %" reg "\n\t" ASM_CFI(".cfi_restore " reg "\n\t.cfi_def_cfa_offset " offset "\n\t")
#if defined(__CET__) && ((__CET__) & 1)
#define BRANCH_TARGET_MARKER() "endbr64\n\t"
#else
#define BRANCH_TARGET_MARKER()
#endif
#define SYSCALL_STUB_PROLOGUE() \
ASM_CFI(".cfi_startproc\n\t") \
BRANCH_TARGET_MARKER() \
APPLY_CALLEE_SAVE(FOREACH6_FWD, PUSH_REG) \
PUSH_REG("rdi", "64") \
"movq %rsi, %rax" \
ASM_CFI("\n\t.cfi_register rsi, rax")
#define SYSCALL_STUB_BODY() \
FOR_EACH_REG_PART1(UCTX_LOAD_REG) \
FOR_EACH_REG_PART3(UCTX_LOAD_REG) \
FOR_EACH_REG_PART2(UCTX_LOAD_REG) \
"syscall"
#define SYSCALL_STUB_EPILOGUE() \
"popq %rcx\n\t" \
ASM_CFI(".cfi_register rdi, rcx\n\t") \
ASM_CFI(".cfi_adjust_cfa_offset -8\n\t") \
FOR_EACH_REG_PART1(UCTX_STORE_REG) \
FOR_EACH_REG_PART2(UCTX_STORE_REG) \
FOR_EACH_REG_PART3(UCTX_STORE_REG) \
UCTX_STORE_REG(MCTX_REG_RAX, "%rax") \
APPLY_CALLEE_SAVE(FOREACH6_REV, POP_REG) \
"ret" \
ASM_CFI("\n\t.cfi_endproc")
/* Re-execute the trapped syscall with registers taken from (and written
 * back to) *uc.  This variant tolerates clone()/clone3() switching to a
 * new child stack: after the syscall, %rsp is compared with the saved
 * value, and on a different stack (the child) control jumps straight to
 * the context's saved RIP instead of unwinding the stub's frame.  %xmm0
 * and %xmm1 serve as scratch so no stack memory is touched. */
__attribute__((visibility("hidden")))
extern unsigned long execute_syscall_clone(ucontext_t *uc, unsigned long orig_syscall);
/* Address just past the stub's syscall instruction (the IP seccomp
 * reports); the BPF filter exempts it so the stub itself never traps. */
__attribute__((visibility("hidden")))
extern void seccomp_exempt_address_clone(void);
/* End of the unwind-unsafe region mlock()ed by main_ctor. */
__attribute__((visibility("hidden")))
extern void syscall_return_clone(void);
__asm__(
".pushsection \".text\", \"ax\"\n\t"
".type execute_syscall_clone, @function\n"
"execute_syscall_clone:\n\t"
SYSCALL_STUB_PROLOGUE() "\n\t"
UCTX_LOAD_REG(MCTX_REG_RIP, "%xmm0") /* stash the context's saved RIP */
"movq %rsp, %xmm1\n\t" /* stash the parent's stack pointer */
SYSCALL_STUB_BODY() "\n\t"
ASM_CFI(".cfi_endproc\n") /* first FDE ends here: RSP may change if using clone() */
"seccomp_exempt_address_clone:\n\t"
"movq %xmm1, %rcx\n\t"
"cmpq %rcx, %rsp\n\t"
"je syscall_return_normal\n\t" /* same stack: normal (parent) return path */
"movq %xmm0, %rcx\n"
"syscall_return_clone:\n\t"
ASM_CFI(".cfi_startproc simple\n\t") /* disable default frame instructions */
ASM_CFI(".cfi_def_cfa rsp, 0\n\t") /* nothing pushed on stack */
ASM_CFI(".cfi_return_column rcx\n\t") /* pop-less return */
"jmp *%rcx\n" /* new stack (child): resume at the saved RIP */
ASM_CFI("\t.cfi_endproc\n")
"syscall_return_normal:\n\t"
ASM_CFI(".cfi_startproc simple\n\t") /* second FDE starts here: RSP has been verified */
ASM_CFI(".cfi_def_cfa rsp, 64\n\t")
ASM_CFI(".cfi_offset rip, -8\n\t")
APPLY_CALLEE_SAVE(FOREACH6_FWD, PUSH_REG_EH_ONLY)
PUSH_REG_EH_ONLY("rdi", "64")
SYSCALL_STUB_EPILOGUE() "\n\t"
".size execute_syscall_clone, .-execute_syscall_clone\n\t"
".popsection"
);
/* Plain re-execution stub for syscalls that never switch stacks. */
__attribute__((visibility("hidden")))
extern unsigned long execute_syscall_noclone(ucontext_t *uc, unsigned long orig_syscall);
/* Address just past this stub's syscall instruction; exempted by the
 * seccomp filter so the stub's own syscalls are never trapped. */
__attribute__((visibility("hidden")))
extern void seccomp_exempt_address_noclone(void);
__asm__(
".pushsection \".text\", \"ax\"\n\t"
".type execute_syscall_noclone, @function\n"
"execute_syscall_noclone:\n\t"
SYSCALL_STUB_PROLOGUE() "\n\t"
SYSCALL_STUB_BODY() "\n"
"seccomp_exempt_address_noclone:\n\t"
SYSCALL_STUB_EPILOGUE() "\n\t"
".size execute_syscall_noclone, .-execute_syscall_noclone\n\t"
".popsection"
);
/* Jump back into a saved signal frame without using the current stack.
 * Used by the worker thread after a proxied fork(): in the child, this
 * thread resumes the requester by re-entering its signal frame.
 * NOTE(review): relies on the value at ucontext-8 being the signal
 * return address (glibc's restore_rt) — verify against the glibc
 * signal-frame layout in use. */
__attribute__((visibility("hidden"), noreturn))
extern void restore_context_trampoline(void *ucontext);
__asm__(
".pushsection \".text\", \"ax\"\n\t"
".type restore_context_trampoline, @function\n"
"restore_context_trampoline:\n\t"
ASM_CFI(".cfi_startproc\n\t")
BRANCH_TARGET_MARKER()
"mov %rdi, %rsp\n\t" /* exploit red zone guarantee of x86-64 ABI */
ASM_CFI(".cfi_endproc\n\t") /* switching stacks: split FDE */
ASM_CFI(".cfi_startproc simple\n\t") /* disable default frame insns */
ASM_CFI(".cfi_def_cfa rsp, 0\n\t") /* retaddr not above rsp */
ASM_CFI(".cfi_offset rip, -8\n\t") /* retaddr = &restore_rt (glibc) */
"jmpq *-8(%rsp)\n\t" /* don't use "ret" (bypass shadow stack) */
ASM_CFI(".cfi_endproc\n\t")
".size restore_context_trampoline, .-restore_context_trampoline\n\t"
".popsection"
);
/* Intrusive queue node; next_set and done double as futex words. */
struct queue_item
{
struct queue_item *next; /* successor, valid once next_set != 0 */
unsigned int next_set; /* futex: producer sets to 1 after linking next */
unsigned int done; /* futex: consumer sets to 1 when the request finished */
};
/* Multi-producer queue drained by the dedicated no-seccomp thread. */
struct lockfree_queue
{
struct queue_item *tail; /* last enqueued node (atomically exchanged) */
struct queue_item head; /* sentinel; head.next is the first real item */
};
/* A trapped syscall forwarded to the worker thread for execution. */
struct syscall_request
{
struct queue_item queue_item;
unsigned long nr; /* syscall number */
ucontext_t *uc; /* register state of the trapped caller */
};
/* Statically initialize a queue whose tail points at its own sentinel. */
#define DEFINE_LOCKFREE_QUEUE(name) struct lockfree_queue name = { &name.head }
static DEFINE_LOCKFREE_QUEUE(noseccomp_syscall_queue);
static pthread_t queue_thread; /* runs noseccomp_syscall_thread */
/* Append item to the queue and block until the consumer marks it done.
 * Safe for concurrent producers: the tail swap is a single atomic
 * exchange, and the predecessor's next/next_set publication is exactly
 * what the consumer's next_queue_item_internal() waits on. */
static void enqueue_and_wait(struct lockfree_queue *queue, struct queue_item *item)
{
struct queue_item *prev;
prev = __atomic_exchange_n(&queue->tail, item, __ATOMIC_ACQ_REL);
prev->next = item;
__atomic_store_n(&prev->next_set, 1, __ATOMIC_RELEASE);
syscall(__NR_futex, &prev->next_set, FUTEX_WAKE, INT_MAX, 0, 0, 0);
/* Wait for the worker to execute the request and release this node. */
while (!__atomic_load_n(&item->done, __ATOMIC_ACQUIRE))
syscall(__NR_futex, &item->done, FUTEX_WAIT, 0, 0, 0, 0);
}
/* Package a trapped syscall as a queue request and block until the
 * no-seccomp worker thread has completed it. */
static void issue_syscall_request(struct lockfree_queue *queue, int nr, ucontext_t *uc)
{
	struct syscall_request req;
	req.queue_item.next = NULL;
	req.queue_item.next_set = 0;
	req.queue_item.done = 0;
	req.nr = nr;
	req.uc = uc;
	enqueue_and_wait(queue, &req.queue_item);
}
/* Block until curr->next has been published by a producer, then advance
 * *cursor to it.  Returns 0 on success, -ETIMEDOUT if a timeout was
 * supplied and expired before the item arrived.
 *
 * Fixed futex(2) usage: for FUTEX_WAIT the third argument is the value
 * *uaddr is expected to still hold (0 == "next not yet set") and the
 * FOURTH argument is the timeout.  The original code passed the timeout
 * pointer in the val slot (and 0 as the timeout), which only worked by
 * accident because every caller passes timeout == NULL. */
static int next_queue_item_internal(struct lockfree_queue *queue, struct queue_item **cursor, struct queue_item *curr, int flag, const struct timespec *timeout)
{
	while (!__atomic_load_n(&curr->next_set, __ATOMIC_ACQUIRE)) {
		if (syscall(__NR_futex, &curr->next_set, FUTEX_WAIT | flag, 0, timeout, 0, 0) < 0 &&
		    errno == ETIMEDOUT)
			return -ETIMEDOUT; /* honor the caller's timeout */
	}
	*cursor = curr->next;
	return 0;
}
/* Fetch the next pending item into *cursor.  A non-NULL cursor already
 * refers to a valid, unacknowledged item, so nothing needs to be done. */
static int next_queue_item(struct lockfree_queue *queue, struct queue_item **cursor, int flag, const struct timespec *timeout)
{
	int have_item = (*cursor != NULL);
	return have_item ? 0
			 : next_queue_item_internal(queue, cursor, &queue->head, flag, timeout);
}
/* Release the current item (waking its producer) and advance *cursor.
 * Resets the sentinel and swings tail back to it when the queue drains.
 * When the queue is nonempty, the successor must be fetched BEFORE
 * `done` is published: the producer's request lives on its stack and
 * is destroyed as soon as it observes done != 0. */
static int ack_queue_item(struct lockfree_queue *queue, struct queue_item **cursor, int flag, const struct timespec *timeout)
{
struct queue_item *curr = *cursor, *expect;
int result = 0;
if (curr == NULL) /* no item */
return 0;
/* Re-arm the sentinel so the next producer publishes through it. */
queue->head.next = NULL;
__atomic_store_n(&queue->head.next_set, 0, __ATOMIC_RELEASE);
expect = curr;
if (!__atomic_compare_exchange_n(&queue->tail, &expect, &queue->head, 0, __ATOMIC_ACQ_REL, __ATOMIC_ACQUIRE)) {
assert(expect != &queue->head);
/* Queue is nonempty; get next item before releasing the current one */
result = next_queue_item_internal(queue, cursor, curr, flag, timeout);
} else {
/* Queue is empty */
*cursor = NULL;
}
__atomic_store_n(&curr->done, 1, __ATOMIC_RELEASE);
syscall(__NR_futex, &curr->done, FUTEX_WAKE, INT_MAX, 0, 0, 0);
return result;
}
/* Byte-wise OR: dst[i] = src1[i] | src2[i] for len bytes; returns dst
 * (memcpy-style).  Used to merge sigset_t bitmasks. */
static void *membitor(void *dst, const void *src1, const void *src2, size_t len)
{
	unsigned char *out = dst;
	const unsigned char *x = src1;
	const unsigned char *y = src2;
	while (len--)
		*out++ = (unsigned char)(*x++ | *y++);
	return dst;
}
/* Byte-wise AND-NOT: dst[i] = src1[i] & ~src2[i] for len bytes; returns
 * dst (memcpy-style).  Used to clear sigset_t bits. */
static void *membitandn(void *dst, const void *src1, const void *src2, size_t len)
{
	unsigned char *out = dst;
	const unsigned char *x = src1;
	const unsigned char *y = src2;
	while (len--)
		*out++ = (unsigned char)(*x++ & (unsigned char)~*y++);
	return dst;
}
#define MESSAGE(x) write(STDERR_FILENO, (x), sizeof(x) - 1);
/* Return nonzero iff the ranges [a, a+an) and [b, b+bn) overlap.
 * Fixed: the second comparison previously read `bp + bn > an`,
 * comparing the end of b against the LENGTH of a instead of its
 * address, so overlap detection was wrong whenever b preceded a. */
static int mem_overlaps(const void *a, size_t an, const void *b, size_t bn)
{
	unsigned long ap = (unsigned long)a;
	unsigned long bp = (unsigned long)b;
	return ap + an > bp && bp + bn > ap;
}
/* Emulate (rt_)sigprocmask() for the trapped thread.
 *
 * Simply re-executing the syscall would not stick: whatever mask it
 * installs is overwritten when this SIGSYS handler returns and sigreturn
 * restores uc->uc_sigmask.  So the new mask is computed here and written
 * into uc->uc_sigmask for sigreturn to install, with SIGSYS forced
 * unblocked so the tracer keeps working.  The kernel call is still
 * issued so fault/argument errors (-EFAULT, -EINVAL, ...) are produced
 * exactly as the kernel would produce them. */
static void handle_sigprocmask(unsigned int orig_syscall, ucontext_t *uc)
{
unsigned long ret;
unsigned long how = uc->uc_mcontext.gregs[REG_RDI];
void *set = (void *)uc->uc_mcontext.gregs[REG_RSI];
void *oldset = (void *)uc->uc_mcontext.gregs[REG_RDX];
size_t sigsetsize = (size_t)uc->uc_mcontext.gregs[REG_R10];
sigset_t newset;
int err;
if (orig_syscall != __NR_rt_sigprocmask) {
sigsetsize = sizeof(unsigned long); /* old_sigset_t */
} else if (sigsetsize > sizeof(newset)) {
uc->uc_mcontext.gregs[REG_RAX] = -EINVAL;
return;
}
/* do a test run to catch -EFAULT etc. first try (NULL oldset) */
/* When set and oldset overlap, probe set's readability with a harmless
 * SIG_BLOCK + NULL-oldset call before oldset gets clobbered below. */
if (set && oldset && mem_overlaps(set, sigsetsize, oldset, sigsetsize)) {
uc->uc_mcontext.gregs[REG_RDI] = SIG_BLOCK;
uc->uc_mcontext.gregs[REG_RDX] = 0;
ret = execute_syscall_noclone(uc, orig_syscall);
if ((long)ret < 0)
return; /* kernel's error code is already stored in uc's RAX */
uc->uc_mcontext.gregs[REG_RDI] = how;
uc->uc_mcontext.gregs[REG_RDX] = (unsigned long)oldset;
}
err = 0;
if (set) {
/* Compute the mask sigreturn should install on handler exit. */
switch ((int)how) {
case SIG_BLOCK:
membitor(&newset, &uc->uc_sigmask, set, sigsetsize);
break;
case SIG_UNBLOCK:
membitandn(&newset, &uc->uc_sigmask, set, sigsetsize);
break;
case SIG_SETMASK:
memcpy(&newset, set, sigsetsize);
break;
default:
err = -EINVAL;
break;
}
/* Always unblock SIGSYS */
sigdelset(&newset, SIGSYS);
}
uc->uc_mcontext.gregs[REG_RAX] = err;
/* do a test run to catch -EFAULT etc. second try (full) */
ret = execute_syscall_noclone(uc, orig_syscall);
if ((long)ret < 0)
return;
/* Success: report the pre-call mask and stage the new one for sigreturn. */
if (oldset)
memcpy(oldset, &uc->uc_sigmask, sigsetsize);
if (set)
memcpy(&uc->uc_sigmask, &newset, sigsetsize);
}
/* Re-execute the trapped syscall, choosing the stack-switch-aware stub
 * for clone() with a new child stack and for clone3(); everything else
 * goes through the plain stub. */
static unsigned long execute_syscall(ucontext_t *uc, unsigned long orig_syscall)
{
	int needs_clone_stub;
	switch (orig_syscall) {
	case __NR_clone:
		/* RSI carries the child stack pointer for clone(2). */
		needs_clone_stub = uc->uc_mcontext.gregs[REG_RSI] != 0;
		break;
	case __NR_clone3:
		needs_clone_stub = 1;
		break;
	default:
		needs_clone_stub = 0;
		break;
	}
	return needs_clone_stub ? execute_syscall_clone(uc, orig_syscall)
				: execute_syscall_noclone(uc, orig_syscall);
}
/* SIGSYS handler invoked via seccomp SECCOMP_RET_TRAP for the filtered
 * syscalls: logs an openat() backtrace with libunwind, emulates
 * rt_sigprocmask(), proxies fork()/exec()/non-CLONE_VM clone() to the
 * no-seccomp worker thread, and finally (re-)executes the trapped
 * syscall with the caller's saved registers. */
void handle_sigsys(int sig, siginfo_t *siginfo, void *ucontext)
{
unw_context_t *ctx;
unw_cursor_t cursor;
unw_word_t offset;
ucontext_t *uc = ucontext;
int result;
char funcname[1024];
ctx = ucontext; /* valid: unw_context_t == ucontext_t (static-asserted) */
/* Re-entered (a filtered syscall made while tracing): just run it. */
if (pthread_getspecific(nested_signal_key))
goto syscall_passthru;
switch (siginfo->si_syscall)
{
case __NR_clone:
if (uc->uc_mcontext.gregs[REG_RDI] & CLONE_VFORK) {
uc->uc_mcontext.gregs[REG_RDI] &= ~CLONE_VM; /* vfork+vm not supported */
}
if (uc->uc_mcontext.gregs[REG_RDI] & CLONE_VM) {
goto syscall_passthru; /* thread creation: no proxying needed */
}
/* fallthrough: fork-style clone is proxied like fork()/exec() */
case __NR_execve:
case __NR_execveat:
case __NR_fork:
/* Run on the worker thread so the child does not carry the filter. */
issue_syscall_request(&noseccomp_syscall_queue, siginfo->si_syscall, uc);
return;
case __NR_rt_sigprocmask:
handle_sigprocmask(siginfo->si_syscall, uc);
return;
case __NR_openat:
default:
break;
}
/* Suppress nested traps while unwinding and printing below. */
if (pthread_setspecific(nested_signal_key, (void *)1UL))
abort();
result = unw_init_local2(&cursor, ctx, UNW_INIT_SIGNAL_FRAME);
if (result < 0) {
FAIL("*** handle_sigsys: unw_init_local2 failed: ", unw_strerror(result));
return; /* unreachable: FAIL aborts */
}
{
const char *fname = (const char *)uc->uc_mcontext.gregs[REG_RSI];
struct iovec vecs[] = {
#define VECSTR(x) { (void *)(x), sizeof(x) - 1 }
VECSTR("openat(..., \""),
{ (void *)fname, fname ? strlen(fname) : 0 },
VECSTR("\", ...) called, backtrace:\n"),
#undef VECSTR
};
writev(STDERR_FILENO, vecs, ARRAY_SIZE(vecs));
}
do {
result = unw_get_proc_name(&cursor, funcname, sizeof(funcname), &offset);
if (result >= 0) {
print_backtrace_entry(funcname, offset);
} else {
/* No symbol available: print the raw instruction pointer instead. */
unw_get_reg(&cursor, UNW_REG_IP, &offset);
print_backtrace_entry(NULL, offset);
}
} while ((result = unw_step(&cursor)) > 0);
if (result < 0) {
/* Fixed message: this failure comes from unw_step, not unw_init_local2. */
FAIL("*** handle_sigsys: unw_step failed: ", unw_strerror(result > 0 ? -result : result));
return; /* unreachable: FAIL aborts */
}
MESSAGE("end backtrace.\n\n");
pthread_setspecific(nested_signal_key, NULL);
syscall_passthru:
execute_syscall(uc, (unsigned int)siginfo->si_syscall);
}
enum {
LoadArch,
CheckArch,
LoadNr,
CheckX32,
CheckSigaction,
LoadSigcallArg0,
CheckSigsys,
CheckRtsigprocmask,
CheckFork,
CheckClone,
CheckOpenat,
CheckExecve,
CheckExecveat,
LoadIPLo_1,
CheckIPLo_1,
LoadIPHi_1,
CheckIPHi_1,
LoadIPLo_2,
CheckIPLo_2,
LoadIPHi_2,
CheckIPHi_2,
VerdictTrap,
VerdictAllow,
VerdictKillProcess,
VerdictSucceed,
FilterLength,
};
/* Build and install the seccomp BPF filter and the SIGSYS handler.
 *
 * The filter traps rt_sigprocmask/fork/clone/openat/execve/execveat
 * with SECCOMP_RET_TRAP — unless the reported instruction pointer
 * equals one of the two exempt stub syscall sites — turns
 * sigaction(SIGSYS, ...) into a silent success (SECCOMP_RET_ERRNO with
 * errno 0) so the traced program cannot replace our handler, and allows
 * everything else, including non-x86-64 and x32 (nr >= 1<<30) syscalls. */
static void init_filter(void)
{
static struct sock_filter filter[FilterLength] = {
[LoadArch ] = BPF_STMT(BPF_LD | BPF_W | BPF_ABS, (offsetof(struct seccomp_data, arch))),
[CheckArch ] = BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, AUDIT_ARCH_X86_64, 0, VerdictAllow - CheckArch - 1),
[LoadNr ] = BPF_STMT(BPF_LD | BPF_W | BPF_ABS, (offsetof(struct seccomp_data, nr))),
[CheckX32 ] = BPF_JUMP(BPF_JMP | BPF_JGT | BPF_K, (1 << 30) - 1, VerdictAllow - CheckX32 - 1, 0),
[CheckSigaction ] = BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_rt_sigaction, LoadSigcallArg0 - CheckSigaction - 1, CheckRtsigprocmask - CheckSigaction - 1),
[LoadSigcallArg0 ] = BPF_STMT(BPF_LD | BPF_W | BPF_ABS, (offsetof(struct seccomp_data, args[0]))),
/* sigaction(SIGSYS, ...) -> no-op */
[CheckSigsys ] = BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, SIGSYS, VerdictSucceed - CheckSigsys - 1, CheckRtsigprocmask - CheckSigsys - 1),
[CheckRtsigprocmask] = BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_rt_sigprocmask, LoadIPLo_1 - CheckRtsigprocmask - 1, 0),
[CheckFork ] = BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_fork, LoadIPLo_1 - CheckFork - 1, 0),
[CheckClone ] = BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_clone, LoadIPLo_1 - CheckClone - 1, 0),
[CheckOpenat ] = BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_openat, LoadIPLo_1 - CheckOpenat - 1, 0),
[CheckExecve ] = BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_execve, LoadIPLo_1 - CheckExecve - 1, 0),
[CheckExecveat ] = BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_execveat, LoadIPLo_1 - CheckExecveat - 1, VerdictAllow - CheckExecveat - 1),
/* PC == seccomp_exempt_address -> allow */
[LoadIPLo_1 ] = BPF_STMT(BPF_LD | BPF_W | BPF_ABS, (offsetof(struct seccomp_data, instruction_pointer))),
[CheckIPLo_1 ] = BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 0, 0, CheckIPLo_2 - CheckIPLo_1 - 1),
[LoadIPHi_1 ] = BPF_STMT(BPF_LD | BPF_W | BPF_ABS, (offsetof(struct seccomp_data, instruction_pointer) + 4)),
[CheckIPHi_1 ] = BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 0, VerdictAllow - CheckIPHi_1 - 1, LoadIPLo_2 - CheckIPHi_1 - 1),
[LoadIPLo_2 ] = BPF_STMT(BPF_LD | BPF_W | BPF_ABS, (offsetof(struct seccomp_data, instruction_pointer))),
[CheckIPLo_2 ] = BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 0, 0, VerdictTrap - CheckIPLo_2 - 1),
[LoadIPHi_2 ] = BPF_STMT(BPF_LD | BPF_W | BPF_ABS, (offsetof(struct seccomp_data, instruction_pointer) + 4)),
[CheckIPHi_2 ] = BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 0, VerdictAllow - CheckIPHi_2 - 1, VerdictTrap - CheckIPHi_2 - 1),
[VerdictTrap ] = BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_TRAP),
[VerdictAllow ] = BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
[VerdictSucceed ] = BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ERRNO),
};
static const struct sock_fprog prog = {
.len = ARRAY_SIZE(filter),
.filter = (struct sock_filter *)filter,
};
/* SA_NODEFER: keep SIGSYS deliverable inside the handler itself. */
static const struct sigaction sigact = { .sa_flags = SA_SIGINFO | SA_NODEFER, .sa_sigaction = handle_sigsys };
unsigned long ip_exempts[2] = {
(unsigned long)&seccomp_exempt_address_clone,
(unsigned long)&seccomp_exempt_address_noclone,
};
/* Patch the IP comparisons with the actual stub addresses at runtime,
 * 32 bits at a time (classic BPF compares only 32-bit words). */
memcpy(&filter[CheckIPLo_1].k, (unsigned char *)&ip_exempts[0] + 0, sizeof(unsigned int));
memcpy(&filter[CheckIPHi_1].k, (unsigned char *)&ip_exempts[0] + 4, sizeof(unsigned int));
memcpy(&filter[CheckIPLo_2].k, (unsigned char *)&ip_exempts[1] + 0, sizeof(unsigned int));
memcpy(&filter[CheckIPHi_2].k, (unsigned char *)&ip_exempts[1] + 4, sizeof(unsigned int));
pthread_key_create(&nested_signal_key, NULL);
if (sigaction(SIGSYS, &sigact, NULL)) {
perror("sigaction");
exit(EXIT_FAILURE);
}
/* First attempt may fail without no_new_privs; set it and retry once. */
if (syscall(__NR_seccomp, SECCOMP_SET_MODE_FILTER, 0, (struct sock_fprog *)&prog)) {
prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
if (syscall(__NR_seccomp, SECCOMP_SET_MODE_FILTER, 0, (struct sock_fprog *)&prog)) {
perror("seccomp");
exit(EXIT_FAILURE);
}
}
}
/* Worker loop executing proxied fork/exec/clone requests.  This thread is
 * created before the seccomp filter is installed (see main_ctor), so it is
 * unfiltered and children it creates do not inherit the filter.  After a
 * successful fork the child continues here with a different TID; being the
 * child's only thread, it jumps back into the requester's saved signal
 * context instead of continuing the loop. */
void *noseccomp_syscall_thread(void *arg)
{
int result;
struct queue_item *item = NULL;
pid_t orig_tid = gettid();
while (!(result = next_queue_item(&noseccomp_syscall_queue, &item, 0, NULL))) {
struct syscall_request *req = container_of(item, struct syscall_request, queue_item);
execute_syscall(req->uc, req->nr);
if (orig_tid != gettid()) { /* fork() */
restore_context_trampoline(req->uc);
}
ack_queue_item(&noseccomp_syscall_queue, &item, 0, NULL);
}
return 0;
}
/* Library constructor: set up the tracer when the library is LD_PRELOADed. */
__attribute__((constructor))
static void main_ctor(void)
{
/* Ensure access to unwind-unsafe region does not fault */
mlock((void *)seccomp_exempt_address_clone,
(unsigned char *)syscall_return_clone -
(unsigned char *)seccomp_exempt_address_clone);
/* fork()/execve() requests use this thread to bypass seccomp inheritance */
pthread_create(&queue_thread, NULL, &noseccomp_syscall_thread, NULL);
/* Install the filter last so the worker thread itself is never filtered. */
init_filter();
}
# Build the LD_PRELOAD-able openat() tracer shared library.
# Fixed: recipe lines must be indented with a TAB (lost in the paste).
CC = gcc
CFLAGS = -O2 -g -shared -fPIC -Wall $(EXTRACFLAGS)
LDFLAGS = -Wl,-z,now,-z,relro $(EXTRALDFLAGS)
LIBS = -lunwind-x86_64 -lpthread

all: libmain.so

libmain.so: main.c
	$(CC) $(CFLAGS) -o libmain.so main.c $(LDFLAGS) $(LIBS)

clean:
	@rm -f libmain.so

# Run the threading smoke test with the tracer preloaded.
test-thread: libmain.so
	@LD_PRELOAD="./libmain.so" python thread-test.py

.PHONY: all clean test-thread
#!/usr/bin/python3
"""Smoke test: exercise thread creation under the LD_PRELOADed tracer."""
import threading


def report_timeout():
    print('Timeout')


# Fire the callback after one second on a background timer thread.
timer = threading.Timer(1, report_timeout)
timer.start()
print('Thread started')
timer.join()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment