kerneltoast/0001-runtime-fix-print-races-in-IRQ-context-and-during-pr.patch Secret

## 0001-runtime-fix-print-races-in-IRQ-context-and-during-pr.patch
From bdf8d32c87c224da91a54793b1ac46e9a1c054a0 Mon Sep 17 00:00:00 2001
From: Sultan Alsawaf <sultan@openresty.com>
Date: Tue, 1 Dec 2020 19:30:03 -0800
Subject: [PATCH] runtime: fix print races in IRQ context and during print
 cleanup

Prints can race when there's a print called from IRQ context or a print
called while print cleanup takes place, which can lead to garbled print
messages, out-of-bounds memory accesses, and memory use-after-free. This
is one example of racy modification of the print buffer len in IRQ
context which caused a panic due to an out-of-bounds memory access:

BUG: unable to handle kernel paging request at ffffe8ffff621000
IP: [<ffffffffc05da0f3>] _stp_vsprint_memory+0x83/0x950 [stap_2c44636dfda18135ca3012a752599da6_13_533]
PGD 174b90067 PUD 174b8f067 PMD 174b93067 PTE 0
Oops: 0002 [#1] SMP
CPU: 12 PID: 3468 Comm: cat Kdump: loaded Tainted: G           OE  ------------   3.10.0-1127.19.1.el7.x86_64.debug #1
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS ?-20180531_142017-buildhw-08.phx2.fedoraproject.org-1.fc28 04/01/2014
task: ffff88001f4f0000 ti: ffff88004ea5c000 task.ti: ffff88004ea5c000
RIP: 0010:[<ffffffffc05da0f3>]  [<ffffffffc05da0f3>] _stp_vsprint_memory+0x83/0x950 [stap_2c44636dfda18135ca3012a752599da6_13_533]
RSP: 0018:ffff88004ea5f9a8  EFLAGS: 00010082
RAX: ffffe8ffff621001 RBX: ffffe8ffff620ff2 RCX: fffffffffffffffe
RDX: 000000000000006e RSI: ffffffffffffffff RDI: ffffc90002c23730
RBP: ffff88004ea5fa28 R08: 00000000ffffffff R09: 0000000000000073
R10: ffffc90002c243d7 R11: 0000000000000001 R12: ffffc90002c2373f
R13: ffffe8ffff621004 R14: 0000000000000012 R15: 00000000fffffffe
FS:  00007f8a9b1d4740(0000) GS:ffff880179e00000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: ffffe8ffff621000 CR3: 00000000b3e3c000 CR4: 0000000000360fe0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
 [<ffffffff8103eb89>] ? sched_clock+0x9/0x10
 [<ffffffff8114036f>] ? lock_release_holdtime.part.30+0xf/0x1a0
 [<ffffffffc05dcb80>] function___global_trace__overload_0+0x5b0/0x1220 [stap_2c44636dfda18135ca3012a752599da6_13_533]
 [<ffffffffc05d8993>] ? stp_lock_probe+0x53/0xe0 [stap_2c44636dfda18135ca3012a752599da6_13_533]
 [<ffffffff8188d879>] ? kretprobe_trampoline_holder+0x9/0x9
 [<ffffffffc05e0662>] probe_7118+0x82/0xe0 [stap_2c44636dfda18135ca3012a752599da6_13_533]
 [<ffffffffc05de866>] enter_kretprobe_common+0x256/0x490 [stap_2c44636dfda18135ca3012a752599da6_13_533]
 [<ffffffff813489f1>] ? proc_sys_open+0x51/0x60
 [<ffffffffc05dead0>] enter_kretprobe_probe+0x10/0x20 [stap_2c44636dfda18135ca3012a752599da6_13_533]
 [<ffffffff8188e1d8>] trampoline_handler+0x148/0x220
 [<ffffffff813489f1>] ? proc_sys_open+0x51/0x60
 [<ffffffff8188d89e>] kretprobe_trampoline+0x25/0x57
 [<ffffffff813489f1>] ? proc_sys_open+0x51/0x60
 [<ffffffff8188d879>] kretprobe_trampoline_holder+0x9/0x9
 [<ffffffff81384702>] ? security_inode_permission+0x22/0x30
 [<ffffffff813489a0>] ? sysctl_head_finish+0x50/0x50
 [<ffffffff812ac11d>] vfs_open+0x5d/0xb0
 [<ffffffff812bb74a>] ? may_open+0x5a/0x120
 [<ffffffff812c0af5>] do_last+0x285/0x15b0
 [<ffffffff812bf18e>] ? link_path_walk+0x27e/0x8c0
 [<ffffffff812c1ef0>] path_openat+0xd0/0x5d0
 [<ffffffff8107a7f3>] ? kvm_clock_read+0x33/0x40
 [<ffffffff812c38ad>] do_filp_open+0x4d/0xb0
 [<ffffffff81889497>] ? _raw_spin_unlock+0x27/0x40
 [<ffffffff812d5a9b>] ? __alloc_fd+0xfb/0x270
 [<ffffffff812ad784>] do_sys_open+0x124/0x220
 [<ffffffff812ad89e>] SyS_open+0x1e/0x20
 [<ffffffff8188d879>] kretprobe_trampoline_holder+0x9/0x9

This patch resolves the IRQ print races by disabling IRQs on the local
CPU when accessing said CPU's print buffer, and resolves the cleanup
races with a lock. We also protect against data corruption and panics
from prints inside NMIs now by checking if the current CPU was accessing
the log buffer when an NMI fired; in this case, the NMI's prints will be
dropped, as there is no way to safely service them without creating a
dedicated log buffer for them. This is achieved by forbidding reentrancy
with respect to _stp_print_trylock_irqsave() when the runtime context
isn't held. Reentrancy is otherwise allowed when the runtime context is
held because the runtime context provides reentrancy protection.
---
 runtime/linux/io.c            |  44 ++++---
 runtime/linux/print.c         | 238 ++++++++++++++++++++++++----------
 runtime/print.h               |   3 +
 runtime/print_flush.c         |  56 +-------
 runtime/stack.c               |  34 +++--
 runtime/transport/transport.c |   2 +-
 runtime/vsprintf.c            |  18 ++-
 translate.cxx                 |  20 ++-
 8 files changed, 266 insertions(+), 149 deletions(-)

diff --git a/runtime/linux/io.c b/runtime/linux/io.c
index 74a032c52..48adcecd7 100644
--- a/runtime/linux/io.c
+++ b/runtime/linux/io.c
@@ -20,9 +20,6 @@

 #define WARN_STRING "WARNING: "
 #define ERR_STRING "ERROR: "
-#if (STP_LOG_BUF_LEN < 10) /* sizeof(WARN_STRING) */
-#error "STP_LOG_BUF_LEN is too short"
-#endif

 enum code { INFO=0, WARN, ERROR, DBUG };

@@ -31,25 +28,37 @@ static void _stp_vlog (enum code type, const char *func, int line, const char *f

 static void _stp_vlog (enum code type, const char *func, int line, const char *fmt, va_list args)
 {
+	struct _stp_log *log;
+	unsigned long flags;
+	size_t bytes_avail;
 	int num;
-	char *buf = per_cpu_ptr(Stp_lbuf, get_cpu());
+	char *buf;
 	int start = 0;

+	if (!_stp_print_trylock_irqsave(&flags))
+		return;
+
+	log = per_cpu_ptr(_stp_log_pcpu, raw_smp_processor_id());
+	bytes_avail = STP_BUFFER_SIZE - log->len;
+	if (unlikely(!bytes_avail))
+		goto err_unlock;
+
+	buf = &log->buf[log->len];
 	if (type == DBUG) {
-		start = _stp_snprintf(buf, STP_LOG_BUF_LEN, "%s:%d: ", func, line);
+		start = _stp_snprintf(buf, bytes_avail, "%s:%d: ", func, line);
 	} else if (type == WARN) {
-		/* This strcpy() is OK, since we know STP_LOG_BUF_LEN
-		 * is > sizeof(WARN_STRING). */
-		strcpy (buf, WARN_STRING);
-		start = sizeof(WARN_STRING) - 1;
+		strncpy(buf, WARN_STRING, bytes_avail);
+		start = min(bytes_avail, sizeof(WARN_STRING) - 1);
 	} else if (type == ERROR) {
-		/* This strcpy() is OK, since we know STP_LOG_BUF_LEN
-		 * is > sizeof(ERR_STRING) (which is < sizeof(WARN_STRING). */
-		strcpy (buf, ERR_STRING);
-		start = sizeof(ERR_STRING) - 1;
+		strncpy(buf, ERR_STRING, bytes_avail);
+		start = min(bytes_avail, sizeof(ERR_STRING) - 1);
 	}

-	num = vscnprintf (buf + start, STP_LOG_BUF_LEN - start - 1, fmt, args);
+	bytes_avail -= start;
+	if (unlikely(!bytes_avail))
+		goto err_unlock;
+
+	num = vscnprintf(buf + start, bytes_avail - 1, fmt, args);
 	if (num + start) {
 		if (buf[num + start - 1] != '\n') {
 			buf[num + start] = '\n';
@@ -66,12 +75,13 @@ static void _stp_vlog (enum code type, const char *func, int line, const char *f
 		if (type != DBUG) {
 			_stp_ctl_send(STP_OOB_DATA, buf, start + num + 1);
 		} else {
-			_stp_print(buf);
-			_stp_print_flush();
+			log->len += start + num + 1;
+			__stp_print_flush(log);
 		}
 #endif
 	}
-	put_cpu();
+err_unlock:
+	_stp_print_unlock_irqrestore(&flags);
 }

 /** Prints warning.
diff --git a/runtime/linux/print.c b/runtime/linux/print.c
index 777bca8b0..5bbe66949 100644
--- a/runtime/linux/print.c
+++ b/runtime/linux/print.c
@@ -35,84 +35,181 @@
  * @{
  */

-typedef struct __stp_pbuf {
-	uint32_t len;			/* bytes used in the buffer */
+struct _stp_log {
+	unsigned int len; /* Bytes used in the buffer */
 	char buf[STP_BUFFER_SIZE];
-} _stp_pbuf;
+	atomic_t reentrancy_lock;
+};
+#include "print_flush.c"

-static void *Stp_pbuf = NULL;
+static struct _stp_log *_stp_log_pcpu;
+static bool _stp_log_stop;

-/** private buffer for _stp_vlog() */
-#ifndef STP_LOG_BUF_LEN
-#define STP_LOG_BUF_LEN 256
-#endif
+/*
+ * An atomic counter is used to synchronize every possible print buffer usage
+ * with the _stp_print_cleanup() function. The cleanup function sets the counter
+ * to INT_MAX after waiting for everything using the print buffer to finish. We
+ * cannot use a lock primitive to implement this because lock_acquire() contains
+ * tracepoints. We instead use _stp_print_ctr as if it were a read-write lock.
+ */
+static atomic_t _stp_print_ctr = ATOMIC_INIT(0);
+
+/*
+ * This disables IRQs to make per-CPU print buffer accesses atomic. There is a
+ * reentrancy protection mechanism specifically for NMIs, since they can violate
+ * our atomic guarantee. Reentrancy is otherwise allowed within code sections
+ * that have the runtime context held (via _stp_runtime_entryfn_get_context()).
+ */
+static bool _stp_print_trylock_irqsave(unsigned long *flags)
+{
+	bool context_held = false;
+	struct _stp_log *log;
+
+	if (!atomic_add_unless(&_stp_print_ctr, 1, INT_MAX))
+		return false;
+
+	/*
+	 * Check the per-CPU reentrancy lock for contention, unless the runtime
+	 * context is already held, in which case we already have reentrancy
+	 * protection. Otherwise, if the reentrancy lock is contented, that
+	 * means we're either inside an NMI that fired while the current CPU was
+	 * accessing the log buffer, or something is trying to nest calls to
+	 * _stp_print_trylock_irqsave(). Our only choice is to reject the log
+	 * access attempt in this case because log buffer corruption and panics
+	 * could ensue if we're inside an NMI.
+	 */
+	if (atomic_add_unless(&_stp_contexts_busy_ctr, 1, INT_MAX)) {
+		struct context *c = _stp_runtime_get_context();
+		context_held = c && atomic_read(&c->busy);
+		atomic_dec(&_stp_contexts_busy_ctr);
+	}
+
+	local_irq_save(*flags);
+
+	/* Fall back onto the reentrancy lock if the context isn't held */
+	if (!context_held) {
+		log = per_cpu_ptr(_stp_log_pcpu, raw_smp_processor_id());
+		if (atomic_cmpxchg(&log->reentrancy_lock, 0, 1))
+			goto irqrestore;
+	}
+
+	return true;

-typedef char _stp_lbuf[STP_LOG_BUF_LEN];
-static void *Stp_lbuf = NULL;
+irqrestore:
+	local_irq_restore(*flags);
+print_unlock:
+	atomic_dec(&_stp_print_ctr);
+	return false;
+}
+
+static void _stp_print_unlock_irqrestore(unsigned long *flags)
+{
+	bool context_held = false;
+	struct _stp_log *log;
+
+	if (atomic_add_unless(&_stp_contexts_busy_ctr, 1, INT_MAX)) {
+		struct context *c = _stp_runtime_get_context();
+		context_held = c && atomic_read(&c->busy);
+		atomic_dec(&_stp_contexts_busy_ctr);
+	}
+
+	if (!context_held) {
+		log = per_cpu_ptr(_stp_log_pcpu, raw_smp_processor_id());
+		atomic_set(&log->reentrancy_lock, 0);
+	}
+
+	local_irq_restore(*flags);
+	atomic_dec(&_stp_print_ctr);
+}

 /* create percpu print and io buffers */
 static int _stp_print_init (void)
 {
-	Stp_pbuf = _stp_alloc_percpu(sizeof(_stp_pbuf));
-	if (unlikely(Stp_pbuf == 0))
-		return -1;
-
-	/* now initialize IO buffer used in io.c */
-	Stp_lbuf = _stp_alloc_percpu(sizeof(_stp_lbuf));
-	if (unlikely(Stp_lbuf == 0)) {
-		_stp_free_percpu(Stp_pbuf);
-		return -1;
+	unsigned int cpu;
+
+	_stp_log_pcpu = _stp_alloc_percpu(sizeof(*_stp_log_pcpu));
+	if (!_stp_log_pcpu)
+		return -ENOMEM;
+
+	for_each_possible_cpu(cpu) {
+		struct _stp_log *log = per_cpu_ptr(_stp_log_pcpu, cpu);
+
+		log->reentrancy_lock = (atomic_t)ATOMIC_INIT(0);
 	}
 	return 0;
 }

 static void _stp_print_cleanup (void)
 {
-	if (Stp_pbuf)
-		_stp_free_percpu(Stp_pbuf);
-	if (Stp_lbuf)
-		_stp_free_percpu(Stp_lbuf);
-}
+	unsigned int cpu;

-#include "print_flush.c"
+	/* Wait for the loggers to finish modifying the print buffers */
+	while (atomic_cmpxchg(&_stp_print_ctr, 0, INT_MAX))
+		cpu_relax();
+
+	for_each_possible_cpu(cpu) {
+		struct _stp_log *log = per_cpu_ptr(_stp_log_pcpu, cpu);
+
+		/*
+		 * Flush anything that could be left in the print buffer. It is
+		 * safe to do this without any kind of synchronization mechanism
+		 * because nothing is using this print buffer anymore.
+		 */
+		__stp_print_flush(log);
+	}
+
+	_stp_free_percpu(_stp_log_pcpu);
+}

 static inline void _stp_print_flush(void)
 {
-	stp_print_flush(per_cpu_ptr(Stp_pbuf, smp_processor_id()));
+	struct _stp_log *log;
+	unsigned long flags;
+
+	if (!_stp_print_trylock_irqsave(&flags))
+		return;
+
+	log = per_cpu_ptr(_stp_log_pcpu, raw_smp_processor_id());
+	__stp_print_flush(log);
+	_stp_print_unlock_irqrestore(&flags);
 }
+
 #ifndef STP_MAXBINARYARGS
 #define STP_MAXBINARYARGS 127
 #endif


-/** Reserves space in the output buffer for direct I/O.
+/** Reserves space in the output buffer for direct I/O. Must be called with
+ * _stp_print_trylock_irqsave() held.
  */
 static void * _stp_reserve_bytes (int numbytes)
 {
-	_stp_pbuf *pb = per_cpu_ptr(Stp_pbuf, smp_processor_id());
-	int size = STP_BUFFER_SIZE - pb->len;
-	void * ret;
+	struct _stp_log *log;
+	char *ret;

 	if (unlikely(numbytes == 0 || numbytes > STP_BUFFER_SIZE))
 		return NULL;

-	if (unlikely(numbytes > size))
-		_stp_print_flush();
+	log = per_cpu_ptr(_stp_log_pcpu, raw_smp_processor_id());
+	if (unlikely(numbytes > (STP_BUFFER_SIZE - log->len)))
+		__stp_print_flush(log);

-	ret = pb->buf + pb->len;
-	pb->len += numbytes;
+	ret = &log->buf[log->len];
+	log->len += numbytes;
 	return ret;
 }


 static void _stp_unreserve_bytes (int numbytes)
 {
-	_stp_pbuf *pb = per_cpu_ptr(Stp_pbuf, smp_processor_id());
+	struct _stp_log *log;

-	if (unlikely(numbytes == 0 || numbytes > pb->len))
+	if (unlikely(numbytes == 0))
 		return;

-	pb->len -= numbytes;
+	log = per_cpu_ptr(_stp_log_pcpu, raw_smp_processor_id());
+	if (numbytes <= log->len)
+		log->len -= numbytes;
 }

 /** Write 64-bit args directly into the output stream.
@@ -123,22 +220,25 @@ static void _stp_unreserve_bytes (int numbytes)
  */
 static void _stp_print_binary (int num, ...)
 {
+	unsigned long flags;
 	va_list vargs;
 	int i;
 	int64_t *args;
-
+
 	if (unlikely(num > STP_MAXBINARYARGS))
 		num = STP_MAXBINARYARGS;

-	args = _stp_reserve_bytes(num * sizeof(int64_t));
+	if (!_stp_print_trylock_irqsave(&flags))
+		return;

-	if (likely(args != NULL)) {
+	args = _stp_reserve_bytes(num * sizeof(int64_t));
+	if (args) {
 		va_start(vargs, num);
-		for (i = 0; i < num; i++) {
+		for (i = 0; i < num; i++)
 			args[i] = va_arg(vargs, int64_t);
-		}
 		va_end(vargs);
 	}
+	_stp_print_unlock_irqrestore(&flags);
 }

 /** Print into the print buffer.
@@ -149,6 +249,7 @@ static void _stp_print_binary (int num, ...)
 static void _stp_printf (const char *fmt, ...)
 {
 	va_list args;
+
 	va_start(args, fmt);
 	_stp_vsnprintf(NULL, 0, fmt, args);
 	va_end(args);
@@ -160,37 +261,38 @@ static void _stp_printf (const char *fmt, ...)

 static void _stp_print (const char *str)
 {
-	_stp_pbuf *pb = per_cpu_ptr(Stp_pbuf, smp_processor_id());
-	char *end = pb->buf + STP_BUFFER_SIZE;
-	char *ptr = pb->buf + pb->len;
-	char *instr = (char *)str;
-
-	while (ptr < end && *instr)
-		*ptr++ = *instr++;
-
-	/* Did loop terminate due to lack of buffer space? */
-	if (unlikely(*instr)) {
-		/* Don't break strings across subbufs. */
-		/* Restart after flushing. */
-		_stp_print_flush();
-		end = pb->buf + STP_BUFFER_SIZE;
-		ptr = pb->buf + pb->len;
-		instr = (char *)str;
-		while (ptr < end && *instr)
-			*ptr++ = *instr++;
+	struct _stp_log *log;
+	unsigned long flags;
+	unsigned int i;
+
+	if (!_stp_print_trylock_irqsave(&flags))
+		return;
+
+	log = per_cpu_ptr(_stp_log_pcpu, raw_smp_processor_id());
+	while (1) {
+		for (i = log->len; i < STP_BUFFER_SIZE && *str; i++)
+			log->buf[i] = *str++;
+		log->len = i;
+		if (likely(!*str))
+			break;
+		__stp_print_flush(log);
 	}
-	pb->len = ptr - pb->buf;
+	_stp_print_unlock_irqrestore(&flags);
 }

 static void _stp_print_char (const char c)
 {
-	_stp_pbuf *pb = per_cpu_ptr(Stp_pbuf, smp_processor_id());
-	int size = STP_BUFFER_SIZE - pb->len;
-	if (unlikely(1 >= size))
-		_stp_print_flush();
-
-	pb->buf[pb->len] = c;
-	pb->len ++;
+	struct _stp_log *log;
+	unsigned long flags;
+
+	if (!_stp_print_trylock_irqsave(&flags))
+		return;
+
+	log = per_cpu_ptr(_stp_log_pcpu, raw_smp_processor_id());
+	if (unlikely(log->len == STP_BUFFER_SIZE))
+		__stp_print_flush(log);
+	log->buf[log->len++] = c;
+	_stp_print_unlock_irqrestore(&flags);
 }

 static void _stp_print_kernel_info(char *sname, char *vstr, int ctx, int num_probes)
diff --git a/runtime/print.h b/runtime/print.h
index ede71f033..ffdea594d 100644
--- a/runtime/print.h
+++ b/runtime/print.h
@@ -10,6 +10,9 @@
 #ifndef _STP_PRINT_H_
 #define _STP_PRINT_H_

+/* The lock must be held with IRQs disabled to do any printing */
+static bool _stp_print_trylock_irqsave(unsigned long *flags);
+static void _stp_print_unlock_irqrestore(unsigned long *flags);
 static int _stp_print_init(void);
 static void _stp_print_cleanup(void);
 static void *_stp_reserve_bytes(int numbytes);
diff --git a/runtime/print_flush.c b/runtime/print_flush.c
index cf40a2645..acd6a32d9 100644
--- a/runtime/print_flush.c
+++ b/runtime/print_flush.c
@@ -13,40 +13,31 @@
  * is filled, or this is called. This MUST be called before returning
  * from a probe or accumulated output in the print buffer will be lost.
  *
- * @note Preemption must be disabled to use this.
+ * @note Interrupts must be disabled to use this.
  */

-static STP_DEFINE_SPINLOCK(_stp_print_lock);
-
-void stp_print_flush(_stp_pbuf *pb)
+static void __stp_print_flush(struct _stp_log *log)
 {
-	size_t len = pb->len;
+	size_t len = log->len;
 	void *entry = NULL;

 	/* check to see if there is anything in the buffer */
 	if (likely(len == 0))
 		return;

-	pb->len = 0;
-
-	if (unlikely(_stp_transport_get_state() != STP_TRANSPORT_RUNNING))
-		return;
+	log->len = 0;

 	dbug_trans(1, "len = %zu\n", len);

 #ifdef STP_BULKMODE
 #ifdef NO_PERCPU_HEADERS
 	{
-		struct context* __restrict__ c = NULL;
-		char *bufp = pb->buf;
+		char *bufp = log->buf;
 		int inode_locked;

-		c = _stp_runtime_entryfn_get_context();
-
 		if (!(inode_locked = _stp_transport_trylock_relay_inode())) {
 			atomic_inc (&_stp_transport_failures);
 #ifndef STP_TRANSPORT_RISKY
-			_stp_runtime_entryfn_put_context(c);
 			return;
 #endif
 		}
@@ -70,26 +61,20 @@ void stp_print_flush(_stp_pbuf *pb)

 		if (inode_locked)
 			_stp_transport_unlock_relay_inode();
-
-		_stp_runtime_entryfn_put_context(c);
 	}

 #else  /* !NO_PERCPU_HEADERS */

 	{
-		struct context* __restrict__ c = NULL;
-		char *bufp = pb->buf;
+		char *bufp = log->buf;
 		struct _stp_trace t = {	.sequence = _stp_seq_inc(),
 					.pdu_len = len};
 		size_t bytes_reserved;
 		int inode_locked;

-		c = _stp_runtime_entryfn_get_context();
-
 		if (!(inode_locked = _stp_transport_trylock_relay_inode())) {
 			atomic_inc (&_stp_transport_failures);
 #ifndef STP_TRANSPORT_RISKY
-			_stp_runtime_entryfn_put_context(c);
 			return;
 #endif
 		}
@@ -124,48 +109,24 @@ void stp_print_flush(_stp_pbuf *pb)

 		if (inode_locked)
 			_stp_transport_unlock_relay_inode();
-
-		_stp_runtime_entryfn_put_context(c);
 	}
 #endif /* !NO_PERCPU_HEADERS */

 #else  /* !STP_BULKMODE */

 	{
-		unsigned long flags;
-		struct context* __restrict__ c = NULL;
-		char *bufp = pb->buf;
+		char *bufp = log->buf;
 		int inode_locked;

-		/* Prevent probe reentrancy on _stp_print_lock.
-		 *
-		 * Since stp_print_flush may be called from probe context, we
-		 * have to make sure that its lock, _stp_print_lock, can't
-		 * possibly be held outside probe context too.  We ensure this
-		 * by grabbing the context here, so any probe triggered by this
-		 * region will appear reentrant and be skipped rather than
-		 * deadlock.  Failure to get_context just means we're already
-		 * in a probe, which is fine.
-		 *
-		 * (see also _stp_ctl_send for a similar situation)
-                 *
-                 * A better solution would be to replace this
-                 * concurrency-control-laden effort with a lockless
-                 * algorithm.
-		 */
-		c = _stp_runtime_entryfn_get_context();
-
 		if (!(inode_locked = _stp_transport_trylock_relay_inode())) {
 			atomic_inc (&_stp_transport_failures);
 #ifndef STP_TRANSPORT_RISKY
 			dbug_trans(0, "discarding %zu bytes of data\n", len);
-			_stp_runtime_entryfn_put_context(c);
 			return;
 #endif
 		}

 		dbug_trans(1, "calling _stp_data_write...\n");
-		stp_spin_lock_irqsave(&_stp_print_lock, flags);
 		while (len > 0) {
 			size_t bytes_reserved;

@@ -182,12 +143,9 @@ void stp_print_flush(_stp_pbuf *pb)
 			    break;
 			}
 		}
-		stp_spin_unlock_irqrestore(&_stp_print_lock, flags);

 		if (inode_locked)
 			_stp_transport_unlock_relay_inode();
-
-		_stp_runtime_entryfn_put_context(c);
 	}
 #endif /* !STP_BULKMODE */
 }
diff --git a/runtime/stack.c b/runtime/stack.c
index 241ccf793..da23d4395 100644
--- a/runtime/stack.c
+++ b/runtime/stack.c
@@ -690,13 +690,20 @@ static void _stp_stack_kernel_sprint(char *str, int size, struct context* c,
 	 * then call _stp_stack_print,
 	 * then copy the result into the output string
 	 * and clear the print buffer. */
-	_stp_pbuf *pb = per_cpu_ptr(Stp_pbuf, smp_processor_id());
-	_stp_print_flush();
+	struct _stp_log *log;
+	unsigned long flags;

-	_stp_stack_kernel_print(c, sym_flags);
+	if (!_stp_print_trylock_irqsave(&flags)) {
+		*str = '\0';
+		return;
+	}

-	strlcpy(str, pb->buf, size < (int)pb->len ? size : (int)pb->len);
-	pb->len = 0;
+	log = per_cpu_ptr(_stp_log_pcpu, raw_smp_processor_id());
+	__stp_print_flush(log);
+	_stp_stack_kernel_print(c, sym_flags);
+	strlcpy(str, log->buf, min_t(int, size, log->len));
+	log->len = 0;
+	_stp_print_unlock_irqrestore(&flags);
 }

 static void _stp_stack_user_sprint(char *str, int size, struct context* c,
@@ -707,13 +714,20 @@ static void _stp_stack_user_sprint(char *str, int size, struct context* c,
 	 * then call _stp_stack_print,
 	 * then copy the result into the output string
 	 * and clear the print buffer. */
-	_stp_pbuf *pb = per_cpu_ptr(Stp_pbuf, smp_processor_id());
-	_stp_print_flush();
+	struct _stp_log *log;
+	unsigned long flags;

-	_stp_stack_user_print(c, sym_flags);
+	if (!_stp_print_trylock_irqsave(&flags)) {
+		*str = '\0';
+		return;
+	}

-	strlcpy(str, pb->buf, size < (int)pb->len ? size : (int)pb->len);
-	pb->len = 0;
+	log = per_cpu_ptr(_stp_log_pcpu, raw_smp_processor_id());
+	__stp_print_flush(log);
+	_stp_stack_user_print(c, sym_flags);
+	strlcpy(str, log->buf, min_t(int, size, log->len));
+	log->len = 0;
+	_stp_print_unlock_irqrestore(&flags);
 }

 #endif /* _STACK_C_ */
diff --git a/runtime/transport/transport.c b/runtime/transport/transport.c
index 57955334b..44e69b68c 100644
--- a/runtime/transport/transport.c
+++ b/runtime/transport/transport.c
@@ -540,8 +540,8 @@ static void _stp_transport_close(void)
 		   current->pid);
 	_stp_cleanup_and_exit(0);
 	_stp_unregister_ctl_channel();
+	_stp_print_cleanup(); /* Requires the transport, so free this first */
 	_stp_transport_fs_close();
-	_stp_print_cleanup();	/* free print buffers */
 	_stp_mem_debug_done();

 	dbug_trans(1, "---- CLOSED ----\n");
diff --git a/runtime/vsprintf.c b/runtime/vsprintf.c
index 28fd18f16..417d9f7f3 100644
--- a/runtime/vsprintf.c
+++ b/runtime/vsprintf.c
@@ -542,6 +542,8 @@ _stp_vsnprintf(char *buf, size_t size, const char *fmt, va_list args)
 				   number of chars for from string */
 	int qualifier;		/* 'h', 'l', or 'L' for integer fields */
 	int num_bytes = 0;
+	unsigned long irqflags = 0;
+	bool got_print_lock = false;

 	/* Reject out-of-range values early */
 	if (unlikely((int) size < 0))
@@ -724,11 +726,14 @@ _stp_vsnprintf(char *buf, size_t size, const char *fmt, va_list args)
 	    num_bytes = STP_BUFFER_SIZE;
 	  }

+	  if (!_stp_print_trylock_irqsave(&irqflags))
+	    return 0;
 	  str = (char*)_stp_reserve_bytes(num_bytes);
 	  if (str == NULL) {
 	    _stp_error("Couldn't reserve any print buffer space\n");
-	    return 0;
+	    goto err_unlock;
 	  }
+	  got_print_lock = true;
 	  size = num_bytes;
 	  end = str + size - 1;

@@ -820,8 +825,10 @@ _stp_vsnprintf(char *buf, size_t size, const char *fmt, va_list args)
 					field_width, precision,
 					*fmt, flags);
 			if (unlikely(str == NULL)) {
-				if (num_bytes > 0)
+				if (num_bytes > 0) {
 					_stp_unreserve_bytes(num_bytes);
+					goto err_unlock;
+				}
 				return 0;
 			}
 			continue;
@@ -923,7 +930,14 @@ _stp_vsnprintf(char *buf, size_t size, const char *fmt, va_list args)
                   /* don't write out a null byte if the buf size is zero */
                   *end = '\0';
 	}
+
+	if (got_print_lock)
+		_stp_print_unlock_irqrestore(&irqflags);
 	return str-buf;
+
+err_unlock:
+	_stp_print_unlock_irqrestore(&irqflags);
+	return 0;
 }

 #endif /* _VSPRINTF_C_ */
diff --git a/translate.cxx b/translate.cxx
index 53f1d0725..f0195486c 100644
--- a/translate.cxx
+++ b/translate.cxx
@@ -1354,6 +1354,9 @@ c_unparser::emit_compiled_printfs ()
       o->newline() << "unsigned long ptr_value;";
       o->newline() << "int num_bytes;";

+      if (print_to_stream)
+	  o->newline() << "unsigned long irqflags;";
+
       o->newline() << "(void) width;";
       o->newline() << "(void) precision;";
       o->newline() << "(void) ptr_value;";
@@ -1452,7 +1455,9 @@ c_unparser::emit_compiled_printfs ()
 	    }

 	  o->newline() << "num_bytes = clamp(num_bytes, 0, STP_BUFFER_SIZE);";
-	  o->newline() << "str = (char*)_stp_reserve_bytes(num_bytes);";
+	  o->newline() << "if (!_stp_print_trylock_irqsave(&irqflags))";
+	  o->newline(1) << "return;";
+	  o->newline(-1) << "str = (char*)_stp_reserve_bytes(num_bytes);";
 	  o->newline() << "end = str ? str + num_bytes - 1 : 0;";
         }
       else // !print_to_stream
@@ -1547,8 +1552,14 @@ c_unparser::emit_compiled_printfs ()
 	      o->newline() << "if (unlikely(str == NULL)) {";
 	      o->indent(1);
 	      if (print_to_stream)
+                {
 		  o->newline() << "_stp_unreserve_bytes(num_bytes);";
-	      o->newline() << "return;";
+	          o->newline() << "goto err_unlock;";
+                }
+              else
+                {
+	          o->newline() << "return;";
+                }
 	      o->newline(-1) << "}";
 	      break;

@@ -1575,6 +1586,11 @@ c_unparser::emit_compiled_printfs ()

       o->newline(-1) << "}";

+      if (print_to_stream)
+        {
+          o->newline(-1) << "err_unlock:";
+          o->newline(1) << "_stp_print_unlock_irqrestore(&irqflags);";
+        }
       o->newline(-1) << "}";
     }
   o->newline() << "#endif // STP_LEGACY_PRINT";
--
2.29.2