From db4a18bd769a9a237bb764c5bae493502f070089 Mon Sep 17 00:00:00 2001
From: Sultan Alsawaf <sultan@openresty.com>
Date: Fri, 4 Dec 2020 17:11:25 -0800
Subject: [PATCH] linux: use relayfs properly to eliminate mutex_trylock from
print flush
Using a mutex_trylock() in __stp_print_flush() leads to a lot of havoc,
for numerous reasons. Firstly, since __stp_print_flush() can be called
from IRQ context, holding the inode mutex there would render the
recorded mutex owner meaningless, since mutex locks can only be held in
contexts backed by the scheduler. Secondly, the mutex_trylock
implementation takes a spin_lock() internally, which leads to two
issues: IRQs aren't disabled when acquiring this spin_lock(), so using
it from IRQ context can lead to a deadlock, and since spin locks can
have tracepoints attached via lock_acquire(), the spin_lock() can
recurse on itself inside a stap probe and deadlock, like so:
#0 [ffff88017f6d7a08] kvm_wait at ffffffff81079f5a
#1 [ffff88017f6d7a30] __pv_queued_spin_lock_slowpath at ffffffff8114f51e
#2 [ffff88017f6d7a70] queued_spin_lock_slowpath at ffffffff810e842b
#3 [ffff88017f6d7a80] mutex_trylock at ffffffff81882b1b
#4 [ffff88017f6d7ab8] _stp_transport_trylock_relay_inode at ffffffffc0c599df [stap_47650d3377d05db0ab7cbbaa25765809__11657]
#5 [ffff88017f6d7ad8] __stp_print_flush at ffffffffc09b6483 [stap_47650d3377d05db0ab7cbbaa25765809__11657]
#6 [ffff88017f6d7b10] probe_7879 at ffffffffc0a98c85 [stap_47650d3377d05db0ab7cbbaa25765809__11657]
#7 [ffff88017f6d7b38] enter_real_tracepoint_probe_1543 at ffffffffc0c3b757 [stap_47650d3377d05db0ab7cbbaa25765809__11657]
#8 [ffff88017f6d7b70] enter_tracepoint_probe_1543 at ffffffffc09b117e [stap_47650d3377d05db0ab7cbbaa25765809__11657]
#9 [ffff88017f6d7b80] lock_acquire at ffffffff811460ba
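Condensed, the cycle in that backtrace looks like the following (an
illustrative sketch, not actual kernel or stap-generated source; only
the function names come from the backtrace, and the *_sketch() helpers
are hypothetical stand-ins for the real kernel functions):

static void probe_7879(void);           /* stap probe handler */

static void lock_acquire_sketch(void)
{
        probe_7879();                   /* spinlock tracepoint fires the
                                         * same stap probe again */
}

static void spin_lock_sketch(void)
{
        lock_acquire_sketch();          /* lockdep hook; IRQs still enabled */
}

static int mutex_trylock_sketch(void)
{
        spin_lock_sketch();             /* internal wait-lock spinlock */
        return 0;
}

static void __stp_print_flush_sketch(void)
{
        mutex_trylock_sketch();         /* attempt to lock the relay inode */
}

static void probe_7879(void)
{
        __stp_print_flush_sketch();     /* flush from probe/IRQ context
                                         * -> recursion -> deadlock */
}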
The reason the mutex_trylock() was needed in the first place is that
staprun doesn't use the relayfs API properly when reading buffers in
non-bulk mode: it tries to read all CPUs' buffers from a single thread,
when it should instead read each CPU's buffer from a thread running on
that CPU, in order to utilize relayfs' synchronization guarantees,
which are provided by disabling IRQs on the local CPU while a buffer is
modified. This change makes staprun always use per-CPU threads to read
print buffers, so the mutex_trylock() is no longer needed in the print
flush routine, which resolves a wide variety of serious bugs.
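
For illustration, here is a rough userspace sketch of the per-CPU
reader model (a hypothetical simplification of staprun/relay.c's
reader_thread(); the actual code also handles signals, polling, and
file rotation):

#define _GNU_SOURCE
#include <pthread.h>
#include <sched.h>
#include <stdio.h>
#include <unistd.h>

static void *reader_thread(void *arg)
{
        long cpu = (long)arg;
        cpu_set_t cpu_mask;

        /* Pin this reader to its CPU so reads are serialized against
         * relay writers, which synchronize by disabling IRQs on the
         * local CPU while modifying the buffer. */
        CPU_ZERO(&cpu_mask);
        CPU_SET(cpu, &cpu_mask);
        if (sched_setaffinity(0, sizeof(cpu_mask), &cpu_mask) < 0)
                perror("sched_setaffinity");

        /* ... open and drain this CPU's trace file here ... */
        return NULL;
}

int main(void)
{
        long i, ncpus = sysconf(_SC_NPROCESSORS_ONLN);
        pthread_t tid[ncpus];

        /* One reader thread per CPU, always, not just in bulk mode. */
        for (i = 0; i < ncpus; i++)
                pthread_create(&tid[i], NULL, reader_thread, (void *)i);
        for (i = 0; i < ncpus; i++)
                pthread_join(tid[i], NULL);
        return 0;
}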
---
runtime/print_flush.c | 127 +++------------------------------
runtime/transport/relay_v2.c | 128 +---------------------------------
runtime/transport/transport.c | 18 -----
runtime/transport/transport.h | 18 -----
staprun/relay.c | 27 +++----
5 files changed, 24 insertions(+), 294 deletions(-)
diff --git a/runtime/print_flush.c b/runtime/print_flush.c
index acd6a32d9..f4d72d30f 100644
--- a/runtime/print_flush.c
+++ b/runtime/print_flush.c
@@ -18,6 +18,7 @@
static void __stp_print_flush(struct _stp_log *log)
{
+ char *bufp = log->buf;
size_t len = log->len;
void *entry = NULL;
@@ -26,126 +27,20 @@ static void __stp_print_flush(struct _stp_log *log)
return;
log->len = 0;
-
dbug_trans(1, "len = %zu\n", len);
-
-#ifdef STP_BULKMODE
-#ifdef NO_PERCPU_HEADERS
- {
- char *bufp = log->buf;
- int inode_locked;
-
- if (!(inode_locked = _stp_transport_trylock_relay_inode())) {
- atomic_inc (&_stp_transport_failures);
-#ifndef STP_TRANSPORT_RISKY
- return;
-#endif
- }
-
- while (len > 0) {
- size_t bytes_reserved;
-
- bytes_reserved = _stp_data_write_reserve(len, &entry);
- if (likely(entry && bytes_reserved > 0)) {
- memcpy(_stp_data_entry_data(entry), bufp,
- bytes_reserved);
- _stp_data_write_commit(entry);
- bufp += bytes_reserved;
- len -= bytes_reserved;
- }
- else {
- atomic_inc(&_stp_transport_failures);
- break;
- }
- }
-
- if (inode_locked)
- _stp_transport_unlock_relay_inode();
- }
-
-#else /* !NO_PERCPU_HEADERS */
-
- {
- char *bufp = log->buf;
- struct _stp_trace t = { .sequence = _stp_seq_inc(),
- .pdu_len = len};
+ do {
size_t bytes_reserved;
- int inode_locked;
- if (!(inode_locked = _stp_transport_trylock_relay_inode())) {
- atomic_inc (&_stp_transport_failures);
-#ifndef STP_TRANSPORT_RISKY
- return;
-#endif
- }
-
- bytes_reserved = _stp_data_write_reserve(sizeof(struct _stp_trace), &entry);
- if (likely(entry && bytes_reserved > 0)) {
- /* prevent unaligned access by using memcpy() */
- memcpy(_stp_data_entry_data(entry), &t, sizeof(t));
+ bytes_reserved = _stp_data_write_reserve(len, &entry);
+ if (likely(entry && bytes_reserved)) {
+ memcpy(_stp_data_entry_data(entry), bufp,
+ bytes_reserved);
_stp_data_write_commit(entry);
- }
- else {
+ bufp += bytes_reserved;
+ len -= bytes_reserved;
+ } else {
atomic_inc(&_stp_transport_failures);
- goto done;
+ break;
}
-
- while (len > 0) {
- bytes_reserved = _stp_data_write_reserve(len, &entry);
- if (likely(entry && bytes_reserved > 0)) {
- memcpy(_stp_data_entry_data(entry), bufp,
- bytes_reserved);
- _stp_data_write_commit(entry);
- bufp += bytes_reserved;
- len -= bytes_reserved;
- }
- else {
- atomic_inc(&_stp_transport_failures);
- break;
- }
- }
-
- done:
-
- if (inode_locked)
- _stp_transport_unlock_relay_inode();
- }
-#endif /* !NO_PERCPU_HEADERS */
-
-#else /* !STP_BULKMODE */
-
- {
- char *bufp = log->buf;
- int inode_locked;
-
- if (!(inode_locked = _stp_transport_trylock_relay_inode())) {
- atomic_inc (&_stp_transport_failures);
-#ifndef STP_TRANSPORT_RISKY
- dbug_trans(0, "discarding %zu bytes of data\n", len);
- return;
-#endif
- }
-
- dbug_trans(1, "calling _stp_data_write...\n");
- while (len > 0) {
- size_t bytes_reserved;
-
- bytes_reserved = _stp_data_write_reserve(len, &entry);
- if (likely(entry && bytes_reserved > 0)) {
- memcpy(_stp_data_entry_data(entry), bufp,
- bytes_reserved);
- _stp_data_write_commit(entry);
- bufp += bytes_reserved;
- len -= bytes_reserved;
- }
- else {
- atomic_inc(&_stp_transport_failures);
- break;
- }
- }
-
- if (inode_locked)
- _stp_transport_unlock_relay_inode();
- }
-#endif /* !STP_BULKMODE */
+ } while (len > 0);
}
diff --git a/runtime/transport/relay_v2.c b/runtime/transport/relay_v2.c
index ff621f71d..8fecf1c7f 100644
--- a/runtime/transport/relay_v2.c
+++ b/runtime/transport/relay_v2.c
@@ -117,24 +117,17 @@ static void __stp_relay_wakeup_readers(struct rchan_buf *buf)
static void __stp_relay_wakeup_timer(stp_timer_callback_parameter_t unused)
{
-#ifdef STP_BULKMODE
int i;
-#endif
if (atomic_read(&_stp_relay_data.wakeup)) {
struct rchan_buf *buf;
atomic_set(&_stp_relay_data.wakeup, 0);
-#ifdef STP_BULKMODE
for_each_possible_cpu(i) {
buf = _stp_get_rchan_subbuf(_stp_relay_data.rchan->buf,
i);
__stp_relay_wakeup_readers(buf);
}
-#else
- buf = _stp_get_rchan_subbuf(_stp_relay_data.rchan->buf, 0);
- __stp_relay_wakeup_readers(buf);
-#endif
}
if (atomic_read(&_stp_relay_data.transport_state) == STP_TRANSPORT_RUNNING)
@@ -235,55 +228,8 @@ static void _stp_transport_data_fs_stop(void)
atomic_set (&_stp_relay_data.transport_state, STP_TRANSPORT_STOPPED);
del_timer_sync(&_stp_relay_data.timer);
dbug_trans(0, "flushing...\n");
- if (_stp_relay_data.rchan) {
- struct rchan_buf *buf;
-
- /* NB we cannot call relay_flush() directly here since
- * we need to do inode locking ourselves.
- */
-
-#ifdef STP_BULKMODE
- unsigned int i;
- struct rchan *rchan = _stp_relay_data.rchan;
-
- for_each_possible_cpu(i) {
- buf = _stp_get_rchan_subbuf(rchan->buf, i);
- if (buf) {
- struct inode *inode = buf->dentry->d_inode;
-
- /* NB we are in the syscall context which
- * allows sleeping. The following inode
- * locking might sleep. See PR26131. */
- _stp_lock_inode(inode);
-
- /* NB we intentionally avoids calling
- * our own __stp_relay_switch_subbuf()
- * since here we can sleep. */
- relay_switch_subbuf(buf, 0);
-
- _stp_unlock_inode(inode);
- }
- }
-#else /* !STP_BULKMODE */
- buf = _stp_get_rchan_subbuf(_stp_relay_data.rchan->buf, 0);
-
- if (buf != NULL) {
- struct inode *inode = buf->dentry->d_inode;
-
- /* NB we are in the syscall context which allows
- * sleeping. The following inode locking might
- * sleep. See PR26131. */
- _stp_lock_inode(inode);
-
- /* NB we intentionally avoids calling
- * our own __stp_relay_switch_subbuf()
- * since here we can sleep. */
- relay_switch_subbuf(buf, 0);
-
- _stp_unlock_inode(inode);
- }
-#endif
- }
+ if (_stp_relay_data.rchan)
+ relay_flush(_stp_relay_data.rchan);
}
}
@@ -308,9 +254,7 @@ static int _stp_transport_data_fs_init(void)
/* Create "trace" file. */
npages = _stp_subbuf_size * _stp_nsubbufs;
-#ifdef STP_BULKMODE
npages *= num_online_cpus();
-#endif
npages >>= PAGE_SHIFT;
si_meminfo(&si);
#define MB(i) (unsigned long)((i) >> (20 - PAGE_SHIFT))
@@ -347,9 +291,7 @@ static int _stp_transport_data_fs_init(void)
{
u64 relay_mem;
relay_mem = _stp_subbuf_size * _stp_nsubbufs;
-#ifdef STP_BULKMODE
relay_mem *= num_online_cpus();
-#endif
_stp_allocated_net_memory += relay_mem;
_stp_allocated_memory += relay_mem;
}
@@ -386,12 +328,7 @@ _stp_data_write_reserve(size_t size_request, void **entry)
return -EINVAL;
buf = _stp_get_rchan_subbuf(_stp_relay_data.rchan->buf,
-#ifdef STP_BULKMODE
- smp_processor_id()
-#else
- 0
-#endif
- );
+ smp_processor_id());
if (unlikely(buf->offset + size_request > buf->chan->subbuf_size)) {
size_request = __stp_relay_switch_subbuf(buf, size_request);
if (!size_request)
@@ -414,62 +351,3 @@ static int _stp_data_write_commit(void *entry)
/* Nothing to do here. */
return 0;
}
-
-static noinline int _stp_transport_trylock_relay_inode(void)
-{
- unsigned i;
- struct rchan_buf *buf;
- struct inode *inode;
-#ifdef DEBUG_TRANS
- cycles_t begin;
-#endif
-
- buf = _stp_get_rchan_subbuf(_stp_relay_data.rchan->buf,
-#ifdef STP_BULKMODE
- smp_processor_id()
-#else
- 0
-#endif
- );
- if (buf == NULL)
- return 0;
-
- inode = buf->dentry->d_inode;
-
-#ifdef DEBUG_TRANS
- begin = get_cycles();
-#endif
-
- /* NB this bounded spinlock is needed for stream mode. it is observed
- * that almost all of the iterations needed are less than 50K iterations
- * or about 300K cycles.
- */
- for (i = 0; i < 50 * 1000; i++) {
- if (_stp_trylock_inode(inode)) {
- dbug_trans(3, "got inode lock: i=%u: cycles: %llu", i,
- get_cycles() - begin);
- return 1;
- }
- }
-
- dbug_trans(0, "failed to get inode lock: i=%u: cycles: %llu", i,
- get_cycles() - begin);
- return 0;
-}
-
-static void _stp_transport_unlock_relay_inode(void)
-{
- struct rchan_buf *buf;
-
- buf = _stp_get_rchan_subbuf(_stp_relay_data.rchan->buf,
-#ifdef STP_BULKMODE
- smp_processor_id()
-#else
- 0
-#endif
- );
- if (buf == NULL)
- return;
-
- _stp_unlock_inode(buf->dentry->d_inode);
-}
diff --git a/runtime/transport/transport.c b/runtime/transport/transport.c
index 96426eb7b..207fc40dc 100644
--- a/runtime/transport/transport.c
+++ b/runtime/transport/transport.c
@@ -49,7 +49,6 @@ static int _stp_probes_started = 0;
* transport state flag is atomic. */
static atomic_t _stp_transport_state = ATOMIC_INIT(_STP_TS_UNINITIALIZED);
-static inline int _stp_trylock_inode(struct inode *inode);
static inline void _stp_lock_inode(struct inode *inode);
static inline void _stp_unlock_inode(struct inode *inode);
@@ -643,23 +642,6 @@ err0:
return -1;
}
-/* returns 1 when the lock is successfully acquired, 0 otherwise. */
-static inline int _stp_trylock_inode(struct inode *inode)
-{
-#ifdef STAPCONF_INODE_RWSEM
- return inode_trylock(inode);
-#else
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,16)
- return mutex_trylock(&inode->i_mutex);
-#else
- /* NB down_trylock() uses a different convention where 0 means
- * the lock is successfully acquired.
- */
- return !down_trylock(&inode->i_sem);
-#endif
-#endif
-}
-
static inline void _stp_lock_inode(struct inode *inode)
{
#ifdef STAPCONF_INODE_RWSEM
diff --git a/runtime/transport/transport.h b/runtime/transport/transport.h
index 51723b7f5..cc09fc0ae 100644
--- a/runtime/transport/transport.h
+++ b/runtime/transport/transport.h
@@ -98,24 +98,6 @@ enum _stp_transport_state {
*/
static enum _stp_transport_state _stp_transport_get_state(void);
-/*
- * _stp_transport_trylock_relay_inode
- *
- * This function locks the relay file inode to protect against relay readers
- * (i.e., staprun/stapio).
- * Returns whether the lock is successfully obtained.
- */
-static noinline int _stp_transport_trylock_relay_inode(void);
-
-/*
- * _stp_transport_unlock_relay_inode
- *
- * This function releases the lock obtained by
- * _stp_transport_trylock_relay_inode.
- * should only call this when the lock is indeed obtained.
- */
-static void _stp_transport_unlock_relay_inode(void);
-
/*
* _stp_transport_data_fs_init
*
diff --git a/staprun/relay.c b/staprun/relay.c
index 2f5f2e06a..c76e76719 100644
--- a/staprun/relay.c
+++ b/staprun/relay.c
@@ -131,6 +131,7 @@ static void *reader_thread(void *data)
sigset_t sigs;
off_t wsize = 0;
int fnum = 0;
+ cpu_set_t cpu_mask;
sigemptyset(&sigs);
sigaddset(&sigs,SIGUSR2);
@@ -139,21 +140,18 @@ static void *reader_thread(void *data)
sigfillset(&sigs);
sigdelset(&sigs,SIGUSR2);
- if (bulkmode) {
- cpu_set_t cpu_mask;
- CPU_ZERO(&cpu_mask);
- CPU_SET(cpu, &cpu_mask);
- if( sched_setaffinity( 0, sizeof(cpu_mask), &cpu_mask ) < 0 )
- _perr("sched_setaffinity");
+ CPU_ZERO(&cpu_mask);
+ CPU_SET(cpu, &cpu_mask);
+ if( sched_setaffinity( 0, sizeof(cpu_mask), &cpu_mask ) < 0 )
+ _perr("sched_setaffinity");
#ifdef NEED_PPOLL
- /* Without a real ppoll, there is a small race condition that could */
- /* block ppoll(). So use a timeout to prevent that. */
- timeout->tv_sec = 10;
- timeout->tv_nsec = 0;
+ /* Without a real ppoll, there is a small race condition that could */
+ /* block ppoll(). So use a timeout to prevent that. */
+ timeout->tv_sec = 10;
+ timeout->tv_nsec = 0;
#else
- timeout = NULL;
+ timeout = NULL;
#endif
- }
if (reader_timeout_ms && timeout) {
timeout->tv_sec = reader_timeout_ms / 1000;
@@ -358,11 +356,6 @@ int init_relayfs(void)
_err("couldn't open %s.\n", buf);
return -1;
}
- if (ncpus > 1 && bulkmode == 0) {
- _err("ncpus=%d, bulkmode = %d\n", ncpus, bulkmode);
- _err("This is inconsistent! Please file a bug report. Exiting now.\n");
- return -1;
- }
/* PR7097 */
if (load_only)
--
2.29.2