From 4f021f03094a79b28d96c71b8ddea8314c253cab Mon Sep 17 00:00:00 2001
From: Sultan Alsawaf <sultan@openresty.com>
Date: Fri, 4 Dec 2020 15:11:19 -0800
Subject: [PATCH] linux: use relayfs properly to eliminate mutex_trylock from
print flush
Using a mutex_trylock() in __stp_print_flush() leads to a lot of havoc,
for numerous reasons. Firstly, since __stp_print_flush() can be called
from IRQ context, holding the inode mutex from here would make the
recorded mutex owner nonsensical, since mutex locks can only be held in
contexts backed by the scheduler. Secondly, the mutex_trylock
implementation has a spin_lock() inside of it that leads to two issues:
IRQs aren't disabled when acquiring this spin_lock(), so using it from
IRQ context can lead to a deadlock; and since spin locks can have
tracepoints attached via lock_acquire(), the spin_lock() can recurse on
itself inside a stap probe and deadlock, like so:
#0 [ffff88017f6d7a08] kvm_wait at ffffffff81079f5a
#1 [ffff88017f6d7a30] __pv_queued_spin_lock_slowpath at ffffffff8114f51e
#2 [ffff88017f6d7a70] queued_spin_lock_slowpath at ffffffff810e842b
#3 [ffff88017f6d7a80] mutex_trylock at ffffffff81882b1b
#4 [ffff88017f6d7ab8] _stp_transport_trylock_relay_inode at ffffffffc0c599df [stap_47650d3377d05db0ab7cbbaa25765809__11657]
#5 [ffff88017f6d7ad8] __stp_print_flush at ffffffffc09b6483 [stap_47650d3377d05db0ab7cbbaa25765809__11657]
#6 [ffff88017f6d7b10] probe_7879 at ffffffffc0a98c85 [stap_47650d3377d05db0ab7cbbaa25765809__11657]
#7 [ffff88017f6d7b38] enter_real_tracepoint_probe_1543 at ffffffffc0c3b757 [stap_47650d3377d05db0ab7cbbaa25765809__11657]
#8 [ffff88017f6d7b70] enter_tracepoint_probe_1543 at ffffffffc09b117e [stap_47650d3377d05db0ab7cbbaa25765809__11657]
#9 [ffff88017f6d7b80] lock_acquire at ffffffff811460ba
#10 [ffff88017f6d7be8] mutex_trylock at ffffffff81882a27
#11 [ffff88017f6d7c20] _stp_transport_trylock_relay_inode at ffffffffc0c599df [stap_47650d3377d05db0ab7cbbaa25765809__11657]
#12 [ffff88017f6d7c40] __stp_print_flush at ffffffffc09b6483 [stap_47650d3377d05db0ab7cbbaa25765809__11657]
#13 [ffff88017f6d7c78] _stp_vlog at ffffffffc09b8d32 [stap_47650d3377d05db0ab7cbbaa25765809__11657]
#14 [ffff88017f6d7cd8] _stp_dbug at ffffffffc09ba43b [stap_47650d3377d05db0ab7cbbaa25765809__11657]
#15 [ffff88017f6d7d38] systemtap_module_refresh at ffffffffc09ba51d [stap_47650d3377d05db0ab7cbbaa25765809__11657]
#16 [ffff88017f6d7d50] module_refresher at ffffffffc09ba53e [stap_47650d3377d05db0ab7cbbaa25765809__11657]
#17 [ffff88017f6d7d60] process_one_work at ffffffff810da9cc
#18 [ffff88017f6d7de8] worker_thread at ffffffff810dafe6
#19 [ffff88017f6d7e48] kthread at ffffffff810e44cf
#20 [ffff88017f6d7f50] ret_from_fork_nospec_begin at ffffffff818958dd
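The trace above shows the tracepoint-recursion case. The plain IRQ
deadlock is the classic lock-shared-with-IRQ-context pattern, sketched
below (a minimal illustration with made-up names, not code from this
patch):

    #include <linux/spinlock.h>

    static DEFINE_SPINLOCK(demo_lock); /* stand-in for the lock inside
                                          mutex_trylock() */

    static void task_context_path(void)
    {
            spin_lock(&demo_lock);   /* IRQs are left enabled here */
            /* ... an IRQ arrives on this CPU before we unlock ... */
            spin_unlock(&demo_lock);
    }

    static void irq_context_path(void) /* runs from the IRQ handler */
    {
            spin_lock(&demo_lock);   /* spins forever: the interrupted
                                        task already holds demo_lock and
                                        cannot run to release it */
            spin_unlock(&demo_lock);
    }

Code that legitimately shares a spinlock with IRQ context uses
spin_lock_irqsave() to close this window; the mutex implementation has
no reason to, since mutexes were never meant to be taken from IRQ
context in the first place.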
The reason the mutex_trylock() was needed in the first place is that
staprun doesn't use the relayfs API properly when reading buffers in
non-bulk mode. It tries to read all CPUs' buffers from a single thread,
when it should be reading each CPU's buffer from a thread running on
that CPU in order to utilize relayfs' synchronization guarantees, which
are provided by disabling IRQs on the local CPU when a buffer is
modified. This change makes staprun always use per-CPU threads to read
print buffers so that we don't need the mutex_trylock() in the print
flush routine, which resolves a wide variety of serious bugs.
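For reference, the intended relayfs reading pattern looks roughly like
the following userspace sketch (hypothetical debugfs path and function
names, not the actual staprun sources):

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <sched.h>
    #include <stdio.h>
    #include <unistd.h>

    static void *percpu_reader(void *arg)
    {
            long cpu = (long)arg;
            char path[64], buf[8192];
            cpu_set_t mask;
            ssize_t n;
            int fd;

            /* Pin this thread to its CPU so reads line up with the
             * kernel's IRQs-off writes to the same per-CPU buffer. */
            CPU_ZERO(&mask);
            CPU_SET(cpu, &mask);
            if (sched_setaffinity(0, sizeof(mask), &mask) < 0)
                    perror("sched_setaffinity");

            snprintf(path, sizeof(path),
                     "/sys/kernel/debug/demo/trace%ld", cpu);
            fd = open(path, O_RDONLY);
            if (fd < 0)
                    return NULL;
            while ((n = read(fd, buf, sizeof(buf))) > 0)
                    write(STDOUT_FILENO, buf, n); /* interleaved output
                                                     must be serialized;
                                                     see stream_mutex in
                                                     the relay.c hunk
                                                     below */
            close(fd);
            return NULL;
    }

One such thread would be spawned per online CPU, e.g. with
pthread_create(&tid, NULL, percpu_reader, (void *)cpu), which is the
structure the staprun/relay.c changes below adopt.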
---
runtime/print_flush.c | 127 +++------------------------------
runtime/transport/relay_v2.c | 128 +---------------------------------
runtime/transport/transport.c | 18 -----
runtime/transport/transport.h | 18 -----
staprun/relay.c | 40 ++++++-----
5 files changed, 36 insertions(+), 295 deletions(-)
diff --git a/runtime/print_flush.c b/runtime/print_flush.c
index acd6a32d9..f4d72d30f 100644
--- a/runtime/print_flush.c
+++ b/runtime/print_flush.c
@@ -18,6 +18,7 @@
static void __stp_print_flush(struct _stp_log *log)
{
+ char *bufp = log->buf;
size_t len = log->len;
void *entry = NULL;
@@ -26,126 +27,20 @@ static void __stp_print_flush(struct _stp_log *log)
return;
log->len = 0;
-
dbug_trans(1, "len = %zu\n", len);
-
-#ifdef STP_BULKMODE
-#ifdef NO_PERCPU_HEADERS
- {
- char *bufp = log->buf;
- int inode_locked;
-
- if (!(inode_locked = _stp_transport_trylock_relay_inode())) {
- atomic_inc (&_stp_transport_failures);
-#ifndef STP_TRANSPORT_RISKY
- return;
-#endif
- }
-
- while (len > 0) {
- size_t bytes_reserved;
-
- bytes_reserved = _stp_data_write_reserve(len, &entry);
- if (likely(entry && bytes_reserved > 0)) {
- memcpy(_stp_data_entry_data(entry), bufp,
- bytes_reserved);
- _stp_data_write_commit(entry);
- bufp += bytes_reserved;
- len -= bytes_reserved;
- }
- else {
- atomic_inc(&_stp_transport_failures);
- break;
- }
- }
-
- if (inode_locked)
- _stp_transport_unlock_relay_inode();
- }
-
-#else /* !NO_PERCPU_HEADERS */
-
- {
- char *bufp = log->buf;
- struct _stp_trace t = { .sequence = _stp_seq_inc(),
- .pdu_len = len};
+ do {
size_t bytes_reserved;
- int inode_locked;
- if (!(inode_locked = _stp_transport_trylock_relay_inode())) {
- atomic_inc (&_stp_transport_failures);
-#ifndef STP_TRANSPORT_RISKY
- return;
-#endif
- }
-
- bytes_reserved = _stp_data_write_reserve(sizeof(struct _stp_trace), &entry);
- if (likely(entry && bytes_reserved > 0)) {
- /* prevent unaligned access by using memcpy() */
- memcpy(_stp_data_entry_data(entry), &t, sizeof(t));
+ bytes_reserved = _stp_data_write_reserve(len, &entry);
+ if (likely(entry && bytes_reserved)) {
+ memcpy(_stp_data_entry_data(entry), bufp,
+ bytes_reserved);
_stp_data_write_commit(entry);
- }
- else {
+ bufp += bytes_reserved;
+ len -= bytes_reserved;
+ } else {
atomic_inc(&_stp_transport_failures);
- goto done;
+ break;
}
-
- while (len > 0) {
- bytes_reserved = _stp_data_write_reserve(len, &entry);
- if (likely(entry && bytes_reserved > 0)) {
- memcpy(_stp_data_entry_data(entry), bufp,
- bytes_reserved);
- _stp_data_write_commit(entry);
- bufp += bytes_reserved;
- len -= bytes_reserved;
- }
- else {
- atomic_inc(&_stp_transport_failures);
- break;
- }
- }
-
- done:
-
- if (inode_locked)
- _stp_transport_unlock_relay_inode();
- }
-#endif /* !NO_PERCPU_HEADERS */
-
-#else /* !STP_BULKMODE */
-
- {
- char *bufp = log->buf;
- int inode_locked;
-
- if (!(inode_locked = _stp_transport_trylock_relay_inode())) {
- atomic_inc (&_stp_transport_failures);
-#ifndef STP_TRANSPORT_RISKY
- dbug_trans(0, "discarding %zu bytes of data\n", len);
- return;
-#endif
- }
-
- dbug_trans(1, "calling _stp_data_write...\n");
- while (len > 0) {
- size_t bytes_reserved;
-
- bytes_reserved = _stp_data_write_reserve(len, &entry);
- if (likely(entry && bytes_reserved > 0)) {
- memcpy(_stp_data_entry_data(entry), bufp,
- bytes_reserved);
- _stp_data_write_commit(entry);
- bufp += bytes_reserved;
- len -= bytes_reserved;
- }
- else {
- atomic_inc(&_stp_transport_failures);
- break;
- }
- }
-
- if (inode_locked)
- _stp_transport_unlock_relay_inode();
- }
-#endif /* !STP_BULKMODE */
+ } while (len > 0);
}
diff --git a/runtime/transport/relay_v2.c b/runtime/transport/relay_v2.c
index ff621f71d..8fecf1c7f 100644
--- a/runtime/transport/relay_v2.c
+++ b/runtime/transport/relay_v2.c
@@ -117,24 +117,17 @@ static void __stp_relay_wakeup_readers(struct rchan_buf *buf)
static void __stp_relay_wakeup_timer(stp_timer_callback_parameter_t unused)
{
-#ifdef STP_BULKMODE
int i;
-#endif
if (atomic_read(&_stp_relay_data.wakeup)) {
struct rchan_buf *buf;
atomic_set(&_stp_relay_data.wakeup, 0);
-#ifdef STP_BULKMODE
for_each_possible_cpu(i) {
buf = _stp_get_rchan_subbuf(_stp_relay_data.rchan->buf,
i);
__stp_relay_wakeup_readers(buf);
}
-#else
- buf = _stp_get_rchan_subbuf(_stp_relay_data.rchan->buf, 0);
- __stp_relay_wakeup_readers(buf);
-#endif
}
if (atomic_read(&_stp_relay_data.transport_state) == STP_TRANSPORT_RUNNING)
@@ -235,55 +228,8 @@ static void _stp_transport_data_fs_stop(void)
atomic_set (&_stp_relay_data.transport_state, STP_TRANSPORT_STOPPED);
del_timer_sync(&_stp_relay_data.timer);
dbug_trans(0, "flushing...\n");
- if (_stp_relay_data.rchan) {
- struct rchan_buf *buf;
-
- /* NB we cannot call relay_flush() directly here since
- * we need to do inode locking ourselves.
- */
-
-#ifdef STP_BULKMODE
- unsigned int i;
- struct rchan *rchan = _stp_relay_data.rchan;
-
- for_each_possible_cpu(i) {
- buf = _stp_get_rchan_subbuf(rchan->buf, i);
- if (buf) {
- struct inode *inode = buf->dentry->d_inode;
-
- /* NB we are in the syscall context which
- * allows sleeping. The following inode
- * locking might sleep. See PR26131. */
- _stp_lock_inode(inode);
-
- /* NB we intentionally avoids calling
- * our own __stp_relay_switch_subbuf()
- * since here we can sleep. */
- relay_switch_subbuf(buf, 0);
-
- _stp_unlock_inode(inode);
- }
- }
-#else /* !STP_BULKMODE */
- buf = _stp_get_rchan_subbuf(_stp_relay_data.rchan->buf, 0);
-
- if (buf != NULL) {
- struct inode *inode = buf->dentry->d_inode;
-
- /* NB we are in the syscall context which allows
- * sleeping. The following inode locking might
- * sleep. See PR26131. */
- _stp_lock_inode(inode);
-
- /* NB we intentionally avoids calling
- * our own __stp_relay_switch_subbuf()
- * since here we can sleep. */
- relay_switch_subbuf(buf, 0);
-
- _stp_unlock_inode(inode);
- }
-#endif
- }
+ if (_stp_relay_data.rchan)
+ relay_flush(_stp_relay_data.rchan);
}
}
@@ -308,9 +254,7 @@ static int _stp_transport_data_fs_init(void)
/* Create "trace" file. */
npages = _stp_subbuf_size * _stp_nsubbufs;
-#ifdef STP_BULKMODE
npages *= num_online_cpus();
-#endif
npages >>= PAGE_SHIFT;
si_meminfo(&si);
#define MB(i) (unsigned long)((i) >> (20 - PAGE_SHIFT))
@@ -347,9 +291,7 @@ static int _stp_transport_data_fs_init(void)
{
u64 relay_mem;
relay_mem = _stp_subbuf_size * _stp_nsubbufs;
-#ifdef STP_BULKMODE
relay_mem *= num_online_cpus();
-#endif
_stp_allocated_net_memory += relay_mem;
_stp_allocated_memory += relay_mem;
}
@@ -386,12 +328,7 @@ _stp_data_write_reserve(size_t size_request, void **entry)
return -EINVAL;
buf = _stp_get_rchan_subbuf(_stp_relay_data.rchan->buf,
-#ifdef STP_BULKMODE
- smp_processor_id()
-#else
- 0
-#endif
- );
+ smp_processor_id());
if (unlikely(buf->offset + size_request > buf->chan->subbuf_size)) {
size_request = __stp_relay_switch_subbuf(buf, size_request);
if (!size_request)
@@ -414,62 +351,3 @@ static int _stp_data_write_commit(void *entry)
/* Nothing to do here. */
return 0;
}
-
-static noinline int _stp_transport_trylock_relay_inode(void)
-{
- unsigned i;
- struct rchan_buf *buf;
- struct inode *inode;
-#ifdef DEBUG_TRANS
- cycles_t begin;
-#endif
-
- buf = _stp_get_rchan_subbuf(_stp_relay_data.rchan->buf,
-#ifdef STP_BULKMODE
- smp_processor_id()
-#else
- 0
-#endif
- );
- if (buf == NULL)
- return 0;
-
- inode = buf->dentry->d_inode;
-
-#ifdef DEBUG_TRANS
- begin = get_cycles();
-#endif
-
- /* NB this bounded spinlock is needed for stream mode. it is observed
- * that almost all of the iterations needed are less than 50K iterations
- * or about 300K cycles.
- */
- for (i = 0; i < 50 * 1000; i++) {
- if (_stp_trylock_inode(inode)) {
- dbug_trans(3, "got inode lock: i=%u: cycles: %llu", i,
- get_cycles() - begin);
- return 1;
- }
- }
-
- dbug_trans(0, "failed to get inode lock: i=%u: cycles: %llu", i,
- get_cycles() - begin);
- return 0;
-}
-
-static void _stp_transport_unlock_relay_inode(void)
-{
- struct rchan_buf *buf;
-
- buf = _stp_get_rchan_subbuf(_stp_relay_data.rchan->buf,
-#ifdef STP_BULKMODE
- smp_processor_id()
-#else
- 0
-#endif
- );
- if (buf == NULL)
- return;
-
- _stp_unlock_inode(buf->dentry->d_inode);
-}
diff --git a/runtime/transport/transport.c b/runtime/transport/transport.c
index 96426eb7b..207fc40dc 100644
--- a/runtime/transport/transport.c
+++ b/runtime/transport/transport.c
@@ -49,7 +49,6 @@ static int _stp_probes_started = 0;
* transport state flag is atomic. */
static atomic_t _stp_transport_state = ATOMIC_INIT(_STP_TS_UNINITIALIZED);
-static inline int _stp_trylock_inode(struct inode *inode);
static inline void _stp_lock_inode(struct inode *inode);
static inline void _stp_unlock_inode(struct inode *inode);
@@ -643,23 +642,6 @@ err0:
return -1;
}
-/* returns 1 when the lock is successfully acquired, 0 otherwise. */
-static inline int _stp_trylock_inode(struct inode *inode)
-{
-#ifdef STAPCONF_INODE_RWSEM
- return inode_trylock(inode);
-#else
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,16)
- return mutex_trylock(&inode->i_mutex);
-#else
- /* NB down_trylock() uses a different convention where 0 means
- * the lock is successfully acquired.
- */
- return !down_trylock(&inode->i_sem);
-#endif
-#endif
-}
-
static inline void _stp_lock_inode(struct inode *inode)
{
#ifdef STAPCONF_INODE_RWSEM
diff --git a/runtime/transport/transport.h b/runtime/transport/transport.h
index 51723b7f5..cc09fc0ae 100644
--- a/runtime/transport/transport.h
+++ b/runtime/transport/transport.h
@@ -98,24 +98,6 @@ enum _stp_transport_state {
*/
static enum _stp_transport_state _stp_transport_get_state(void);
-/*
- * _stp_transport_trylock_relay_inode
- *
- * This function locks the relay file inode to protect against relay readers
- * (i.e., staprun/stapio).
- * Returns whether the lock is successfully obtained.
- */
-static noinline int _stp_transport_trylock_relay_inode(void);
-
-/*
- * _stp_transport_unlock_relay_inode
- *
- * This function releases the lock obtained by
- * _stp_transport_trylock_relay_inode.
- * should only call this when the lock is indeed obtained.
- */
-static void _stp_transport_unlock_relay_inode(void);
-
/*
* _stp_transport_data_fs_init
*
diff --git a/staprun/relay.c b/staprun/relay.c
index 2f5f2e06a..61b29e435 100644
--- a/staprun/relay.c
+++ b/staprun/relay.c
@@ -19,6 +19,7 @@ static int relay_fd[NR_CPUS];
static int avail_cpus[NR_CPUS];
static int switch_file[NR_CPUS];
static pthread_mutex_t mutex[NR_CPUS];
+static pthread_mutex_t stream_mutex;
static int bulkmode = 0;
static volatile int stop_threads = 0;
static time_t *time_backlog[NR_CPUS];
@@ -131,6 +132,7 @@ static void *reader_thread(void *data)
sigset_t sigs;
off_t wsize = 0;
int fnum = 0;
+ cpu_set_t cpu_mask;
sigemptyset(&sigs);
sigaddset(&sigs,SIGUSR2);
@@ -139,21 +141,18 @@ static void *reader_thread(void *data)
sigfillset(&sigs);
sigdelset(&sigs,SIGUSR2);
- if (bulkmode) {
- cpu_set_t cpu_mask;
- CPU_ZERO(&cpu_mask);
- CPU_SET(cpu, &cpu_mask);
- if( sched_setaffinity( 0, sizeof(cpu_mask), &cpu_mask ) < 0 )
- _perr("sched_setaffinity");
+ CPU_ZERO(&cpu_mask);
+ CPU_SET(cpu, &cpu_mask);
+ if( sched_setaffinity( 0, sizeof(cpu_mask), &cpu_mask ) < 0 )
+ _perr("sched_setaffinity");
#ifdef NEED_PPOLL
- /* Without a real ppoll, there is a small race condition that could */
- /* block ppoll(). So use a timeout to prevent that. */
- timeout->tv_sec = 10;
- timeout->tv_nsec = 0;
+ /* Without a real ppoll, there is a small race condition that could */
+ /* block ppoll(). So use a timeout to prevent that. */
+ timeout->tv_sec = 10;
+ timeout->tv_nsec = 0;
#else
- timeout = NULL;
+ timeout = NULL;
#endif
- }
if (reader_timeout_ms && timeout) {
timeout->tv_sec = reader_timeout_ms / 1000;
@@ -234,7 +233,13 @@ static void *reader_thread(void *data)
wbuf += bytes;
wsize += bytes;
} else {
- rc = write(out_fd[cpu], wbuf, wbytes);
+ if (bulkmode) {
+ rc = write(out_fd[cpu], wbuf, wbytes);
+ } else {
+ pthread_mutex_lock(&stream_mutex);
+ rc = write(out_fd[cpu], wbuf, wbytes);
+ pthread_mutex_unlock(&stream_mutex);
+ }
if (rc <= 0) {
perr("Couldn't write to output %d for cpu %d, exiting.",
out_fd[cpu], cpu);
@@ -358,11 +363,6 @@ int init_relayfs(void)
_err("couldn't open %s.\n", buf);
return -1;
}
- if (ncpus > 1 && bulkmode == 0) {
- _err("ncpus=%d, bulkmode = %d\n", ncpus, bulkmode);
- _err("This is inconsistent! Please file a bug report. Exiting now.\n");
- return -1;
- }
/* PR7097 */
if (load_only)
@@ -432,6 +432,10 @@ int init_relayfs(void)
sigaction(SIGUSR2, &sa, NULL);
dbug(2, "starting threads\n");
+ if (!bulkmode && pthread_mutex_init(&stream_mutex, NULL) < 0) {
+ _perr("failed to create stream mutex");
+ return -1;
+ }
for (i = 0; i < ncpus; i++) {
if (pthread_mutex_init(&mutex[avail_cpus[i]], NULL) < 0) {
_perr("failed to create mutex");
--
2.29.2