commit 27a33d91568fecaa5d9c76ac1996ce32a64e25ba
Author: Yichun Zhang (agentzh) <yichun@openresty.com>
Date: Sat Jun 20 00:37:49 2020 -0700
bugfix: relay_v2: there were no writer <=> reader locking protections, which might lead to corrupted data (like "\0") and garbage data in the staprun output.
See also PR26131.
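The commit applies one writer-side pattern to each stp_print_flush() variant below. A minimal sketch of that pattern, assembled from the hunks in this commit (not verbatim):

    struct context* __restrict__ c = _stp_runtime_entryfn_get_context();
    int inode_locked = _stp_transport_trylock_relay_inode();

    if (!inode_locked) {
        atomic_inc(&_stp_transport_failures);
    #ifndef STP_TRANSPORT_RISKY
        _stp_runtime_entryfn_put_context(c);
        return;  /* drop this flush rather than race with the reader */
    #endif
    }

    /* ... reserve, copy, and commit the buffered data ... */

    if (inode_locked)
        _stp_transport_unlock_relay_inode();
    _stp_runtime_entryfn_put_context(c);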
diff --git a/runtime/print_flush.c b/runtime/print_flush.c
index 18b3e8f62..17d4c0a04 100644
--- a/runtime/print_flush.c
+++ b/runtime/print_flush.c
@@ -37,7 +37,19 @@ void stp_print_flush(_stp_pbuf *pb)
#ifdef STP_BULKMODE
#ifdef NO_PERCPU_HEADERS
{
+ struct context* __restrict__ c = NULL;
char *bufp = pb->buf;
+ int inode_locked;
+
+ c = _stp_runtime_entryfn_get_context();
+
+ if (!(inode_locked = _stp_transport_trylock_relay_inode())) {
+ atomic_inc (&_stp_transport_failures);
+#ifndef STP_TRANSPORT_RISKY
+ _stp_runtime_entryfn_put_context(c);
+ return;
+#endif
+ }
while (len > 0) {
size_t bytes_reserved;
@@ -55,15 +67,32 @@ void stp_print_flush(_stp_pbuf *pb)
break;
}
}
+
+ if (inode_locked)
+ _stp_transport_unlock_relay_inode();
+
+ _stp_runtime_entryfn_put_context(c);
}
#else /* !NO_PERCPU_HEADERS */
{
+ struct context* __restrict__ c = NULL;
char *bufp = pb->buf;
struct _stp_trace t = { .sequence = _stp_seq_inc(),
.pdu_len = len};
size_t bytes_reserved;
+ int inode_locked;
+
+ c = _stp_runtime_entryfn_get_context();
+
+ if (!(inode_locked = _stp_transport_trylock_relay_inode())) {
+ atomic_inc (&_stp_transport_failures);
+#ifndef STP_TRANSPORT_RISKY
+ _stp_runtime_entryfn_put_context(c);
+ return;
+#endif
+ }
bytes_reserved = _stp_data_write_reserve(sizeof(struct _stp_trace), &entry);
if (likely(entry && bytes_reserved > 0)) {
@@ -73,7 +102,7 @@ void stp_print_flush(_stp_pbuf *pb)
}
else {
atomic_inc(&_stp_transport_failures);
- return;
+ goto done;
}
while (len > 0) {
@@ -90,6 +119,13 @@ void stp_print_flush(_stp_pbuf *pb)
break;
}
}
+
+ done:
+
+ if (inode_locked)
+ _stp_transport_unlock_relay_inode();
+
+ _stp_runtime_entryfn_put_context(c);
}
#endif /* !NO_PERCPU_HEADERS */
@@ -110,6 +146,7 @@ void stp_print_flush(_stp_pbuf *pb)
unsigned long flags;
struct context* __restrict__ c = NULL;
char *bufp = pb->buf;
+ int inode_locked;
/* Prevent probe reentrancy on _stp_print_lock.
*
@@ -129,6 +166,15 @@ void stp_print_flush(_stp_pbuf *pb)
*/
c = _stp_runtime_entryfn_get_context();
+ if (!(inode_locked = _stp_transport_trylock_relay_inode())) {
+ atomic_inc (&_stp_transport_failures);
+#ifndef STP_TRANSPORT_RISKY
+ dbug_trans(0, "discarding %zu bytes of data\n", len);
+ _stp_runtime_entryfn_put_context(c);
+ return;
+#endif
+ }
+
dbug_trans(1, "calling _stp_data_write...\n");
stp_spin_lock_irqsave(&_stp_print_lock, flags);
while (len > 0) {
@@ -148,6 +194,10 @@ void stp_print_flush(_stp_pbuf *pb)
}
}
stp_spin_unlock_irqrestore(&_stp_print_lock, flags);
+
+ if (inode_locked)
+ _stp_transport_unlock_relay_inode();
+
_stp_runtime_entryfn_put_context(c);
}
#endif /* STP_TRANSPORT_VERSION != 1 */
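Note the switch from an early "return" to "goto done" in the error path above: once the context and (possibly) the inode lock are held, every exit must pass through a single cleanup block. A compressed sketch of the idiom:

    else {
        atomic_inc(&_stp_transport_failures);
        goto done;  /* was "return", which would leak the lock */
    }
    /* ... */
    done:
    if (inode_locked)
        _stp_transport_unlock_relay_inode();
    _stp_runtime_entryfn_put_context(c);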
diff --git a/runtime/transport/relay_v2.c b/runtime/transport/relay_v2.c
index 135951a8e..9903b72c1 100644
--- a/runtime/transport/relay_v2.c
+++ b/runtime/transport/relay_v2.c
@@ -269,12 +269,60 @@ static void _stp_transport_data_fs_start(void)
static void _stp_transport_data_fs_stop(void)
{
+
if (atomic_read (&_stp_relay_data.transport_state) == STP_TRANSPORT_RUNNING) {
atomic_set (&_stp_relay_data.transport_state, STP_TRANSPORT_STOPPED);
del_timer_sync(&_stp_relay_data.timer);
dbug_trans(0, "flushing...\n");
- if (_stp_relay_data.rchan)
- relay_flush(_stp_relay_data.rchan);
+ if (_stp_relay_data.rchan) {
+ struct rchan_buf *buf;
+
+ /* NB we cannot call relay_flush() directly here since
+ * we need to do inode locking ourselves.
+ */
+
+#ifdef STP_BULKMODE
+ unsigned int i;
+ struct rchan *rchan = _stp_relay_data.rchan;
+
+ for_each_possible_cpu(i) {
+ buf = _stp_get_rchan_subbuf(rchan->buf, i);
+ if (buf) {
+ struct inode *inode = buf->dentry->d_inode;
+
+ /* NB we are in the syscall context which
+ * allows sleeping. The following inode
+ * locking might sleep. See PR26131. */
+ _stp_lock_inode(inode);
+
+ /* NB we intentionally avoid calling
+ * our own __stp_relay_switch_subbuf()
+ * since we are allowed to sleep here */
+ relay_switch_subbuf(buf, 0);
+
+ _stp_unlock_inode(inode);
+ }
+ }
+#else /* !STP_BULKMODE */
+ buf = _stp_get_rchan_subbuf(_stp_relay_data.rchan->buf, 0);
+
+ if (buf != NULL) {
+ struct inode *inode = buf->dentry->d_inode;
+
+ /* NB we are in the syscall context which allows
+ * sleeping. The following inode locking might
+ * sleep. See PR26131. */
+ _stp_lock_inode(inode);
+
+ /* NB we intentionally avoid calling
+ * our own __stp_relay_switch_subbuf()
+ * since we are allowed to sleep here */
+ relay_switch_subbuf(buf, 0);
+
+ _stp_unlock_inode(inode);
+ }
+#endif
+ }
}
}
@@ -403,7 +451,12 @@ _stp_data_write_reserve(size_t size_request, void **entry)
return -EINVAL;
buf = _stp_get_rchan_subbuf(_stp_relay_data.rchan->buf,
- smp_processor_id());
+#ifdef STP_BULKMODE
+ smp_processor_id()
+#else
+ 0
+#endif
+ );
if (unlikely(buf->offset + size_request > buf->chan->subbuf_size)) {
size_request = __stp_relay_switch_subbuf(buf, size_request);
if (!size_request)
@@ -426,3 +479,62 @@ static int _stp_data_write_commit(void *entry)
/* Nothing to do here. */
return 0;
}
+
+static noinline int _stp_transport_trylock_relay_inode(void)
+{
+ unsigned i;
+ struct rchan_buf *buf;
+ struct inode *inode;
+#ifdef DEBUG_TRANS
+ cycles_t begin;
+#endif
+
+ buf = _stp_get_rchan_subbuf(_stp_relay_data.rchan->buf,
+#ifdef STP_BULKMODE
+ smp_processor_id()
+#else
+ 0
+#endif
+ );
+ if (buf == NULL)
+ return 0;
+
+ inode = buf->dentry->d_inode;
+
+#ifdef DEBUG_TRANS
+ begin = get_cycles();
+#endif
+
+ /* NB this bounded try-lock loop is needed for stream mode. It is
+ * observed that almost all attempts succeed within 50K iterations,
+ * or about 300K cycles.
+ */
+ for (i = 0; i < 50 * 1000; i++) {
+ if (_stp_trylock_inode(inode)) {
+ dbug_trans(3, "got inode lock: i=%u: cycles: %llu", i,
+ get_cycles() - begin);
+ return 1;
+ }
+ }
+
+ dbug_trans(0, "failed to get inode lock: i=%u: cycles: %llu", i,
+ get_cycles() - begin);
+ return 0;
+}
+
+static void _stp_transport_unlock_relay_inode(void)
+{
+ struct rchan_buf *buf;
+
+ buf = _stp_get_rchan_subbuf(_stp_relay_data.rchan->buf,
+#ifdef STP_BULKMODE
+ smp_processor_id()
+#else
+ 0
+#endif
+ );
+ if (buf == NULL)
+ return;
+
+ _stp_unlock_inode(buf->dentry->d_inode);
+}
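The same #ifdef STP_BULKMODE cpu-index selection now appears in three places in this file. A hypothetical helper (not part of the patch) could consolidate it:

    /* Hypothetical consolidation sketch: pick the per-cpu sub-buffer in
     * bulk mode, the shared sub-buffer 0 otherwise. */
    static struct rchan_buf *_stp_current_rchan_subbuf(void)
    {
    #ifdef STP_BULKMODE
        return _stp_get_rchan_subbuf(_stp_relay_data.rchan->buf,
                                     smp_processor_id());
    #else
        return _stp_get_rchan_subbuf(_stp_relay_data.rchan->buf, 0);
    #endif
    }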
diff --git a/runtime/transport/relayfs.c b/runtime/transport/relayfs.c
index d8fa4e049..97d2dff8b 100644
--- a/runtime/transport/relayfs.c
+++ b/runtime/transport/relayfs.c
@@ -186,3 +186,14 @@ static int _stp_data_write_commit(void *entry)
/* Nothing to do here. */
return 0;
}
+
+static int _stp_transport_trylock_relay_inode(void)
+{
+ /* Nothing to do here. */
+ return 1;
+}
+
+static void _stp_transport_unlock_relay_inode(void)
+{
+ /* Nothing to do here. */
+}
diff --git a/runtime/transport/ring_buffer.c b/runtime/transport/ring_buffer.c
index 43e014bae..21e356973 100644
--- a/runtime/transport/ring_buffer.c
+++ b/runtime/transport/ring_buffer.c
@@ -827,3 +827,14 @@ static void _stp_transport_data_fs_overwrite(int overwrite)
dbug_trans(0, "setting ovewrite to %d\n", overwrite);
_stp_relay_data.overwrite_flag = overwrite;
}
+
+static int _stp_transport_trylock_relay_inode(void)
+{
+ /* Nothing to do here. */
+ return 1;
+}
+
+static void _stp_transport_unlock_relay_inode(void)
+{
+ /* Nothing to do here. */
+}
diff --git a/runtime/transport/transport.c b/runtime/transport/transport.c
index 863a0c427..1420614eb 100644
--- a/runtime/transport/transport.c
+++ b/runtime/transport/transport.c
@@ -45,6 +45,10 @@ static int _stp_probes_started = 0;
* transport state flag is atomic. */
static atomic_t _stp_transport_state = ATOMIC_INIT(_STP_TS_UNINITIALIZED);
+static inline int _stp_trylock_inode(struct inode *inode);
+static inline void _stp_lock_inode(struct inode *inode);
+static inline void _stp_unlock_inode(struct inode *inode);
+
#ifndef STP_CTL_TIMER_INTERVAL
/* ctl timer interval in jiffies (default 20 ms) */
#define STP_CTL_TIMER_INTERVAL ((HZ+49)/50)
@@ -497,6 +501,23 @@ err0:
return -1;
}
+/* returns 1 when the lock is successfully acquired, 0 otherwise. */
+static inline int _stp_trylock_inode(struct inode *inode)
+{
+#ifdef STAPCONF_INODE_RWSEM
+ return inode_trylock(inode);
+#else
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,16)
+ return mutex_trylock(&inode->i_mutex);
+#else
+ /* NB down_trylock() uses a different convention where 0 means
+ * the lock is successfully acquired.
+ */
+ return !down_trylock(&inode->i_sem);
+#endif
+#endif
+}
+
static inline void _stp_lock_inode(struct inode *inode)
{
#ifdef STAPCONF_INODE_RWSEM
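For context, the pre-existing blocking counterpart (whose body is truncated in this hunk) presumably follows the same version ladder as the new _stp_trylock_inode(). A minimal sketch, assuming the standard kernel locking APIs:

    static inline void _stp_lock_inode(struct inode *inode)
    {
    #ifdef STAPCONF_INODE_RWSEM
        inode_lock(inode);
    #else
    #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,16)
        mutex_lock(&inode->i_mutex);
    #else
        down(&inode->i_sem);
    #endif
    #endif
    }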
diff --git a/runtime/transport/transport.h b/runtime/transport/transport.h
index 0cf2a7ca3..9f0f9ddf8 100644
--- a/runtime/transport/transport.h
+++ b/runtime/transport/transport.h
@@ -67,6 +67,24 @@ enum _stp_transport_state {
*/
static enum _stp_transport_state _stp_transport_get_state(void);
+/*
+ * _stp_transport_trylock_relay_inode
+ *
+ * This function locks the relay file inode to protect against relay readers
+ * (i.e., staprun/stapio).
+ * Returns 1 when the lock is successfully obtained, 0 otherwise.
+ */
+static noinline int _stp_transport_trylock_relay_inode(void);
+
+/*
+ * _stp_transport_unlock_relay_inode
+ *
+ * This function releases the lock obtained by
+ * _stp_transport_trylock_relay_inode.
+ * Callers should only invoke this when the lock was indeed obtained.
+ */
+static void _stp_transport_unlock_relay_inode(void);
+
/*
* _stp_transport_data_fs_init
*
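A hedged usage sketch of the contract documented above (unlock only after a successful trylock):

    if (_stp_transport_trylock_relay_inode()) {
        /* ... write to the relay channel ... */
        _stp_transport_unlock_relay_inode();
    } else {
        atomic_inc(&_stp_transport_failures);  /* the data is dropped */
    }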