-
-
Save agentzh/517eef2a4d54554b2712c18cb79d214d to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
commit daef24514707c9bdbc3c0bb7250c371a030e100a | |
Author: Yichun Zhang (agentzh) <yichun@openresty.com> | |
Date: Sat Jun 20 00:37:49 2020 -0700 | |
bugfix: relay_v2: there were no writer <=> reader locking protections which might lead to corrupted data (like "\0") and garbage data in the staprun output. | |
See also PR26131. | |
diff --git a/runtime/print_flush.c b/runtime/print_flush.c | |
index 18b3e8f62..5b28c4522 100644 | |
--- a/runtime/print_flush.c | |
+++ b/runtime/print_flush.c | |
@@ -37,7 +37,19 @@ void stp_print_flush(_stp_pbuf *pb) | |
#ifdef STP_BULKMODE | |
#ifdef NO_PERCPU_HEADERS | |
{ | |
+ struct context* __restrict__ c = NULL; | |
char *bufp = pb->buf; | |
+ int inode_locked; | |
+ | |
+ c = _stp_runtime_entryfn_get_context(); | |
+ | |
+ if (!(inode_locked = _stp_transport_trylock_relay_inode())) { | |
+ atomic_inc (&_stp_transport_failures); | |
+#ifndef STP_TRANSPORT_RISKY | |
+ _stp_runtime_entryfn_put_context(c); | |
+ return; | |
+#endif | |
+ } | |
while (len > 0) { | |
size_t bytes_reserved; | |
@@ -55,15 +67,32 @@ void stp_print_flush(_stp_pbuf *pb) | |
break; | |
} | |
} | |
+ | |
+ if (inode_locked) | |
+ _stp_transport_unlock_relay_inode(); | |
+ | |
+ _stp_runtime_entryfn_put_context(c); | |
} | |
#else /* !NO_PERCPU_HEADERS */ | |
{ | |
+ struct context* __restrict__ c = NULL; | |
char *bufp = pb->buf; | |
struct _stp_trace t = { .sequence = _stp_seq_inc(), | |
.pdu_len = len}; | |
size_t bytes_reserved; | |
+ int inode_locked; | |
+ | |
+ c = _stp_runtime_entryfn_get_context(); | |
+ | |
+ if (!(inode_locked = _stp_transport_trylock_relay_inode())) { | |
+ atomic_inc (&_stp_transport_failures); | |
+#ifndef STP_TRANSPORT_RISKY | |
+ _stp_runtime_entryfn_put_context(c); | |
+ return; | |
+#endif | |
+ } | |
bytes_reserved = _stp_data_write_reserve(sizeof(struct _stp_trace), &entry); | |
if (likely(entry && bytes_reserved > 0)) { | |
@@ -73,7 +102,7 @@ void stp_print_flush(_stp_pbuf *pb) | |
} | |
else { | |
atomic_inc(&_stp_transport_failures); | |
- return; | |
+ goto done; | |
} | |
while (len > 0) { | |
@@ -90,6 +119,13 @@ void stp_print_flush(_stp_pbuf *pb) | |
break; | |
} | |
} | |
+ | |
+ done: | |
+ | |
+ if (inode_locked) | |
+ _stp_transport_unlock_relay_inode(); | |
+ | |
+ _stp_runtime_entryfn_put_context(c); | |
} | |
#endif /* !NO_PERCPU_HEADERS */ | |
@@ -110,6 +146,7 @@ void stp_print_flush(_stp_pbuf *pb) | |
unsigned long flags; | |
struct context* __restrict__ c = NULL; | |
char *bufp = pb->buf; | |
+ int inode_locked; | |
/* Prevent probe reentrancy on _stp_print_lock. | |
* | |
@@ -129,6 +166,15 @@ void stp_print_flush(_stp_pbuf *pb) | |
*/ | |
c = _stp_runtime_entryfn_get_context(); | |
+ if (!(inode_locked = _stp_transport_trylock_relay_inode())) { | |
+ atomic_inc (&_stp_transport_failures); | |
+#ifndef STP_TRANSPORT_RISKY | |
+ //errk("discarding %zu bytes of data\n", len); | |
+ _stp_runtime_entryfn_put_context(c); | |
+ return; | |
+#endif | |
+ } | |
+ | |
dbug_trans(1, "calling _stp_data_write...\n"); | |
stp_spin_lock_irqsave(&_stp_print_lock, flags); | |
while (len > 0) { | |
@@ -148,6 +194,10 @@ void stp_print_flush(_stp_pbuf *pb) | |
} | |
} | |
stp_spin_unlock_irqrestore(&_stp_print_lock, flags); | |
+ | |
+ if (inode_locked) | |
+ _stp_transport_unlock_relay_inode(); | |
+ | |
_stp_runtime_entryfn_put_context(c); | |
} | |
#endif /* STP_TRANSPORT_VERSION != 1 */ | |
diff --git a/runtime/transport/relay_v2.c b/runtime/transport/relay_v2.c | |
index 135951a8e..b631b1b79 100644 | |
--- a/runtime/transport/relay_v2.c | |
+++ b/runtime/transport/relay_v2.c | |
@@ -269,12 +269,59 @@ static void _stp_transport_data_fs_start(void) | |
static void _stp_transport_data_fs_stop(void) | |
{ | |
+ | |
if (atomic_read (&_stp_relay_data.transport_state) == STP_TRANSPORT_RUNNING) { | |
atomic_set (&_stp_relay_data.transport_state, STP_TRANSPORT_STOPPED); | |
del_timer_sync(&_stp_relay_data.timer); | |
dbug_trans(0, "flushing...\n"); | |
- if (_stp_relay_data.rchan) | |
- relay_flush(_stp_relay_data.rchan); | |
+ if (_stp_relay_data.rchan) { | |
+ struct rchan_buf *buf; | |
+ | |
+ // NB we cannot call relay_flush() directly here since | |
+ // we need to do inode locking ourselves. | |
+ | |
+#ifdef STP_BULKMODE | |
+ unsigned int i; | |
+ struct rchan *rchan = _stp_relay_data.rchan; | |
+ | |
+ for_each_possible_cpu(i) { | |
+ buf = _stp_get_rchan_subbuf(rchan->buf, i); | |
+ if (buf) { | |
+ struct inode *inode = buf->dentry->d_inode; | |
+ | |
+ /* NB we are in the syscall context which | |
+ * allows sleeping. The following inode | |
+ * locking might sleep. See PR26131. */ | |
+ _stp_lock_inode(inode); | |
+ | |
+ /* NB we intentionally avoids calling | |
+ * our own __stp_relay_switch_subbuf() | |
+ * since here we can sleep here */ | |
+ relay_switch_subbuf(buf, 0); | |
+ | |
+ _stp_unlock_inode(inode); | |
+ } | |
+ } | |
+#else /* !STP_BULKMODE */ | |
+ buf = _stp_get_rchan_subbuf(_stp_relay_data.rchan->buf, 0); | |
+ | |
+ if (buf != NULL) { | |
+ struct inode *inode = buf->dentry->d_inode; | |
+ | |
+ /* NB we are in the syscall context which allows | |
+ * sleeping. The following inode locking might | |
+ * sleep. See PR26131. */ | |
+ _stp_lock_inode(inode); | |
+ | |
+ /* NB we intentionally avoids calling | |
+ * our own __stp_relay_switch_subbuf() | |
+ * since here we can sleep here */ | |
+ relay_switch_subbuf(buf, 0); | |
+ | |
+ _stp_unlock_inode(inode); | |
+ } | |
+#endif | |
+ } | |
} | |
} | |
@@ -403,7 +450,12 @@ _stp_data_write_reserve(size_t size_request, void **entry) | |
return -EINVAL; | |
buf = _stp_get_rchan_subbuf(_stp_relay_data.rchan->buf, | |
- smp_processor_id()); | |
+#ifdef STP_BULKMODE | |
+ smp_processor_id() | |
+#else | |
+ 0 | |
+#endif | |
+ ); | |
if (unlikely(buf->offset + size_request > buf->chan->subbuf_size)) { | |
size_request = __stp_relay_switch_subbuf(buf, size_request); | |
if (!size_request) | |
@@ -426,3 +478,61 @@ static int _stp_data_write_commit(void *entry) | |
/* Nothing to do here. */ | |
return 0; | |
} | |
+ | |
+static noinline int _stp_transport_trylock_relay_inode(void) | |
+{ | |
+ unsigned i; | |
+ struct rchan_buf *buf; | |
+ struct inode *inode; | |
+#ifdef DEBUG_TRANS | |
+ cycles_t begin; | |
+#endif | |
+ | |
+ buf = _stp_get_rchan_subbuf(_stp_relay_data.rchan->buf, | |
+#ifdef STP_BULKMODE | |
+ smp_processor_id() | |
+#else | |
+ 0 | |
+#endif | |
+ ); | |
+ if (buf == NULL) | |
+ return 0; | |
+ | |
+ inode = buf->dentry->d_inode; | |
+ | |
+#ifdef DEBUG_TRANS | |
+ begin = get_cycles(); | |
+#endif | |
+ | |
+ // NB this bounded spinlock is needed for stream mode. it is observed | |
+ // that almost all of the iterations needed are less than 50K iterations | |
+ // or about 300K cycles. | |
+ for (i = 0; i < 50 * 1000; i++) { | |
+ if (_stp_trylock_inode(inode)) { | |
+ dbug_trans(3, "got inode lock: i=%u: cycles: %llu", i, | |
+ get_cycles() - begin); | |
+ return 1; | |
+ } | |
+ } | |
+ | |
+ dbug_trans(0, "failed to get inode lock: i=%u: cycles: %llu", i, | |
+ get_cycles() - begin); | |
+ return 0; | |
+} | |
+ | |
+static void _stp_transport_unlock_relay_inode(void) | |
+{ | |
+ struct rchan_buf *buf; | |
+ | |
+ buf = _stp_get_rchan_subbuf(_stp_relay_data.rchan->buf, | |
+#ifdef STP_BULKMODE | |
+ smp_processor_id() | |
+#else | |
+ 0 | |
+#endif | |
+ ); | |
+ if (buf == NULL) | |
+ return; | |
+ | |
+ _stp_unlock_inode(buf->dentry->d_inode); | |
+} | |
diff --git a/runtime/transport/relayfs.c b/runtime/transport/relayfs.c | |
index d8fa4e049..97d2dff8b 100644 | |
--- a/runtime/transport/relayfs.c | |
+++ b/runtime/transport/relayfs.c | |
@@ -186,3 +186,14 @@ static int _stp_data_write_commit(void *entry) | |
/* Nothing to do here. */ | |
return 0; | |
} | |
+ | |
+static int _stp_transport_trylock_relay_inode(void) | |
+{ | |
+ /* Nothing to do here. */ | |
+ return 1; | |
+} | |
+ | |
+static void _stp_transport_unlock_relay_inode(void) | |
+{ | |
+ /* Nothing to do here. */ | |
+} | |
diff --git a/runtime/transport/ring_buffer.c b/runtime/transport/ring_buffer.c | |
index 43e014bae..21e356973 100644 | |
--- a/runtime/transport/ring_buffer.c | |
+++ b/runtime/transport/ring_buffer.c | |
@@ -827,3 +827,14 @@ static void _stp_transport_data_fs_overwrite(int overwrite) | |
dbug_trans(0, "setting ovewrite to %d\n", overwrite); | |
_stp_relay_data.overwrite_flag = overwrite; | |
} | |
+ | |
+static int _stp_transport_trylock_relay_inode(void) | |
+{ | |
+ /* Nothing to do here. */ | |
+ return 1; | |
+} | |
+ | |
+static void _stp_transport_unlock_relay_inode(void) | |
+{ | |
+ /* Nothing to do here. */ | |
+} | |
diff --git a/runtime/transport/transport.c b/runtime/transport/transport.c | |
index 863a0c427..65eb9f791 100644 | |
--- a/runtime/transport/transport.c | |
+++ b/runtime/transport/transport.c | |
@@ -45,6 +45,10 @@ static int _stp_probes_started = 0; | |
* transport state flag is atomic. */ | |
static atomic_t _stp_transport_state = ATOMIC_INIT(_STP_TS_UNINITIALIZED); | |
+static inline int _stp_trylock_inode(struct inode *inode); | |
+static inline void _stp_lock_inode(struct inode *inode); | |
+static inline void _stp_unlock_inode(struct inode *inode); | |
+ | |
#ifndef STP_CTL_TIMER_INTERVAL | |
/* ctl timer interval in jiffies (default 20 ms) */ | |
#define STP_CTL_TIMER_INTERVAL ((HZ+49)/50) | |
@@ -497,6 +501,21 @@ err0: | |
return -1; | |
} | |
+/* returns 1 when the lock is successfully acquired, 0 otherwise. */ | |
+static inline int _stp_trylock_inode(struct inode *inode) | |
+{ | |
+#ifdef STAPCONF_INODE_RWSEM | |
+ return inode_trylock(inode); | |
+#else | |
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,16) | |
+ might_sleep(); | |
+ return mutex_trylock(&inode->i_mutex); | |
+#else | |
+ return ! down_trylock(&inode->i_sem); | |
+#endif | |
+#endif | |
+} | |
+ | |
static inline void _stp_lock_inode(struct inode *inode) | |
{ | |
#ifdef STAPCONF_INODE_RWSEM | |
diff --git a/runtime/transport/transport.h b/runtime/transport/transport.h | |
index 0cf2a7ca3..9f0f9ddf8 100644 | |
--- a/runtime/transport/transport.h | |
+++ b/runtime/transport/transport.h | |
@@ -67,6 +67,24 @@ enum _stp_transport_state { | |
*/ | |
static enum _stp_transport_state _stp_transport_get_state(void); | |
+/* | |
+ * _stp_transport_trylock_relay_inode | |
+ * | |
+ * This function locks the relay file inode to protect against relay readers | |
+ * (i.e., staprun/stapio). | |
+ * Returns whether the lock is successfully obtained. | |
+ */ | |
+static noinline int _stp_transport_trylock_relay_inode(void); | |
+ | |
+/* | |
+ * _stp_transport_unlock_relay_inode | |
+ * | |
+ * This function releases the lock obtained by | |
+ * _stp_transport_trylock_relay_inode. | |
+ * should only call this when the lock is indeed obtained. | |
+ */ | |
+static void _stp_transport_unlock_relay_inode(void); | |
+ | |
/* | |
* _stp_transport_data_fs_init | |
* |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment