Skip to content

Instantly share code, notes, and snippets.

@agentzh
Created December 17, 2023 04:30
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save agentzh/d54da2db9521f26fd6f700d270f48334 to your computer and use it in GitHub Desktop.
Save agentzh/d54da2db9521f26fd6f700d270f48334 to your computer and use it in GitHub Desktop.
commit d0e9867d305f01b51d8c5821a340ee0bdae0f1d2
Author: Yichun Zhang (agentzh) <yichun@openresty.com>
Date: Sat Dec 16 17:35:03 2023 -0800
PR31176: fix spin lock deadlocks in memory pool allocations for mixed NMI and non-NMI contexts
The kernel's lockdep finds deadlocks in the stap memory pool allocator's
spinlocks when mixing NMI and non-NMI contexts.
Now we use trylock in the NMI context to avoid waiting forever on a lock
held by a non-NMI context (which is interrupted by NMI).
This bug can be reproduced by running TEST 4 in
testsuite/systemtap.base/kernel-hw-breakpoint-addr.exp on a
lockdep-enabled kernel that is recent enough (like 5.11.22).
diff --git a/runtime/mempool.c b/runtime/mempool.c
index 1f206b216..661ea2107 100644
--- a/runtime/mempool.c
+++ b/runtime/mempool.c
@@ -75,20 +75,27 @@ err:
/* allocate a buffer from a memory pool */
static void *_stp_mempool_alloc(_stp_mempool_t *pool)
{
- unsigned long flags;
+ unsigned long flags = 0;
struct _stp_mem_buffer *ptr = NULL;
+ void *ret = NULL;
+
/* PR14804: tolerate accidental early call, before pool is
actually initialized. */
if (pool == NULL)
return NULL;
- stp_spin_lock_irqsave(&pool->lock, flags);
+
+ stp_nmi_spin_lock_irqsave(&pool->lock, flags, no_lock);
+
if (likely(!list_empty(&pool->free_list))) {
ptr = (struct _stp_mem_buffer *)pool->free_list.next;
list_del_init(&ptr->list);
- stp_spin_unlock_irqrestore(&pool->lock, flags);
- return &ptr->buf;
+ ret = &ptr->buf;
}
- stp_spin_unlock_irqrestore(&pool->lock, flags);
+
+ stp_nmi_spin_unlock_irqrestore(&pool->lock, flags);
+ return ret;
+
+no_lock:
return NULL;
}
diff --git a/runtime/stp_helper_lock.h b/runtime/stp_helper_lock.h
index d2961b74c..72ac72c2c 100644
--- a/runtime/stp_helper_lock.h
+++ b/runtime/stp_helper_lock.h
@@ -17,6 +17,22 @@
#include <linux/spinlock.h>
+#define stp_nmi_spin_lock_irqsave(lock, flags, label) \
+ if (in_nmi()) { \
+ if (!stp_spin_trylock(lock)) { \
+ goto label; \
+ } \
+ } else { \
+ stp_spin_lock_irqsave(lock, flags); \
+ }
+
+#define stp_nmi_spin_unlock_irqrestore(lock, flags) \
+ if (in_nmi()) { \
+ stp_spin_unlock(lock); \
+ } else { \
+ stp_spin_unlock_irqrestore(lock, flags); \
+ }
+
#if defined(CONFIG_PREEMPT_RT_FULL) || defined(CONFIG_PREEMPT_RT)
#define stp_spinlock_t raw_spinlock_t
@@ -28,6 +44,7 @@ static inline void stp_spin_lock_init(raw_spinlock_t *lock) { raw_spin_lock_init
static inline void stp_spin_lock(raw_spinlock_t *lock) { raw_spin_lock(lock); }
static inline void stp_spin_unlock(raw_spinlock_t *lock) { raw_spin_unlock(lock); }
+#define stp_spin_trylock(lock) raw_spin_trylock(lock)
#define stp_spin_lock_irqsave(lock, flags) raw_spin_lock_irqsave(lock, flags)
#define stp_spin_unlock_irqrestore(lock, flags) raw_spin_unlock_irqrestore(lock, flags)
@@ -61,6 +78,7 @@ static inline void stp_spin_lock_init(spinlock_t *lock) { spin_lock_init(lock);
static inline void stp_spin_lock(spinlock_t *lock) { spin_lock(lock); }
static inline void stp_spin_unlock(spinlock_t *lock) { spin_unlock(lock); }
+#define stp_spin_trylock(lock) spin_trylock(lock)
#define stp_spin_lock_irqsave(lock, flags) spin_lock_irqsave(lock, flags)
#define stp_spin_unlock_irqrestore(lock, flags) spin_unlock_irqrestore(lock, flags)
diff --git a/runtime/transport/control.c b/runtime/transport/control.c
index 35608731c..2306b8913 100644
--- a/runtime/transport/control.c
+++ b/runtime/transport/control.c
@@ -418,7 +418,7 @@ static void _stp_ctl_free_special_buffers(void)
static struct _stp_buffer *_stp_ctl_get_buffer(int type, const char *data,
unsigned len)
{
- unsigned long flags;
+ unsigned long flags = 0;
struct _stp_buffer *bptr = NULL;
/* Is it a dynamically allocated message type? */
@@ -481,12 +481,12 @@ static struct _stp_buffer *_stp_ctl_get_buffer(int type, const char *data,
}
if (bptr != NULL) {
/* OK, it is a special one, but is it free? */
- stp_spin_lock_irqsave(&_stp_ctl_special_msg_lock, flags);
+ stp_nmi_spin_lock_irqsave(&_stp_ctl_special_msg_lock, flags, failed);
if (bptr->type == _STP_CTL_MSG_UNUSED)
bptr->type = type;
else
bptr = NULL;
- stp_spin_unlock_irqrestore(&_stp_ctl_special_msg_lock, flags);
+ stp_nmi_spin_unlock_irqrestore(&_stp_ctl_special_msg_lock, flags);
}
/* Got a special message buffer, with type set, fill it in,
@@ -501,6 +501,9 @@ static struct _stp_buffer *_stp_ctl_get_buffer(int type, const char *data,
}
}
return bptr;
+
+failed:
+ return NULL;
}
/* Returns the given buffer to the pool when dynamically allocated.
@@ -535,7 +538,7 @@ static int _stp_ctl_send(int type, void *data, unsigned len)
{
struct context* __restrict__ c = NULL;
struct _stp_buffer *bptr;
- unsigned long flags;
+ unsigned long flags = 0;
unsigned hlen;
#ifdef DEBUG_TRANS
@@ -567,22 +570,23 @@ static int _stp_ctl_send(int type, void *data, unsigned len)
skipped rather than deadlock. */
c = _stp_runtime_entryfn_get_context();
+ stp_nmi_spin_lock_irqsave(&_stp_ctl_ready_lock, flags, no_lock);
+
/* get a buffer from the free pool */
bptr = _stp_ctl_get_buffer(type, data, len);
if (unlikely(bptr == NULL)) {
/* Nothing else we can do... but let's not spam the kernel
with these reports. */
/* printk(KERN_ERR "ctl_write_msg type=%d len=%d ENOMEM\n", type, len); */
- _stp_runtime_entryfn_put_context(c);
- return -ENOMEM;
+ goto no_mem;
}
/* Put it on the pool of ready buffers. It's possible to recursively
hit a probe here, like a kprobe in NMI or the lock tracepoints, but
they will be squashed since we're holding the context busy. */
- stp_spin_lock_irqsave(&_stp_ctl_ready_lock, flags);
list_add_tail(&bptr->list, &_stp_ctl_ready_q);
- stp_spin_unlock_irqrestore(&_stp_ctl_ready_lock, flags);
+
+ stp_nmi_spin_unlock_irqrestore(&_stp_ctl_ready_lock, flags);
_stp_runtime_entryfn_put_context(c);
@@ -590,6 +594,15 @@ static int _stp_ctl_send(int type, void *data, unsigned len)
timer at this point, but calling mod_timer() at this
point would bring in more locking issues... */
return len + sizeof(bptr->type);
+
+no_lock:
+ _stp_runtime_entryfn_put_context(c);
+ return -EBUSY;
+
+no_mem:
+ stp_nmi_spin_unlock_irqrestore(&_stp_ctl_ready_lock, flags);
+ _stp_runtime_entryfn_put_context(c);
+ return -ENOMEM;
}
/* Logs a warning or error through the control channel. This function mimics
@@ -603,12 +616,15 @@ static void _stp_ctl_log_werr(const char *logtype, size_t logtype_len,
{
struct context *__restrict__ c;
struct _stp_buffer *bptr;
- unsigned long flags;
+ unsigned long flags = 0;
c = _stp_runtime_entryfn_get_context();
+
+ stp_nmi_spin_lock_irqsave(&_stp_ctl_ready_lock, flags, put_context);
+
bptr = _stp_ctl_get_buffer(STP_OOB_DATA, logtype, logtype_len);
if (!bptr)
- goto put_context;
+ goto unlock;
/*
* This is a generic failure message for when there's no space left. We
@@ -634,9 +650,11 @@ static void _stp_ctl_log_werr(const char *logtype, size_t logtype_len,
bptr->buf[bptr->len++] = '\n';
send_msg:
- stp_spin_lock_irqsave(&_stp_ctl_ready_lock, flags);
list_add_tail(&bptr->list, &_stp_ctl_ready_q);
- stp_spin_unlock_irqrestore(&_stp_ctl_ready_lock, flags);
+
+unlock:
+ stp_nmi_spin_unlock_irqrestore(&_stp_ctl_ready_lock, flags);
+
put_context:
_stp_runtime_entryfn_put_context(c);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment