Skip to content

Instantly share code, notes, and snippets.

@agentzh
Last active May 18, 2023 05:05
Show Gist options
  • Save agentzh/0628614ac04f773c6662d225100924be to your computer and use it in GitHub Desktop.
Save agentzh/0628614ac04f773c6662d225100924be to your computer and use it in GitHub Desktop.
commit dce3fd8814d3666996b7b22b121f36f953176daf
Author: Yichun Zhang (agentzh) <yichun@openresty.com>
Date: Tue May 16 17:17:07 2023 -0700
PR30456: Kernel 5.18+ removed set_fs(), so we should use its replacement, user_access_begin()/user_access_end(), for userland memory accesses.
Alas, user_access_begin() in newer kernels also performs access_ok() itself,
so for now we pay the small price of a duplicate access_ok() check here.
We also use __access_ok() instead of access_ok() to avoid warnings on debug kernels.
We use __always_inline instead of inline for the C functions that might
be called within regions where UACCESS is enabled (to please objtool's checker).
diff --git a/buildrun.cxx b/buildrun.cxx
index bff852c14..d783a0393 100644
--- a/buildrun.cxx
+++ b/buildrun.cxx
@@ -614,6 +614,11 @@ compile_pass (systemtap_session& s)
// used by userland memory reads
output_autoconf(s, o, cs, "autoconf-nmi-uaccess-okay.c", "STAPCONF_NMI_UACCESS_OKAY", NULL);
+ output_autoconf(s, o, cs, "autoconf-asm-access-ok.c", "STAPCONF_ASM_ACCESS_OK", NULL);
+ output_autoconf(s, o, cs, "autoconf-user-access-begin-2-args.c", "STAPCONF_USER_ACCESS_BEGIN_2_ARGS", NULL);
+ output_autoconf(s, o, cs, "autoconf-user-access-begin-3-args.c", "STAPCONF_USER_ACCESS_BEGIN_3_ARGS", NULL);
+ output_autoconf(s, o, cs, "autoconf-user-access-end.c", "STAPCONF_USER_ACCESS_END", NULL);
+ output_autoconf(s, o, cs, "autoconf-asm-tlbflush-h.c", "STAPCONF_ASM_TLBFLUSH_H", NULL);
// used by runtime/uprobe-inode.c
output_either_exportconf(s, o2, "uprobe_register", "register_uprobe",
diff --git a/runtime/dyninst/loc2c-runtime.h b/runtime/dyninst/loc2c-runtime.h
index 978c143ec..f46ead198 100644
--- a/runtime/dyninst/loc2c-runtime.h
+++ b/runtime/dyninst/loc2c-runtime.h
@@ -77,5 +77,9 @@
#define kderef uderef
#define store_kderef store_uderef
+#define stp_mem_access_begin(type, ptr, size, oldfs, seg, is_user_ptr) /* dyninst runtime: the access begin/end helpers below all expand to nothing */
+#define stp_mem_access_end(oldfs, is_user)
+#define stp_user_access_begin(type, ptr, size, oldfs, seg)
+#define stp_user_access_end(oldfs)
#endif /* _STAPDYN_LOC2C_RUNTIME_H_ */
diff --git a/runtime/linux/addr-map.c b/runtime/linux/addr-map.c
index 183f5e819..a0b8919f5 100644
--- a/runtime/linux/addr-map.c
+++ b/runtime/linux/addr-map.c
@@ -36,7 +36,13 @@
#endif
-#ifdef STAPCONF_ACCESS_OK_2ARGS
+#ifdef STAPCONF_ASM_ACCESS_OK
+/* access_ok() is designed for user context only and calling it from
+ * perf event etc probe contexts would result in kernel warnings
+ * on debug kernels. see upstream kernel commit d319f34456 for more details.
+ */
+#define stp_access_ok(x,y,z) __access_ok(y,z)
+#elif defined(STAPCONF_ACCESS_OK_2ARGS)
#define stp_access_ok(x,y,z) access_ok(y,z)
#else
#define stp_access_ok(x,y,z) access_ok(x,y,z)
@@ -64,7 +70,7 @@ stp_nmi_uaccess_okay(void)
#endif
}
-static int
+static __always_inline int
lookup_bad_addr_user(const int type, const unsigned long addr, const size_t size)
{
/* Is this a valid memory access? */
@@ -116,18 +122,17 @@ lookup_bad_addr_user(const int type, const unsigned long addr, const size_t size
return 0;
}
-static int
+static __always_inline int
lookup_bad_addr(const int type, const unsigned long addr, const size_t size,
const stp_mm_segment_t seg)
{
-#ifndef STAPCONF_SET_FS
/* Is this a valid memory access?
*
* PR26811: Since kernel 5.10 due to set_fs() removal we need to
* distinguish kernel- and user-space addresses when asking this
* question.
*/
- if (!MM_SEG_IS_KERNEL(seg))
+ if (stp_is_user_ds(seg))
return lookup_bad_addr_user(type, addr, size);
/* For kernel addr, skip the in_task() portion of the address checks: */
@@ -151,10 +156,6 @@ lookup_bad_addr(const int type, const unsigned long addr, const size_t size,
#endif
return 0;
-#else
- /* XXX: On earlier kernels the same logic works for kernel-space: */
- return lookup_bad_addr_user(type, addr, size);
-#endif
}
#else /* PR12970 */
diff --git a/runtime/linux/autoconf-asm-access-ok.c b/runtime/linux/autoconf-asm-access-ok.c
new file mode 100644
index 000000000..1d0e61776
--- /dev/null
+++ b/runtime/linux/autoconf-asm-access-ok.c
@@ -0,0 +1,6 @@
+#include <linux/uaccess.h>
+/* Probe behind STAPCONF_ASM_ACCESS_OK: presumably only compiled (never run) to detect a usable two-argument __access_ok(). */
+bool foo(const void __user *ptr, size_t size)
+{
+ return __access_ok(ptr, size);
+}
diff --git a/runtime/linux/autoconf-asm-tlbflush-h.c b/runtime/linux/autoconf-asm-tlbflush-h.c
new file mode 100644
index 000000000..5e99c2d0d
--- /dev/null
+++ b/runtime/linux/autoconf-asm-tlbflush-h.c
@@ -0,0 +1 @@
+#include <asm/tlbflush.h>
diff --git a/runtime/linux/autoconf-user-access-begin-2-args.c b/runtime/linux/autoconf-user-access-begin-2-args.c
new file mode 100644
index 000000000..2ecc5af9e
--- /dev/null
+++ b/runtime/linux/autoconf-user-access-begin-2-args.c
@@ -0,0 +1,6 @@
+#include <linux/uaccess.h>
+/* Probe behind STAPCONF_USER_ACCESS_BEGIN_2_ARGS: presumably only compiled (never run) to detect user_access_begin(ptr, size). */
+bool foo(const void __user *ptr, size_t size)
+{
+ return user_access_begin(ptr, size);
+}
diff --git a/runtime/linux/autoconf-user-access-begin-3-args.c b/runtime/linux/autoconf-user-access-begin-3-args.c
new file mode 100644
index 000000000..2942ddf08
--- /dev/null
+++ b/runtime/linux/autoconf-user-access-begin-3-args.c
@@ -0,0 +1,6 @@
+#include <linux/uaccess.h>
+/* Probe behind STAPCONF_USER_ACCESS_BEGIN_3_ARGS: presumably only compiled (never run) to detect user_access_begin(mode, ptr, size). */
+bool foo(int mode, const void __user *ptr, size_t size)
+{
+ return user_access_begin(mode, ptr, size);
+}
diff --git a/runtime/linux/autoconf-user-access-end.c b/runtime/linux/autoconf-user-access-end.c
new file mode 100644
index 000000000..4eafb8222
--- /dev/null
+++ b/runtime/linux/autoconf-user-access-end.c
@@ -0,0 +1,6 @@
+#include <linux/uaccess.h>
+/* Probe behind STAPCONF_USER_ACCESS_END: presumably only compiled (never run) to detect user_access_end(). */
+void foo(void)
+{
+ user_access_end();
+}
diff --git a/runtime/linux/copy.c b/runtime/linux/copy.c
index 9a96d5951..c9862b06e 100644
--- a/runtime/linux/copy.c
+++ b/runtime/linux/copy.c
@@ -62,10 +62,9 @@ static long _stp_strncpy_from_user(char *dst, const char __user *src,
static unsigned long _stp_copy_from_user(char *dst, const char __user *src, unsigned long count)
{
if (count) {
-#ifdef STAPCONF_SET_FS
- mm_segment_t _oldfs = get_fs();
- set_fs(USER_DS);
-#endif
+ stp_mm_segment_t oldfs;
+ if (!stp_user_access_begin(VERIFY_READ, src, count, &oldfs, STP_USER_DS))
+ goto done;
pagefault_disable();
if (!lookup_bad_addr(VERIFY_READ, (const unsigned long)src, count, STP_USER_DS))
count = __copy_from_user_inatomic(dst, src, count);
@@ -75,10 +74,9 @@ static unsigned long _stp_copy_from_user(char *dst, const char __user *src, unsi
* can't trust 'count' to be reasonable. */
count = -EFAULT;
pagefault_enable();
-#ifdef STAPCONF_SET_FS
- set_fs(_oldfs);
-#endif
+ stp_user_access_end(oldfs);
}
+done:
return count;
}
diff --git a/runtime/linux/loc2c-runtime.h b/runtime/linux/loc2c-runtime.h
index f6c9accec..80252ad80 100644
--- a/runtime/linux/loc2c-runtime.h
+++ b/runtime/linux/loc2c-runtime.h
@@ -18,6 +18,9 @@
#include <asm/uaccess.h>
#endif
#include <linux/types.h>
+#ifdef STAPCONF_ASM_TLBFLUSH_H
+#include <asm/tlbflush.h>
+#endif
#define intptr_t long
#define uintptr_t unsigned long
@@ -446,7 +449,7 @@ typedef typeof(&copy_to_kernel_nofault) copy_to_kernel_nofault_fn;
* that the kernel doesn't pagefault while reading.
*/
-static inline int __stp_deref_nocheck_(u64 *pv, size_t size,
+static __always_inline int __stp_deref_nocheck_(u64 *pv, size_t size,
void *addr, stp_mm_segment_t seg)
{
u64 v = 0;
@@ -492,6 +495,77 @@ static inline int __stp_deref_nocheck_(u64 *pv, size_t size,
})
+static __always_inline bool
+stp_is_user_ds(stp_mm_segment_t seg) /* true iff seg equals STP_USER_DS */
+{
+#ifdef STP_NUMERICAL_DS
+ return seg == STP_USER_DS; /* segment type can be compared directly with == */
+#else
+ stp_mm_segment_t user_seg = STP_USER_DS;
+ return memcmp(&seg, &user_seg, sizeof(stp_mm_segment_t)) == 0; /* presumably a type == cannot compare, so compare the bytes -- TODO confirm */
+#endif
+}
+
+static __always_inline bool
+stp_user_access_begin(int type, const void *ptr, size_t size, /* returns nonzero when the access may proceed */
+ stp_mm_segment_t *oldfs, stp_mm_segment_t seg)
+{
+#ifdef STAPCONF_SET_FS
+ *oldfs = get_fs(); /* saved so that stp_user_access_end() can restore it */
+ set_fs(seg);
+ return 1; /* this path always reports success */
+#elif defined(STAPCONF_USER_ACCESS_BEGIN_3_ARGS)
+ return user_access_begin(type, ptr, size); /* variant that still takes the VERIFY_* type */
+#elif defined(STAPCONF_USER_ACCESS_BEGIN_2_ARGS)
+ return user_access_begin(ptr, size); /* type, oldfs and seg are unused on this path */
+#else
+ /* for very old kernels: neither set_fs() nor user_access_begin() was detected; *oldfs is left untouched */
+ return 1;
+#endif
+}
+
+static __always_inline void
+stp_user_access_end(stp_mm_segment_t oldfs) /* counterpart of stp_user_access_begin() */
+{
+#ifdef STAPCONF_SET_FS
+ set_fs(oldfs); /* restore the segment saved by stp_user_access_begin() */
+#elif defined(STAPCONF_USER_ACCESS_END)
+ user_access_end(); /* oldfs is unused on this path */
+#else
+ /* do nothing for very old kernels */
+#endif
+}
+
+static __always_inline bool
+stp_mem_access_begin(int type, const void *ptr, size_t size, /* returns nonzero when the access may proceed */
+ stp_mm_segment_t *oldfs, stp_mm_segment_t seg, bool *is_user_ptr)
+{
+ bool is_user = stp_is_user_ds(seg);
+ *is_user_ptr = is_user; /* handed back so the caller can pass it to stp_mem_access_end() */
+ if (is_user)
+ return stp_user_access_begin(type, ptr, size, oldfs, seg);
+
+ /* for kernel memory accesses: only the current fs is saved (when set_fs() exists) */
+ /* NOTE(review): unlike the get_fs()/set_fs(seg) pairs this patch removes, set_fs(seg) is not called on this path -- confirm that is intended */
+#ifdef STAPCONF_SET_FS
+ *oldfs = get_fs();
+#endif
+ return 1;
+}
+
+static __always_inline void
+stp_mem_access_end(stp_mm_segment_t oldfs, bool is_user) /* counterpart of stp_mem_access_begin() */
+{
+ if (is_user) {
+ stp_user_access_end(oldfs); /* plain call: a void function must not use `return <expr>;` */
+ return;
+ }
+#ifdef STAPCONF_SET_FS
+ /* not for userland: put back the fs value captured by stp_mem_access_begin() */
+ set_fs(oldfs);
+#endif
+}
+
/*
* _stp_lookup_bad_addr(): safely verify an address
*
@@ -507,7 +581,7 @@ static inline int __stp_deref_nocheck_(u64 *pv, size_t size,
* memory.
*/
-static inline int _stp_lookup_bad_addr_(int type, size_t size,
+static __always_inline int _stp_lookup_bad_addr_(int type, size_t size,
uintptr_t addr, stp_mm_segment_t seg)
{
int bad;
@@ -545,25 +619,34 @@ static inline int _stp_lookup_bad_addr_(int type, size_t size,
* pagefault when trying to read the memory.
*/
-static inline int _stp_deref_nofault_(u64 *pv, size_t size, void *addr,
+static __always_inline int _stp_deref_nofault_(u64 *pv, size_t size, void *addr,
stp_mm_segment_t seg)
{
int r = -EFAULT;
-#ifdef STAPCONF_SET_FS
- mm_segment_t oldfs = get_fs();
-
- set_fs(seg);
-#endif
pagefault_disable();
if (lookup_bad_addr(VERIFY_READ, (uintptr_t)addr, size, seg))
r = -EFAULT;
- else
+ else {
+ stp_mm_segment_t oldfs;
+ bool is_user;
+
+ /* NB just to suppress -Werror=maybe-uninitialized warnings from older
+ * GCC like version 8.3 with -O3 */
+#ifdef STP_NUMERICAL_DS
+ oldfs = 0;
+#else
+ memset(&oldfs, 0, sizeof(stp_mm_segment_t));
+#endif
+
+ if (!stp_mem_access_begin(VERIFY_READ, addr, size, &oldfs, seg, &is_user))
+ goto done; /* r still holds its initial -EFAULT */
+
r = __stp_deref_nocheck_(pv, size, addr, seg);
- pagefault_enable();
-#ifdef STAPCONF_SET_FS
- set_fs(oldfs);
-#endif
+ stp_mem_access_end(oldfs, is_user); /* pairs with the successful stp_mem_access_begin() above */
+ }
+done:
+ pagefault_enable(); /* reached on every path, balancing pagefault_disable() */
return r;
}
@@ -691,21 +774,22 @@ static inline int _stp_store_deref_(size_t size, void *addr, u64 v,
stp_mm_segment_t seg)
{
int r;
-#ifdef STAPCONF_SET_FS
- mm_segment_t oldfs = get_fs();
-
- set_fs(seg);
-#endif
+ stp_mm_segment_t oldfs;
+ bool is_user;
pagefault_disable();
- if (lookup_bad_addr(VERIFY_READ, (uintptr_t)addr, size, seg))
+ if (lookup_bad_addr(VERIFY_WRITE, (uintptr_t)addr, size, seg)) {
r = -EFAULT;
- else
+ } else {
+ if (!stp_mem_access_begin(VERIFY_WRITE, addr, size, &oldfs, seg, &is_user)) {
+ r = -EFAULT;
+ goto done;
+ }
+
r = __stp_store_deref_nocheck_(size, addr, v, seg);
+ stp_mem_access_end(oldfs, is_user);
+ }
+done:
pagefault_enable();
-#ifdef STAPCONF_SET_FS
- set_fs(oldfs);
-#endif
-
return r;
}
@@ -857,16 +941,16 @@ static inline long _stp_deref_string_nofault(char *dst, const char *addr,
{
int err = 0;
size_t i = 0;
-#ifdef STAPCONF_SET_FS
- mm_segment_t oldfs = get_fs();
-
- set_fs(seg);
-#endif
+ stp_mm_segment_t oldfs;
+ bool is_user;
pagefault_disable();
if (lookup_bad_addr(VERIFY_READ, (uintptr_t)addr, len, seg))
err = 1;
else
{
+ if ((err = !stp_mem_access_begin(VERIFY_READ, addr, len, &oldfs, seg, &is_user)))
+ goto done; /* begin failed: err is now 1, so -EFAULT is returned instead of a 0-length success */
+
/* Reduce len by 1 to leave room for '\0' terminator. */
for (i = 0; i + 1 < len; ++i)
{
@@ -879,12 +963,11 @@ static inline long _stp_deref_string_nofault(char *dst, const char *addr,
}
if (!err && dst)
*dst = '\0';
+
+ stp_mem_access_end(oldfs, is_user);
}
+done:
pagefault_enable();
-#ifdef STAPCONF_SET_FS
- set_fs(oldfs);
-#endif
-
return err ? -EFAULT : i;
}
@@ -915,15 +998,19 @@ static inline int _stp_store_deref_string_(char *src, void *addr, size_t len,
{
int err = 0;
size_t i;
-#ifdef STAPCONF_SET_FS
- mm_segment_t oldfs = get_fs();
+ stp_mm_segment_t oldfs;
+ bool is_user;
- set_fs(seg);
-#endif
pagefault_disable();
- if (lookup_bad_addr(VERIFY_WRITE, (uintptr_t)addr, len, seg))
+ if (lookup_bad_addr(VERIFY_WRITE, (uintptr_t)addr, len, seg)) {
err = 1;
- else if (len > 0)
+ goto done;
+ }
+
+ if ((err = !stp_mem_access_begin(VERIFY_WRITE, addr, len, &oldfs, seg, &is_user)))
+ goto done; /* begin failed: err is now 1, so the store is not reported as successful */
+
+ if (len > 0)
{
for (i = 0; i < len - 1; ++i)
{
@@ -933,12 +1020,12 @@ static inline int _stp_store_deref_string_(char *src, void *addr, size_t len,
}
err = __stp_put_either('\0', (u8 *)addr + i, seg);
}
- out:
+
+out:
+ stp_mem_access_end(oldfs, is_user);
+
+ done:
pagefault_enable();
-#ifdef STAPCONF_SET_FS
- set_fs(oldfs);
-#endif
-
return err;
}
@@ -955,28 +1042,30 @@ static inline int _stp_store_deref_bin_string_(char *src, void *addr, size_t len
{
int err = 0;
size_t i;
-#ifdef STAPCONF_SET_FS
- mm_segment_t oldfs = get_fs();
- set_fs(seg);
-#endif
pagefault_disable();
if (lookup_bad_addr(VERIFY_WRITE, (uintptr_t)addr, len, seg))
err = 1;
else if (len > 0)
{
+ stp_mm_segment_t oldfs;
+ bool is_user;
+
+ if ((err = !stp_mem_access_begin(VERIFY_WRITE, addr, len, &oldfs, seg, &is_user))) /* a store: VERIFY_WRITE, matching lookup_bad_addr() above */
+ goto done; /* begin failed: err is now 1, so the store is not reported as successful */
+
for (i = 0; i < len; ++i)
{
err = __stp_put_either(*src++, (u8 *)addr + i, seg);
if (err)
goto out;
}
+
+ out:
+ stp_mem_access_end(oldfs, is_user);
}
- out:
+done:
pagefault_enable();
-#ifdef STAPCONF_SET_FS
- set_fs(oldfs);
-#endif
return err;
}
diff --git a/runtime/linux/runtime.h b/runtime/linux/runtime.h
index eb26461b0..d3ff9b566 100644
--- a/runtime/linux/runtime.h
+++ b/runtime/linux/runtime.h
@@ -59,6 +59,7 @@
#define stp_mm_segment_t unsigned long
#define STP_KERNEL_DS 0
#define STP_USER_DS 1
+#define STP_NUMERICAL_DS 1
#define MM_SEG_IS_KERNEL(seg) ((seg)==STP_KERNEL_DS)
// Required for kernel write operations:
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment