-
-
Save agentzh/0628614ac04f773c6662d225100924be to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
commit dce3fd8814d3666996b7b22b121f36f953176daf | |
Author: Yichun Zhang (agentzh) <yichun@openresty.com> | |
Date: Tue May 16 17:17:07 2023 -0700 | |
PR30456: kernel 5.18+ removed set_fs() and we should use the new replacement, user_access_begin()/user_access_end() for userland memory accesses. | |
Alas, user_access_begin() in newer kernels also performs access_ok() itself, | |
so for now we pay the small price of a duplicate access_ok() check here. | |
We also use __access_ok() instead of access_ok() to avoid warnings on debug kernels. | |
We use __always_inline instead of inline for the C functions which might | |
be within the regions with UACCESS enabled (to please objtool's checker). | |
diff --git a/buildrun.cxx b/buildrun.cxx | |
index bff852c14..d783a0393 100644 | |
--- a/buildrun.cxx | |
+++ b/buildrun.cxx | |
@@ -614,6 +614,11 @@ compile_pass (systemtap_session& s) | |
// used by userland memory reads | |
output_autoconf(s, o, cs, "autoconf-nmi-uaccess-okay.c", "STAPCONF_NMI_UACCESS_OKAY", NULL); | |
+ output_autoconf(s, o, cs, "autoconf-asm-access-ok.c", "STAPCONF_ASM_ACCESS_OK", NULL); | |
+ output_autoconf(s, o, cs, "autoconf-user-access-begin-2-args.c", "STAPCONF_USER_ACCESS_BEGIN_2_ARGS", NULL); | |
+ output_autoconf(s, o, cs, "autoconf-user-access-begin-3-args.c", "STAPCONF_USER_ACCESS_BEGIN_3_ARGS", NULL); | |
+ output_autoconf(s, o, cs, "autoconf-user-access-end.c", "STAPCONF_USER_ACCESS_END", NULL); | |
+ output_autoconf(s, o, cs, "autoconf-asm-tlbflush-h.c", "STAPCONF_ASM_TLBFLUSH_H", NULL); | |
// used by runtime/uprobe-inode.c | |
output_either_exportconf(s, o2, "uprobe_register", "register_uprobe", | |
diff --git a/runtime/dyninst/loc2c-runtime.h b/runtime/dyninst/loc2c-runtime.h | |
index 978c143ec..f46ead198 100644 | |
--- a/runtime/dyninst/loc2c-runtime.h | |
+++ b/runtime/dyninst/loc2c-runtime.h | |
@@ -77,5 +77,9 @@ | |
#define kderef uderef | |
#define store_kderef store_uderef | |
+#define stp_mem_access_begin(type, ptr, size, oldfs, seg, is_user_ptr) | |
+#define stp_mem_access_end(oldfs, is_user) | |
+#define stp_user_access_begin(type, ptr, size, oldfs, seg) | |
+#define stp_user_access_end(oldfs) | |
#endif /* _STAPDYN_LOC2C_RUNTIME_H_ */ | |
diff --git a/runtime/linux/addr-map.c b/runtime/linux/addr-map.c | |
index 183f5e819..a0b8919f5 100644 | |
--- a/runtime/linux/addr-map.c | |
+++ b/runtime/linux/addr-map.c | |
@@ -36,7 +36,13 @@ | |
#endif | |
-#ifdef STAPCONF_ACCESS_OK_2ARGS | |
+#ifdef STAPCONF_ASM_ACCESS_OK | |
+/* access_ok() is designed for user context only and calling it from | |
+ * perf event etc probe contexts would result in kernel warnings | |
+ * on debug kernels. see upstream kernel commit d319f34456 for more details. | |
+ */ | |
+#define stp_access_ok(x,y,z) __access_ok(y,z) | |
+#elif defined(STAPCONF_ACCESS_OK_2ARGS) | |
#define stp_access_ok(x,y,z) access_ok(y,z) | |
#else | |
#define stp_access_ok(x,y,z) access_ok(x,y,z) | |
@@ -64,7 +70,7 @@ stp_nmi_uaccess_okay(void) | |
#endif | |
} | |
-static int | |
+static __always_inline int | |
lookup_bad_addr_user(const int type, const unsigned long addr, const size_t size) | |
{ | |
/* Is this a valid memory access? */ | |
@@ -116,18 +122,17 @@ lookup_bad_addr_user(const int type, const unsigned long addr, const size_t size | |
return 0; | |
} | |
-static int | |
+static __always_inline int | |
lookup_bad_addr(const int type, const unsigned long addr, const size_t size, | |
const stp_mm_segment_t seg) | |
{ | |
-#ifndef STAPCONF_SET_FS | |
/* Is this a valid memory access? | |
* | |
* PR26811: Since kernel 5.10 due to set_fs() removal we need to | |
* distinguish kernel- and user-space addresses when asking this | |
* question. | |
*/ | |
- if (!MM_SEG_IS_KERNEL(seg)) | |
+ if (stp_is_user_ds(seg)) | |
return lookup_bad_addr_user(type, addr, size); | |
/* For kernel addr, skip the in_task() portion of the address checks: */ | |
@@ -151,10 +156,6 @@ lookup_bad_addr(const int type, const unsigned long addr, const size_t size, | |
#endif | |
return 0; | |
-#else | |
- /* XXX: On earlier kernels the same logic works for kernel-space: */ | |
- return lookup_bad_addr_user(type, addr, size); | |
-#endif | |
} | |
#else /* PR12970 */ | |
diff --git a/runtime/linux/autoconf-asm-access-ok.c b/runtime/linux/autoconf-asm-access-ok.c | |
new file mode 100644 | |
index 000000000..1d0e61776 | |
--- /dev/null | |
+++ b/runtime/linux/autoconf-asm-access-ok.c | |
@@ -0,0 +1,6 @@ | |
+#include <linux/uaccess.h> | |
+ | |
+/* Compile probe: exercises a two-argument __access_ok(ptr, size) call | |
+ * after including <linux/uaccess.h>. buildrun.cxx lists this file | |
+ * together with STAPCONF_ASM_ACCESS_OK (presumably defined only when | |
+ * this file compiles -- confirm against output_autoconf()). | |
+ */ | |
+bool foo(const void __user *ptr, size_t size) | |
+{ | |
+ return __access_ok(ptr, size); | |
+} | |
diff --git a/runtime/linux/autoconf-asm-tlbflush-h.c b/runtime/linux/autoconf-asm-tlbflush-h.c | |
new file mode 100644 | |
index 000000000..5e99c2d0d | |
--- /dev/null | |
+++ b/runtime/linux/autoconf-asm-tlbflush-h.c | |
@@ -0,0 +1 @@ | |
+#include <asm/tlbflush.h> | |
diff --git a/runtime/linux/autoconf-user-access-begin-2-args.c b/runtime/linux/autoconf-user-access-begin-2-args.c | |
new file mode 100644 | |
index 000000000..2ecc5af9e | |
--- /dev/null | |
+++ b/runtime/linux/autoconf-user-access-begin-2-args.c | |
@@ -0,0 +1,6 @@ | |
+#include <linux/uaccess.h> | |
+ | |
+/* Compile probe: exercises the two-argument form | |
+ * user_access_begin(ptr, size). buildrun.cxx lists this file together | |
+ * with STAPCONF_USER_ACCESS_BEGIN_2_ARGS (presumably defined only when | |
+ * this file compiles -- confirm against output_autoconf()). | |
+ */ | |
+bool foo(const void __user *ptr, size_t size) | |
+{ | |
+ return user_access_begin(ptr, size); | |
+} | |
diff --git a/runtime/linux/autoconf-user-access-begin-3-args.c b/runtime/linux/autoconf-user-access-begin-3-args.c | |
new file mode 100644 | |
index 000000000..2942ddf08 | |
--- /dev/null | |
+++ b/runtime/linux/autoconf-user-access-begin-3-args.c | |
@@ -0,0 +1,6 @@ | |
+#include <linux/uaccess.h> | |
+ | |
+/* Compile probe: exercises the three-argument form | |
+ * user_access_begin(mode, ptr, size). buildrun.cxx lists this file | |
+ * together with STAPCONF_USER_ACCESS_BEGIN_3_ARGS (presumably defined | |
+ * only when this file compiles -- confirm against output_autoconf()). | |
+ */ | |
+bool foo(int mode, const void __user *ptr, size_t size) | |
+{ | |
+ return user_access_begin(mode, ptr, size); | |
+} | |
diff --git a/runtime/linux/autoconf-user-access-end.c b/runtime/linux/autoconf-user-access-end.c | |
new file mode 100644 | |
index 000000000..4eafb8222 | |
--- /dev/null | |
+++ b/runtime/linux/autoconf-user-access-end.c | |
@@ -0,0 +1,6 @@ | |
+#include <linux/uaccess.h> | |
+ | |
+/* Compile probe: exercises a zero-argument user_access_end() call. | |
+ * buildrun.cxx lists this file together with STAPCONF_USER_ACCESS_END | |
+ * (presumably defined only when this file compiles -- confirm against | |
+ * output_autoconf()). | |
+ */ | |
+void foo(void) | |
+{ | |
+ user_access_end(); | |
+} | |
diff --git a/runtime/linux/copy.c b/runtime/linux/copy.c | |
index 9a96d5951..c9862b06e 100644 | |
--- a/runtime/linux/copy.c | |
+++ b/runtime/linux/copy.c | |
@@ -62,10 +62,9 @@ static long _stp_strncpy_from_user(char *dst, const char __user *src, | |
static unsigned long _stp_copy_from_user(char *dst, const char __user *src, unsigned long count) | |
{ | |
if (count) { | |
-#ifdef STAPCONF_SET_FS | |
- mm_segment_t _oldfs = get_fs(); | |
- set_fs(USER_DS); | |
-#endif | |
+ stp_mm_segment_t oldfs; | |
+ if (!stp_user_access_begin(VERIFY_READ, src, count, &oldfs, STP_USER_DS)) | |
+ goto done; | |
pagefault_disable(); | |
if (!lookup_bad_addr(VERIFY_READ, (const unsigned long)src, count, STP_USER_DS)) | |
count = __copy_from_user_inatomic(dst, src, count); | |
@@ -75,10 +74,9 @@ static unsigned long _stp_copy_from_user(char *dst, const char __user *src, unsi | |
* can't trust 'count' to be reasonable. */ | |
count = -EFAULT; | |
pagefault_enable(); | |
-#ifdef STAPCONF_SET_FS | |
- set_fs(_oldfs); | |
-#endif | |
+ stp_user_access_end(oldfs); | |
} | |
+done: | |
return count; | |
} | |
diff --git a/runtime/linux/loc2c-runtime.h b/runtime/linux/loc2c-runtime.h | |
index f6c9accec..80252ad80 100644 | |
--- a/runtime/linux/loc2c-runtime.h | |
+++ b/runtime/linux/loc2c-runtime.h | |
@@ -18,6 +18,9 @@ | |
#include <asm/uaccess.h> | |
#endif | |
#include <linux/types.h> | |
+#ifdef STAPCONF_ASM_TLBFLUSH_H | |
+#include <asm/tlbflush.h> | |
+#endif | |
#define intptr_t long | |
#define uintptr_t unsigned long | |
@@ -446,7 +449,7 @@ typedef typeof(&copy_to_kernel_nofault) copy_to_kernel_nofault_fn; | |
* that the kernel doesn't pagefault while reading. | |
*/ | |
-static inline int __stp_deref_nocheck_(u64 *pv, size_t size, | |
+static __always_inline int __stp_deref_nocheck_(u64 *pv, size_t size, | |
void *addr, stp_mm_segment_t seg) | |
{ | |
u64 v = 0; | |
@@ -492,6 +495,77 @@ static inline int __stp_deref_nocheck_(u64 *pv, size_t size, | |
}) | |
+/* | |
+ * stp_is_user_ds(): tell whether seg is the user-space segment | |
+ * | |
+ * Returns true when seg equals STP_USER_DS. With STP_NUMERICAL_DS | |
+ * defined the two are compared with ==; otherwise they are compared | |
+ * bytewise with memcmp() (presumably because stp_mm_segment_t may then | |
+ * be a type that == cannot compare -- TODO confirm). | |
+ */ | |
+static __always_inline bool | |
+stp_is_user_ds(stp_mm_segment_t seg) | |
+{ | |
+#ifdef STP_NUMERICAL_DS | |
+ return seg == STP_USER_DS; | |
+#else | |
+ stp_mm_segment_t user_seg = STP_USER_DS; | |
+ return memcmp(&seg, &user_seg, sizeof(stp_mm_segment_t)) == 0; | |
+#endif | |
+} | |
+ | |
+/* | |
+ * stp_user_access_begin(): open a user-memory access window | |
+ * | |
+ * type, ptr and size describe the intended access; they are only | |
+ * forwarded to user_access_begin() and are unused on the set_fs() path. | |
+ * | |
+ * - STAPCONF_SET_FS: saves get_fs() into *oldfs, switches to seg and | |
+ * always reports success. | |
+ * - STAPCONF_USER_ACCESS_BEGIN_3_ARGS / _2_ARGS: returns the result of | |
+ * user_access_begin(); *oldfs and seg are not touched. | |
+ * - otherwise: does nothing and reports success. | |
+ * | |
+ * Returns true when the caller may go ahead with the access. | |
+ */ | |
+static __always_inline bool | |
+stp_user_access_begin(int type, const void *ptr, size_t size, | |
+ stp_mm_segment_t *oldfs, stp_mm_segment_t seg) | |
+{ | |
+#ifdef STAPCONF_SET_FS | |
+ *oldfs = get_fs(); | |
+ set_fs(seg); | |
+ return 1; | |
+#elif defined(STAPCONF_USER_ACCESS_BEGIN_3_ARGS) | |
+ return user_access_begin(type, ptr, size); | |
+#elif defined(STAPCONF_USER_ACCESS_BEGIN_2_ARGS) | |
+ return user_access_begin(ptr, size); | |
+#else | |
+ /* for very old kernels */ | |
+ return 1; | |
+#endif | |
+} | |
+ | |
+/* | |
+ * stp_user_access_end(): close a user-memory access window | |
+ * | |
+ * - STAPCONF_SET_FS: restores the segment passed in oldfs via set_fs(). | |
+ * - STAPCONF_USER_ACCESS_END: calls user_access_end(); oldfs is unused. | |
+ * - otherwise: does nothing. | |
+ */ | |
+static __always_inline void | |
+stp_user_access_end(stp_mm_segment_t oldfs) | |
+{ | |
+#ifdef STAPCONF_SET_FS | |
+ set_fs(oldfs); | |
+#elif defined(STAPCONF_USER_ACCESS_END) | |
+ user_access_end(); | |
+#else | |
+ /* do nothing for very old kernels */ | |
+#endif | |
+} | |
+ | |
+/* | |
+ * stp_mem_access_begin(): open an access window for user or kernel memory | |
+ * | |
+ * Stores into *is_user_ptr whether seg is the user segment (per | |
+ * stp_is_user_ds()) so the caller can hand it to stp_mem_access_end(). | |
+ * User accesses are delegated to stp_user_access_begin(). Kernel | |
+ * accesses need no window; on STAPCONF_SET_FS kernels the current | |
+ * segment is saved into *oldfs and seg is installed, otherwise nothing | |
+ * is done. | |
+ * | |
+ * Returns true when the caller may go ahead with the access; kernel | |
+ * accesses always succeed here. | |
+ */ | |
+static __always_inline bool | |
+stp_mem_access_begin(int type, const void *ptr, size_t size, | |
+ stp_mm_segment_t *oldfs, stp_mm_segment_t seg, bool *is_user_ptr) | |
+{ | |
+ bool is_user = stp_is_user_ds(seg); | |
+ *is_user_ptr = is_user; | |
+ if (is_user) | |
+ return stp_user_access_begin(type, ptr, size, oldfs, seg); | |
+ | |
+ /* for kernel memory accesses */ | |
+ | |
+#ifdef STAPCONF_SET_FS | |
+ *oldfs = get_fs(); | |
+ /* Install the requested segment too, mirroring the user path; | |
+ * saving alone would leave the access running under whatever | |
+ * segment happened to be active. */ | |
+ set_fs(seg); | |
+#endif | |
+ return 1; | |
+} | |
+ | |
+/* | |
+ * stp_mem_access_end(): close a window opened by stp_mem_access_begin() | |
+ * | |
+ * oldfs and is_user are the values stp_mem_access_begin() handed back. | |
+ * User windows are closed through stp_user_access_end(); for kernel | |
+ * accesses the saved segment is restored on STAPCONF_SET_FS kernels and | |
+ * nothing is done otherwise. | |
+ */ | |
+static __always_inline void | |
+stp_mem_access_end(stp_mm_segment_t oldfs, bool is_user) | |
+{ | |
+ if (is_user) { | |
+ /* Plain call plus bare return: ISO C does not allow | |
+ * `return expr;` in a function returning void. */ | |
+ stp_user_access_end(oldfs); | |
+ return; | |
+ } | |
+ | |
+ /* not for userland */ | |
+ | |
+#ifdef STAPCONF_SET_FS | |
+ set_fs(oldfs); | |
+#endif | |
+} | |
+ | |
/* | |
* _stp_lookup_bad_addr(): safely verify an address | |
* | |
@@ -507,7 +581,7 @@ static inline int __stp_deref_nocheck_(u64 *pv, size_t size, | |
* memory. | |
*/ | |
-static inline int _stp_lookup_bad_addr_(int type, size_t size, | |
+static __always_inline int _stp_lookup_bad_addr_(int type, size_t size, | |
uintptr_t addr, stp_mm_segment_t seg) | |
{ | |
int bad; | |
@@ -545,25 +619,34 @@ static inline int _stp_lookup_bad_addr_(int type, size_t size, | |
* pagefault when trying to read the memory. | |
*/ | |
-static inline int _stp_deref_nofault_(u64 *pv, size_t size, void *addr, | |
+static __always_inline int _stp_deref_nofault_(u64 *pv, size_t size, void *addr, | |
stp_mm_segment_t seg) | |
{ | |
int r = -EFAULT; | |
-#ifdef STAPCONF_SET_FS | |
- mm_segment_t oldfs = get_fs(); | |
- | |
- set_fs(seg); | |
-#endif | |
pagefault_disable(); | |
if (lookup_bad_addr(VERIFY_READ, (uintptr_t)addr, size, seg)) | |
r = -EFAULT; | |
- else | |
+ else { | |
+ stp_mm_segment_t oldfs; | |
+ bool is_user; | |
+ | |
+ /* NB just to suppress -Werror=maybe-uninitialized warnings from older | |
+ * GCC like version 8.3 with -O3 */ | |
+#ifdef STP_NUMERICAL_DS | |
+ oldfs = 0; | |
+#else | |
+ memset(&oldfs, 0, sizeof(stp_mm_segment_t)); | |
+#endif | |
+ | |
+ /* If the access window is refused, bail out with r still holding | |
+ * the -EFAULT it was initialized with. */ | |
+ if (!stp_mem_access_begin(VERIFY_READ, addr, size, &oldfs, seg, &is_user)) | |
+ goto done; | |
+ | |
r = __stp_deref_nocheck_(pv, size, addr, seg); | |
- pagefault_enable(); | |
-#ifdef STAPCONF_SET_FS | |
- set_fs(oldfs); | |
-#endif | |
+ stp_mem_access_end(oldfs, is_user); | |
+ } | |
+ /* Every path, including the refused-window one, re-enables pagefaults. */ | |
+done: | |
+ pagefault_enable(); | |
return r; | |
} | |
@@ -691,21 +774,22 @@ static inline int _stp_store_deref_(size_t size, void *addr, u64 v, | |
stp_mm_segment_t seg) | |
{ | |
int r; | |
-#ifdef STAPCONF_SET_FS | |
- mm_segment_t oldfs = get_fs(); | |
- | |
- set_fs(seg); | |
-#endif | |
+ stp_mm_segment_t oldfs; | |
+ bool is_user; | |
pagefault_disable(); | |
- if (lookup_bad_addr(VERIFY_READ, (uintptr_t)addr, size, seg)) | |
+ if (lookup_bad_addr(VERIFY_WRITE, (uintptr_t)addr, size, seg)) { | |
r = -EFAULT; | |
- else | |
+ } else { | |
+ if (!stp_mem_access_begin(VERIFY_WRITE, addr, size, &oldfs, seg, &is_user)) { | |
+ r = -EFAULT; | |
+ goto done; | |
+ } | |
+ | |
r = __stp_store_deref_nocheck_(size, addr, v, seg); | |
+ stp_mem_access_end(oldfs, is_user); | |
+ } | |
+done: | |
pagefault_enable(); | |
-#ifdef STAPCONF_SET_FS | |
- set_fs(oldfs); | |
-#endif | |
- | |
return r; | |
} | |
@@ -857,16 +941,16 @@ static inline long _stp_deref_string_nofault(char *dst, const char *addr, | |
{ | |
int err = 0; | |
size_t i = 0; | |
-#ifdef STAPCONF_SET_FS | |
- mm_segment_t oldfs = get_fs(); | |
- | |
- set_fs(seg); | |
-#endif | |
+ stp_mm_segment_t oldfs; | |
+ bool is_user; | |
pagefault_disable(); | |
if (lookup_bad_addr(VERIFY_READ, (uintptr_t)addr, len, seg)) | |
err = 1; | |
else | |
{ | |
+ if (!stp_mem_access_begin(VERIFY_READ, addr, len, &oldfs, seg, &is_user)) | |
+ goto done; | |
+ | |
/* Reduce len by 1 to leave room for '\0' terminator. */ | |
for (i = 0; i + 1 < len; ++i) | |
{ | |
@@ -879,12 +963,11 @@ static inline long _stp_deref_string_nofault(char *dst, const char *addr, | |
} | |
if (!err && dst) | |
*dst = '\0'; | |
+ | |
+ stp_mem_access_end(oldfs, is_user); | |
} | |
+done: | |
pagefault_enable(); | |
-#ifdef STAPCONF_SET_FS | |
- set_fs(oldfs); | |
-#endif | |
- | |
return err ? -EFAULT : i; | |
} | |
@@ -915,15 +998,19 @@ static inline int _stp_store_deref_string_(char *src, void *addr, size_t len, | |
{ | |
int err = 0; | |
size_t i; | |
-#ifdef STAPCONF_SET_FS | |
- mm_segment_t oldfs = get_fs(); | |
+ stp_mm_segment_t oldfs; | |
+ bool is_user; | |
- set_fs(seg); | |
-#endif | |
pagefault_disable(); | |
- if (lookup_bad_addr(VERIFY_WRITE, (uintptr_t)addr, len, seg)) | |
+ if (lookup_bad_addr(VERIFY_WRITE, (uintptr_t)addr, len, seg)) { | |
err = 1; | |
- else if (len > 0) | |
+ goto done; | |
+ } | |
+ | |
+ if (!stp_mem_access_begin(VERIFY_WRITE, addr, len, &oldfs, seg, &is_user)) | |
+ goto done; | |
+ | |
+ if (len > 0) | |
{ | |
for (i = 0; i < len - 1; ++i) | |
{ | |
@@ -933,12 +1020,12 @@ static inline int _stp_store_deref_string_(char *src, void *addr, size_t len, | |
} | |
err = __stp_put_either('\0', (u8 *)addr + i, seg); | |
} | |
- out: | |
+ | |
+out: | |
+ stp_mem_access_end(oldfs, is_user); | |
+ | |
+ done: | |
pagefault_enable(); | |
-#ifdef STAPCONF_SET_FS | |
- set_fs(oldfs); | |
-#endif | |
- | |
return err; | |
} | |
@@ -955,28 +1042,30 @@ static inline int _stp_store_deref_bin_string_(char *src, void *addr, size_t len | |
{ | |
int err = 0; | |
size_t i; | |
-#ifdef STAPCONF_SET_FS | |
- mm_segment_t oldfs = get_fs(); | |
- set_fs(seg); | |
-#endif | |
pagefault_disable(); | |
if (lookup_bad_addr(VERIFY_WRITE, (uintptr_t)addr, len, seg)) | |
err = 1; | |
else if (len > 0) | |
{ | |
+ stp_mm_segment_t oldfs; | |
+ bool is_user; | |
+ | |
+ /* This is a store, so request write access (matching the | |
+ * lookup_bad_addr() check above), and report a refused access | |
+ * as an error instead of returning 0 with nothing written. */ | |
+ if (!stp_mem_access_begin(VERIFY_WRITE, addr, len, &oldfs, seg, &is_user)) | |
+ { | |
+ err = 1; | |
+ goto done; | |
+ } | |
+ | |
for (i = 0; i < len; ++i) | |
{ | |
err = __stp_put_either(*src++, (u8 *)addr + i, seg); | |
if (err) | |
goto out; | |
} | |
+ | |
+ /* Reached on success and on a failed store alike, so the access | |
+ * window is always closed once it has been opened. */ | |
+ out: | |
+ stp_mem_access_end(oldfs, is_user); | |
} | |
- out: | |
+done: | |
pagefault_enable(); | |
-#ifdef STAPCONF_SET_FS | |
- set_fs(oldfs); | |
-#endif | |
return err; | |
} | |
diff --git a/runtime/linux/runtime.h b/runtime/linux/runtime.h | |
index eb26461b0..d3ff9b566 100644 | |
--- a/runtime/linux/runtime.h | |
+++ b/runtime/linux/runtime.h | |
@@ -59,6 +59,7 @@ | |
#define stp_mm_segment_t unsigned long | |
#define STP_KERNEL_DS 0 | |
#define STP_USER_DS 1 | |
+#define STP_NUMERICAL_DS 1 | |
#define MM_SEG_IS_KERNEL(seg) ((seg)==STP_KERNEL_DS) | |
// Required for kernel write operations: |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment