kerneltoast/0001-task_finder_vma-rewrite-using-RCU-to-fix-performance.patch Secret

## 0001-task_finder_vma-rewrite-using-RCU-to-fix-performance.patch
From f2de1ee1d341bb7ba6288474fff7ad2ca4de9a1d Mon Sep 17 00:00:00 2001
From: Sultan Alsawaf <sultan@openresty.com>
Date: Tue, 20 Oct 2020 08:55:24 -0700
Subject: [PATCH] task_finder_vma: rewrite using RCU to fix performance issues

The use of a single global rwlock to protect this file's hash table
results in significantly degraded performance when many processes are
using the vma tracker at the same time, because a lot of time is spent
spinning on the rwlock.

To remedy this, make the hash table RCU safe so we'll never block upon
reading a hash list.

Another change made to improve performance is using the modulo of the
jhash rather than extracting what we need via bitwise AND, to improve
the distribution of hashes across the hash table. The task pointers
themselves are now hashed instead of their PIDs for reliability, since
a PID is not a stable anchor point to a task struct.

While we're at it, clean up the rest of this file to bring it up to
current Linux kernel coding standards as well.
---
 runtime/linux/runtime.h   |   2 +
 runtime/task_finder_vma.c | 448 +++++++++++++++++++-------------------
 2 files changed, 223 insertions(+), 227 deletions(-)

diff --git a/runtime/linux/runtime.h b/runtime/linux/runtime.h
index 07850c345..8e1ae2c42 100644
--- a/runtime/linux/runtime.h
+++ b/runtime/linux/runtime.h
@@ -89,9 +89,11 @@ static void _stp_exit(void);

 #ifdef STAPCONF_HLIST_4ARGS
 #define stap_hlist_for_each_entry(a,b,c,d) hlist_for_each_entry(a,b,c,d)
+#define stap_hlist_for_each_entry_rcu(a,b,c,d) hlist_for_each_entry_rcu(a,b,c,d)
 #define stap_hlist_for_each_entry_safe(a,b,c,d,e) hlist_for_each_entry_safe(a,b,c,d,e)
 #else
 #define stap_hlist_for_each_entry(a,b,c,d) (void) b; hlist_for_each_entry(a,c,d)
+#define stap_hlist_for_each_entry_rcu(a,b,c,d) (void) b; hlist_for_each_entry_rcu(a,c,d)
 #define stap_hlist_for_each_entry_safe(a,b,c,d,e) (void) b; hlist_for_each_entry_safe(a,c,d,e)
 #endif

diff --git a/runtime/task_finder_vma.c b/runtime/task_finder_vma.c
index b485e5b99..4dc2dc07d 100644
--- a/runtime/task_finder_vma.c
+++ b/runtime/task_finder_vma.c
@@ -10,14 +10,44 @@

 #include "stp_helper_lock.h"

-// __stp_tf_vma_lock protects the hash table.
-// Documentation/spinlocks.txt suggest we can be a bit more clever
-// if we guarantee that in interrupt context we only read, not write
-// the datastructures. We should never change the hash table or the
-// contents in interrupt context (which should only ever call
-// stap_find_vma_map_info for getting stored vma info). So we might
-// want to look into that if this seems a bottleneck.
-static STP_DEFINE_RWLOCK(__stp_tf_vma_lock);
+/* atomic_try_cmpxchg and atomic_fetch_add_unless fallback from newer kernels */
+#ifndef atomic_try_cmpxchg
+static __always_inline bool
+atomic_try_cmpxchg(atomic_t *v, int *old, int new)
+{
+	int r, o = *old;
+	r = atomic_cmpxchg(v, o, new);
+	if (unlikely(r != o))
+		*old = r;
+	return likely(r == o);
+}
+#define atomic_try_cmpxchg atomic_try_cmpxchg
+#endif
+
+#ifndef atomic_fetch_add_unless
+/**
+ * atomic_fetch_add_unless - add unless the number is already a given value
+ * @v: pointer of type atomic_t
+ * @a: the amount to add to v...
+ * @u: ...unless v is equal to u.
+ *
+ * Atomically adds @a to @v, so long as @v was not already @u.
+ * Returns original value of @v
+ */
+static __always_inline int
+atomic_fetch_add_unless(atomic_t *v, int a, int u)
+{
+	int c = atomic_read(v);
+
+	do {
+		if (unlikely(c == u))
+			break;
+	} while (!atomic_try_cmpxchg(v, &c, c + a));
+
+	return c;
+}
+#define atomic_fetch_add_unless atomic_fetch_add_unless
+#endif

 #define __STP_TF_HASH_BITS 4
 #define __STP_TF_TABLE_SIZE (1 << __STP_TF_HASH_BITS)
@@ -28,20 +58,26 @@ static STP_DEFINE_RWLOCK(__stp_tf_vma_lock);
 #error "gimme a little more TASK_FINDER_VMA_ENTRY_PATHLEN"
 #endif

-
 struct __stp_tf_vma_entry {
 	struct hlist_node hlist;

-	pid_t pid;
+	struct rcu_head rcu;
+	atomic_t refcount;
+	struct task_struct *tsk;
 	unsigned long vm_start;
 	unsigned long vm_end;
-        char path[TASK_FINDER_VMA_ENTRY_PATHLEN]; /* mmpath name, if known */
+	char path[TASK_FINDER_VMA_ENTRY_PATHLEN]; /* mmpath name, if known */

 	// User data (possibly stp_module)
 	void *user;
 };

-static struct hlist_head *__stp_tf_vma_map;
+struct __stp_tf_vma_bucket {
+	struct hlist_head head;
+	spinlock_t lock;
+};
+
+static struct __stp_tf_vma_bucket *__stp_tf_vma_map;

 // __stp_tf_vma_new_entry(): Returns an newly allocated or NULL.
 // Must only be called from user context.
@@ -51,22 +87,38 @@ static struct __stp_tf_vma_entry *
 __stp_tf_vma_new_entry(void)
 {
 	struct __stp_tf_vma_entry *entry;
-	size_t size = sizeof (struct __stp_tf_vma_entry);
+	// Alloc using kmalloc rather than the stp variant. This way the RCU
+	// callback freeing the entries will not depend on using a function
+	// within this module to free the allocated memory (_stp_kfree), which
+	// lets us omit a costly rcu_barrier operation upon module unload.
 #ifdef CONFIG_UTRACE
-	entry = (struct __stp_tf_vma_entry *) _stp_kmalloc_gfp(size,
-                                                         STP_ALLOC_SLEEP_FLAGS);
+	entry = kmalloc(sizeof(*entry), STP_ALLOC_SLEEP_FLAGS);
 #else
-	entry = (struct __stp_tf_vma_entry *) _stp_kmalloc_gfp(size,
-                                                               STP_ALLOC_FLAGS);
+	entry = kmalloc(sizeof(*entry), STP_ALLOC_FLAGS);
 #endif
 	return entry;
 }

-// __stp_tf_vma_release_entry(): Frees an entry.
+// __stp_tf_vma_put_entry(): Put a specified number of references on the entry.
 static void
-__stp_tf_vma_release_entry(struct __stp_tf_vma_entry *entry)
+__stp_tf_vma_put_entry(struct __stp_tf_vma_bucket *bucket,
+		       struct __stp_tf_vma_entry *entry, int count)
 {
-	_stp_kfree (entry);
+	unsigned long flags;
+	int old;
+
+	// We must atomically subtract only if the refcount is non-zero, as well
+	// as check to see if the new refcount is zero, in which case we should
+	// free the entry.
+	old = atomic_fetch_add_unless(&entry->refcount, -count, 0);
+	if (old - count)
+		return;
+
+	spin_lock_irqsave(&bucket->lock, flags);
+	hlist_del_rcu(&entry->hlist);
+	spin_unlock_irqrestore(&bucket->lock, flags);
+
+	kfree_rcu(entry, rcu);
 }

 // stap_initialize_vma_map():  Initialize the free list.  Grabs the
@@ -77,145 +129,127 @@ __stp_tf_vma_release_entry(struct __stp_tf_vma_entry *entry)
 static int
 stap_initialize_vma_map(void)
 {
-	size_t size = sizeof(struct hlist_head) * __STP_TF_TABLE_SIZE;
-	struct hlist_head *map = (struct hlist_head *) _stp_kzalloc_gfp(size,
-							STP_ALLOC_SLEEP_FLAGS);
-	if (map == NULL)
+	struct __stp_tf_vma_bucket *buckets;
+	int i;
+
+	buckets = _stp_kmalloc_gfp(sizeof(*buckets) * __STP_TF_TABLE_SIZE,
+				   STP_ALLOC_SLEEP_FLAGS);
+	if (!buckets)
 		return -ENOMEM;

-	__stp_tf_vma_map = map;
+	for (i = 0; i < __STP_TF_TABLE_SIZE; i++) {
+		struct __stp_tf_vma_bucket *bucket = &buckets[i];
+
+		INIT_HLIST_HEAD(&bucket->head);
+		spin_lock_init(&bucket->lock);
+	}
+
+	__stp_tf_vma_map = buckets;
 	return 0;
 }

 // stap_destroy_vma_map(): Unconditionally destroys vma entries.
-// Nothing should be using it anymore. Doesn't lock anything and just
-// frees all items.
+// Nothing should be using it anymore.
 static void
 stap_destroy_vma_map(void)
 {
-	if (__stp_tf_vma_map != NULL) {
-		int i;
-		for (i = 0; i < __STP_TF_TABLE_SIZE; i++) {
-			struct hlist_head *head = &__stp_tf_vma_map[i];
-			struct hlist_node *node;
-			struct hlist_node *n;
-			struct __stp_tf_vma_entry *entry = NULL;
-
-			if (hlist_empty(head))
-				continue;
-
-		        stap_hlist_for_each_entry_safe(entry, node, n, head, hlist) {
-				hlist_del(&entry->hlist);
-				__stp_tf_vma_release_entry(entry);
-			}
-		}
-		_stp_kfree(__stp_tf_vma_map);
+	int i;
+
+	if (!__stp_tf_vma_map)
+		return;
+
+	for (i = 0; i < __STP_TF_TABLE_SIZE; i++) {
+		struct __stp_tf_vma_bucket *bucket = &__stp_tf_vma_map[i];
+		struct __stp_tf_vma_entry *entry;
+		struct hlist_node *node;
+
+		rcu_read_lock();
+		stap_hlist_for_each_entry_rcu(entry, node, &bucket->head, hlist)
+			__stp_tf_vma_put_entry(bucket, entry, 1);
+		rcu_read_unlock();
 	}
-}

+	_stp_kfree(__stp_tf_vma_map);
+}

 // __stp_tf_vma_map_hash(): Compute the vma map hash.
 static inline u32
 __stp_tf_vma_map_hash(struct task_struct *tsk)
 {
-    return (jhash_1word(tsk->pid, 0) & (__STP_TF_TABLE_SIZE - 1));
-}
-
-// Get vma_entry if the vma is present in the vma map hash table.
-// Returns NULL if not present.  The __stp_tf_vma_lock must be read locked
-// before calling this function.
-static struct __stp_tf_vma_entry *
-__stp_tf_get_vma_map_entry_internal(struct task_struct *tsk,
-				    unsigned long vm_start)
-{
-	struct hlist_head *head;
-	struct hlist_node *node;
-	struct __stp_tf_vma_entry *entry;
+	u32 hash = jhash2((u32 *)&tsk, sizeof(tsk) / sizeof(u32), 0);

-	head = &__stp_tf_vma_map[__stp_tf_vma_map_hash(tsk)];
-	stap_hlist_for_each_entry(entry, node, head, hlist) {
-		if (tsk->pid == entry->pid
-		    && vm_start == entry->vm_start) {
-			return entry;
-		}
-	}
-	return NULL;
+	return hash % (__STP_TF_TABLE_SIZE - 1);
 }

-// Get vma_entry if the vma with the given vm_end is present in the vma map
-// hash table for the tsk.  Returns NULL if not present.
-// The __stp_tf_vma_lock must be read locked before calling this function.
-static struct __stp_tf_vma_entry *
-__stp_tf_get_vma_map_entry_end_internal(struct task_struct *tsk,
-					unsigned long vm_end)
+// __stp_tf_get_vma_bucket(): Get the bucket that should contain the task.
+static inline struct __stp_tf_vma_bucket *
+__stp_tf_get_vma_bucket(struct task_struct *tsk)
 {
-	struct hlist_head *head;
-	struct hlist_node *node;
-	struct __stp_tf_vma_entry *entry;
-
-	head = &__stp_tf_vma_map[__stp_tf_vma_map_hash(tsk)];
-	stap_hlist_for_each_entry(entry, node, head, hlist) {
-		if (tsk->pid == entry->pid
-		    && vm_end == entry->vm_end) {
-			return entry;
-		}
-	}
-	return NULL;
+	return &__stp_tf_vma_map[__stp_tf_vma_map_hash(tsk)];
 }

+// Get vma entry if the vma is present in the vma map hash table satisfying the
+// given condition.
+#define __stp_tf_get_vma_map(bucket, tsk, acquire, condition) \
+({										\
+	struct __stp_tf_vma_entry *entry, *found = NULL;			\
+	struct hlist_node *node;						\
+										\
+	rcu_read_lock();							\
+	stap_hlist_for_each_entry_rcu(entry, node, &bucket->head, hlist) {	\
+		if (entry->tsk == tsk && (condition) &&				\
+		    atomic_add_unless(&entry->refcount, acquire, 0)) {		\
+			found = entry;						\
+			break;							\
+		}								\
+	}									\
+	rcu_read_unlock();							\
+										\
+	found;									\
+})

 // Add the vma info to the vma map hash table.
 // Caller is responsible for name lifetime.
 // Can allocate memory, so needs to be called
 // only from user context.
 static int
-stap_add_vma_map_info(struct task_struct *tsk,
-		      unsigned long vm_start, unsigned long vm_end,
-		      const char *path, void *user)
+stap_add_vma_map_info(struct task_struct *tsk, unsigned long vm_start,
+		      unsigned long vm_end, const char *path, void *user)
 {
-	struct hlist_head *head;
-	struct hlist_node *node;
+	struct __stp_tf_vma_bucket *bucket = __stp_tf_get_vma_bucket(tsk);
 	struct __stp_tf_vma_entry *entry;
-	struct __stp_tf_vma_entry *new_entry;
+	struct hlist_node *node;
 	unsigned long flags;
+	size_t path_len;

-	// Take a write lock, since we are most likely going to write
-	// after reading. But reserve a new entry first outside the lock.
-	new_entry = __stp_tf_vma_new_entry();
-	stp_write_lock_irqsave(&__stp_tf_vma_lock, flags);
-	entry = __stp_tf_get_vma_map_entry_internal(tsk, vm_start);
-	if (entry != NULL) {
-		stp_write_unlock_irqrestore(&__stp_tf_vma_lock, flags);
-		if (new_entry)
-			__stp_tf_vma_release_entry(new_entry);
-		return -EBUSY;	/* Already there */
-	}
+	// Check if the entry already exists
+	if (__stp_tf_get_vma_map(bucket, tsk, 0, entry->vm_start == vm_start))
+		return -EEXIST;

-	if (!new_entry) {
-		stp_write_unlock_irqrestore(&__stp_tf_vma_lock, flags);
+	entry = __stp_tf_vma_new_entry();
+	if (!entry)
 		return -ENOMEM;
-	}

-	// Fill in the info
-	entry = new_entry;
-	entry->pid = tsk->pid;
+	// Fill in the new entry
+	entry->refcount = (atomic_t)ATOMIC_INIT(1);
+	entry->tsk = tsk;
 	entry->vm_start = vm_start;
 	entry->vm_end = vm_end;
-        if (strlen(path) >= TASK_FINDER_VMA_ENTRY_PATHLEN-3)
-          {
-            strlcpy (entry->path, "...", TASK_FINDER_VMA_ENTRY_PATHLEN);
-            strlcpy (entry->path+3, &path[strlen(path)-TASK_FINDER_VMA_ENTRY_PATHLEN+4],
-                     TASK_FINDER_VMA_ENTRY_PATHLEN-3);
-          }
-        else
-          {
-            strlcpy (entry->path, path, TASK_FINDER_VMA_ENTRY_PATHLEN);
-          }
 	entry->user = user;

-	head = &__stp_tf_vma_map[__stp_tf_vma_map_hash(tsk)];
-	hlist_add_head(&entry->hlist, head);
-	stp_write_unlock_irqrestore(&__stp_tf_vma_lock, flags);
+	path_len = strlen(path);
+	if (path_len >= TASK_FINDER_VMA_ENTRY_PATHLEN - 3) {
+		strlcpy(entry->path, "...", TASK_FINDER_VMA_ENTRY_PATHLEN);
+		strlcpy(entry->path + 3,
+			&path[path_len - TASK_FINDER_VMA_ENTRY_PATHLEN + 4],
+			TASK_FINDER_VMA_ENTRY_PATHLEN - 3);
+	} else {
+		strlcpy(entry->path, path, TASK_FINDER_VMA_ENTRY_PATHLEN);
+	}
+
+	spin_lock_irqsave(&bucket->lock, flags);
+	hlist_add_head_rcu(&entry->hlist, &bucket->head);
+	spin_unlock_irqrestore(&bucket->lock, flags);
 	return 0;
 }

@@ -224,26 +258,19 @@ stap_add_vma_map_info(struct task_struct *tsk,
 // task. Returns zero on success, -ESRCH if no existing matching entry could
 // be found.
 static int
-stap_extend_vma_map_info(struct task_struct *tsk,
-			 unsigned long vm_start, unsigned long vm_end)
+stap_extend_vma_map_info(struct task_struct *tsk, unsigned long vm_start,
+			 unsigned long vm_end)
 {
-	struct hlist_head *head;
-	struct hlist_node *node;
+	struct __stp_tf_vma_bucket *bucket = __stp_tf_get_vma_bucket(tsk);
 	struct __stp_tf_vma_entry *entry;

-	unsigned long flags;
-	int res = -ESRCH; // Entry not there or doesn't match.
-
-	// Take a write lock, since we are most likely going to write
-	// to the entry after reading, if its vm_end matches our vm_start.
-	stp_write_lock_irqsave(&__stp_tf_vma_lock, flags);
-	entry = __stp_tf_get_vma_map_entry_end_internal(tsk, vm_start);
-	if (entry != NULL) {
-		entry->vm_end = vm_end;
-		res = 0;
-	}
-	stp_write_unlock_irqrestore(&__stp_tf_vma_lock, flags);
-	return res;
+	entry = __stp_tf_get_vma_map(bucket, tsk, 1, entry->vm_end == vm_start);
+	if (!entry)
+		return -ESRCH;
+
+	entry->vm_end = vm_end;
+	__stp_tf_vma_put_entry(bucket, entry, 1);
+	return 0;
 }


@@ -252,128 +279,95 @@ stap_extend_vma_map_info(struct task_struct *tsk,
 static int
 stap_remove_vma_map_info(struct task_struct *tsk, unsigned long vm_start)
 {
-	struct hlist_head *head;
-	struct hlist_node *node;
+	struct __stp_tf_vma_bucket *bucket = __stp_tf_get_vma_bucket(tsk);
 	struct __stp_tf_vma_entry *entry;
-	int rc = -ESRCH;

-	// Take a write lock since we are most likely going to delete
-	// after reading.
-	unsigned long flags;
-	stp_write_lock_irqsave(&__stp_tf_vma_lock, flags);
-	entry = __stp_tf_get_vma_map_entry_internal(tsk, vm_start);
-	if (entry != NULL) {
-		hlist_del(&entry->hlist);
-		__stp_tf_vma_release_entry(entry);
-                rc = 0;
-	}
-	stp_write_unlock_irqrestore(&__stp_tf_vma_lock, flags);
-	return rc;
+	entry = __stp_tf_get_vma_map(bucket, tsk, 1, entry->vm_start == vm_start);
+	if (!entry)
+		return -ESRCH;
+
+	// Put two references: one for the reference we just got,
+	// and another to free the entry.
+	__stp_tf_vma_put_entry(bucket, entry, 2);
+	return 0;
 }

 // Finds vma info if the vma is present in the vma map hash table for
 // a given task and address (between vm_start and vm_end).
-// Returns -ESRCH if not present.  The __stp_tf_vma_lock must *not* be
-// locked before calling this function.
+// Returns -ESRCH if not present.
 static int
 stap_find_vma_map_info(struct task_struct *tsk, unsigned long addr,
 		       unsigned long *vm_start, unsigned long *vm_end,
 		       const char **path, void **user)
 {
-	struct hlist_head *head;
-	struct hlist_node *node;
+	struct __stp_tf_vma_bucket *bucket;
 	struct __stp_tf_vma_entry *entry;
-	struct __stp_tf_vma_entry *found_entry = NULL;
-	int rc = -ESRCH;
-	unsigned long flags;
-
-	if (__stp_tf_vma_map == NULL)
-		return rc;

-	stp_read_lock_irqsave(&__stp_tf_vma_lock, flags);
-	head = &__stp_tf_vma_map[__stp_tf_vma_map_hash(tsk)];
-	stap_hlist_for_each_entry(entry, node, head, hlist) {
-		if (tsk->pid == entry->pid
-		    && addr >= entry->vm_start
-		    && addr < entry->vm_end) {
-			found_entry = entry;
-			break;
-		}
-	}
-	if (found_entry != NULL) {
-		if (vm_start != NULL)
-			*vm_start = found_entry->vm_start;
-		if (vm_end != NULL)
-			*vm_end = found_entry->vm_end;
-		if (path != NULL)
-			*path = found_entry->path;
-		if (user != NULL)
-			*user = found_entry->user;
-		rc = 0;
-	}
-	stp_read_unlock_irqrestore(&__stp_tf_vma_lock, flags);
-	return rc;
+	if (!__stp_tf_vma_map)
+		return -ESRCH;
+
+	bucket = __stp_tf_get_vma_bucket(tsk);
+	entry = __stp_tf_get_vma_map(bucket, tsk, 1, addr >= entry->vm_start &&
+				     addr < entry->vm_end);
+	if (!entry)
+		return -ESRCH;
+
+	if (vm_start)
+		*vm_start = entry->vm_start;
+	if (vm_end)
+		*vm_end = entry->vm_end;
+	if (path)
+		*path = entry->path;
+	if (user)
+		*user = entry->user;
+
+	__stp_tf_vma_put_entry(bucket, entry, 1);
+	return 0;
 }

 // Finds vma info if the vma is present in the vma map hash table for
 // a given task with the given user handle.
-// Returns -ESRCH if not present.  The __stp_tf_vma_lock must *not* be
-// locked before calling this function.
+// Returns -ESRCH if not present.
 static int
 stap_find_vma_map_info_user(struct task_struct *tsk, void *user,
 			    unsigned long *vm_start, unsigned long *vm_end,
 			    const char **path)
 {
-	struct hlist_head *head;
-	struct hlist_node *node;
+	struct __stp_tf_vma_bucket *bucket;
 	struct __stp_tf_vma_entry *entry;
-	struct __stp_tf_vma_entry *found_entry = NULL;
-	int rc = -ESRCH;
-	unsigned long flags;

-	if (__stp_tf_vma_map == NULL)
-		return rc;
+	if (!__stp_tf_vma_map)
+		return -ESRCH;

-	stp_read_lock_irqsave(&__stp_tf_vma_lock, flags);
-	head = &__stp_tf_vma_map[__stp_tf_vma_map_hash(tsk)];
-	stap_hlist_for_each_entry(entry, node, head, hlist) {
-		if (tsk->pid == entry->pid
-		    && user == entry->user) {
-			found_entry = entry;
-			break;
-		}
-	}
-	if (found_entry != NULL) {
-		if (vm_start != NULL)
-			*vm_start = found_entry->vm_start;
-		if (vm_end != NULL)
-			*vm_end = found_entry->vm_end;
-		if (path != NULL)
-			*path = found_entry->path;
-		rc = 0;
-	}
-	stp_read_unlock_irqrestore(&__stp_tf_vma_lock, flags);
-	return rc;
+	bucket = __stp_tf_get_vma_bucket(tsk);
+	entry = __stp_tf_get_vma_map(bucket, tsk, 1, entry->user == user);
+	if (!entry)
+		return -ESRCH;
+
+	if (vm_start)
+		*vm_start = entry->vm_start;
+	if (vm_end)
+		*vm_end = entry->vm_end;
+	if (path)
+		*path = entry->path;
+
+	__stp_tf_vma_put_entry(bucket, entry, 1);
+	return 0;
 }

 static int
 stap_drop_vma_maps(struct task_struct *tsk)
 {
-	struct hlist_head *head;
-	struct hlist_node *node;
-	struct hlist_node *n;
+	struct __stp_tf_vma_bucket *bucket = __stp_tf_get_vma_bucket(tsk);
 	struct __stp_tf_vma_entry *entry;
+	struct hlist_node *node;

-	unsigned long flags;
-	stp_write_lock_irqsave(&__stp_tf_vma_lock, flags);
-	head = &__stp_tf_vma_map[__stp_tf_vma_map_hash(tsk)];
-        stap_hlist_for_each_entry_safe(entry, node, n, head, hlist) {
-            if (tsk->pid == entry->pid) {
-		    hlist_del(&entry->hlist);
-		    __stp_tf_vma_release_entry(entry);
-            }
-        }
-	stp_write_unlock_irqrestore(&__stp_tf_vma_lock, flags);
+	rcu_read_lock();
+	stap_hlist_for_each_entry_rcu(entry, node, &bucket->head, hlist) {
+		if (entry->tsk == tsk)
+			__stp_tf_vma_put_entry(bucket, entry, 1);
+	}
+	rcu_read_unlock();
 	return 0;
 }

--
2.28.0

## sum.diff
--- systemtap-normal-logs/systemtap.sum	2020-10-20 00:21:11.000000000 -0700
+++ systemtap-rcu-logs/systemtap.sum	2020-10-20 10:00:51.000000000 -0700
@@ -1,4 +1,4 @@
-Test run by root on Mon Oct 19 23:57:42 2020
+Test run by root on Tue Oct 20 09:38:08 2020
 Native configuration is x86_64-pc-linux-gnu

 		=== systemtap tests ===
@@ -171,7 +171,7 @@
 PASS: at_kderef shutdown and output
 PASS: at_uderef startup
 PASS: at_uderef load generation
-FAIL: at_uderef unexpected output
+PASS: at_uderef shutdown and output
 Running /home/sultan/systemtap/testsuite/systemtap.base/atomic.exp ...
 PASS: atomic1 expected error
 PASS: atomic2 expected error
@@ -281,7 +281,7 @@
 Running /home/sultan/systemtap/testsuite/systemtap.base/bz1074541.exp ...
 PASS: ./bz1074541
 Running /home/sultan/systemtap/testsuite/systemtap.base/bz1126645.exp ...
-FAIL: bz1126645 -p5 (40)
+FAIL: bz1126645 -p5 (2)
 Running /home/sultan/systemtap/testsuite/systemtap.base/bz1214176.exp ...
 PASS: stap -p4 -e { probe nfs.proc.read_done { println(server_ip) } }
 PASS: stap -p4 -e { probe nfs.proc.read_setup { println(count) } }
@@ -657,7 +657,7 @@
 PASS: dw_entry_value shutdown and output
 Running /home/sultan/systemtap/testsuite/systemtap.base/environment_sanity.exp ...
 Host: Linux localhost.localdomain 5.8.15-201.fc32.x86_64 #1 SMP Thu Oct 15 15:56:44 UTC 2020 x86_64 x86_64 x86_64 GNU/Linux
-Snapshot: version 4.4/0.181, commit release-4.3-85-g2f7e3794ac5b
+Snapshot: version 4.4/0.181, commit release-4.3-86-g146f2c2eb284
 GCC: 10.2.1 [gcc (GCC) 10.2.1 20200723 (Red Hat 10.2.1-1)]
 Distro: Fedora release 32 (Thirty Two)
 SElinux: Enforcing
@@ -2736,7 +2736,7 @@
 PASS: listing_mode_sanity (using arguments and script exited badly)
 FAIL: listing_mode_sanity (stap -l ** exited badly)
 Running /home/sultan/systemtap/testsuite/systemtap.base/lock-pushdown.exp ...
-FAIL: lock-pushdown
+PASS: lock-pushdown
 PASS: lock-pushdown -u
 PASS: lock-pushdown compat-4.3
 Running /home/sultan/systemtap/testsuite/systemtap.base/logical_and.exp ...
@@ -2836,10 +2836,10 @@
 Running /home/sultan/systemtap/testsuite/systemtap.base/optim.exp ...
 PASS: optim
 Running /home/sultan/systemtap/testsuite/systemtap.base/optim_stats.exp ...
-FAIL: TEST1  (5, -4)
-PASS: TEST2  (20, 46)
-FAIL: TEST3  (5, 0)
-PASS: TEST4  (20, 23)
+FAIL: TEST1  (5, -6)
+PASS: TEST2  (20, 45)
+FAIL: TEST3  (5, 4)
+PASS: TEST4  (20, 24)
 Running /home/sultan/systemtap/testsuite/systemtap.base/optim_voidstmt.exp ...
 PASS: optim_voidstmt startup
 PASS: optim_voidstmt load generation
@@ -2873,7 +2873,7 @@
 FAIL: perf process (0 - 0)
 PASS: perf counter
 FAIL: perf global (0 - 0)
-PASS: counter order  100000
+PASS: counter order  200000
 Running /home/sultan/systemtap/testsuite/systemtap.base/plt.exp ...
 FAIL: plt
 FAIL: plt library
@@ -3039,7 +3039,6 @@
 PASS: PROCFS_BPF value: goodbye
 PASS: PROCFS_BPF load generation
 PASS: PROCFS_BPF shutdown and output
-FAIL: PROCFS_BPF unexpected output (after passing output)
 Running /home/sultan/systemtap/testsuite/systemtap.base/procfs.exp ...
 PASS: PROCFS startup
 PASS: PROCFS read 100
@@ -3285,7 +3284,7 @@
 PASS: register_x86: TEST 3: 8-bit and 16-bit registers for ecx (kernel): stdout: string is "ecx = 0xffffffffbeefdead\nu ecx = 0xbeefdead\n\ncx = 0xffffffffffffdead\nu cx = 0xdead\n\ncl = 0xffffffffffffffad\nu cl = 0xad\n\nch = 0xffffffffffffffde\nu ch = 0xde\n"
 PASS: register_x86: TEST 3: 8-bit and 16-bit registers for ecx (kernel): exit code: string is "0"
 PASS: register_x86: TEST 4: 8-bit and 16-bit registers for eax (kernel): stdout: string is "ebx = 0xffffffffbeefdead\nu ebx = 0xbeefdead\n\nbx = 0xffffffffffffdead\nu bx = 0xdead\n\nbl = 0xffffffffffffffad\nu bl = 0xad\n\nbh = 0xffffffffffffffde\nu bh = 0xde\n"
-PASS: register_x86: TEST 4: 8-bit and 16-bit registers for eax (kernel): exit code: string is "0"
+FAIL: register_x86: TEST 4: 8-bit and 16-bit registers for eax (kernel): exit code: string should be "0", but got "1"
 Running /home/sultan/systemtap/testsuite/systemtap.base/remote.exp ...
 PASS: remote build direct:
 PASS: remote run direct:
@@ -3898,7 +3897,7 @@
 PASS: tracepoints - kernel.trace("bla:")
 PASS: tracepoints - kernel.trace("sched:")
 PASS: systemtap.base/tracepoints.stp -w
-PASS: systemtap.base/tracepoints2.stp
+FAIL: systemtap.base/tracepoints2.stp
 Running /home/sultan/systemtap/testsuite/systemtap.base/tracepoints_list.exp ...
 UNTESTED: tracepoints_list (no perf)
 Running /home/sultan/systemtap/testsuite/systemtap.base/tracescripts.exp ...
@@ -4125,7 +4124,7 @@
 PASS: temporary.stp
 PASS: unreachable.stp
 Running /home/sultan/systemtap/testsuite/systemtap.bpf/bpf.exp ...
-KFAIL: array.stp incorrect result (PRMS: BPF)
+PASS: array.stp
 PASS: array_in.stp
 PASS: array_preinit.stp
 PASS: assignment.stp
@@ -4154,14 +4153,14 @@
 PASS: if.stp
 PASS: increment1.stp
 PASS: increment2.stp
-FAIL: kprobes.stp incorrect result
+PASS: kprobes.stp
 PASS: ktime_get_ns.stp
 PASS: logging1.stp
 PASS: logging2.stp
 FAIL: next.stp incorrect result
 PASS: no_begin.stp
-KFAIL: no_begin_no_end.stp eof (startup) (PRMS: BPF)
-KFAIL: no_end.stp unexpected output (PRMS: BPF)
+PASS: no_begin_no_end.stp
+PASS: no_end.stp
 PASS: order.stp
 PASS: perf1.stp
 FAIL: perf2.stp eof (startup)
@@ -4175,7 +4174,7 @@
 PASS: sprintf.stp
 PASS: stat1.stp
 PASS: stat2.stp
-PASS: stat3.stp
+FAIL: stat3.stp incorrect result
 FAIL: string1.stp incorrect result
 PASS: string2.stp
 FAIL: string3.stp incorrect result
@@ -4242,7 +4241,7 @@
 PASS: stat1.stp
 PASS: stat2.stp
 PASS: stat3.stp
-FAIL: string1.stp incorrect result
+PASS: string1.stp
 PASS: string2.stp
 PASS: string3.stp
 PASS: string4.stp
@@ -4282,7 +4281,7 @@
 PASS: dtrace_vfork_exec3 - build success
 PASS: dtrace_vfork_exec4 startup
 PASS: dtrace_vfork_exec4 load generation
-FAIL: dtrace_vfork_exec4 unexpected output
+PASS: dtrace_vfork_exec4 shutdown and output
 Running /home/sultan/systemtap/testsuite/systemtap.clone/main_quiesce.exp ...
 PASS: main_quiesce - compiled main_quiesce.c
 PASS: main_quiesce startup
@@ -4483,7 +4482,7 @@
 PASS: systemtap.examples/io/nfs_func_users run
 PASS: systemtap.examples/io/slowvfs support
 PASS: systemtap.examples/io/slowvfs build
-PASS: systemtap.examples/io/slowvfs run
+FAIL: systemtap.examples/io/slowvfs run
 PASS: systemtap.examples/io/switchfile build
 PASS: systemtap.examples/io/switchfile run
 PASS: systemtap.examples/io/traceio build
@@ -5077,7 +5076,7 @@
 PASS: hrtimer_onthefly - otf_start_enabled_iter_4 (valid output)
 PASS: hrtimer_onthefly - otf_start_disabled_iter_5 (valid output)
 PASS: hrtimer_onthefly - otf_start_enabled_iter_5 (valid output)
-PASS: hrtimer_onthefly - otf_timer_10ms (valid output)
+FAIL: hrtimer_onthefly - otf_timer_10ms (invalid output)
 PASS: hrtimer_onthefly - otf_timer_5ms (valid output)
 PASS: hrtimer_onthefly - otf_stress_2ms_iter_50 (survived)
 PASS: hrtimer_onthefly - otf_stress_1ms_iter_50 (survived)
@@ -5097,8 +5096,8 @@
 PASS: kprobes_onthefly - otf_start_enabled_iter_3 (valid output)
 PASS: kprobes_onthefly - otf_start_disabled_iter_4 (valid output)
 PASS: kprobes_onthefly - otf_start_enabled_iter_4 (valid output)
-PASS: kprobes_onthefly - otf_start_disabled_iter_5 (valid output)
-PASS: kprobes_onthefly - otf_start_enabled_iter_5 (valid output)
+FAIL: kprobes_onthefly - otf_start_disabled_iter_5 (invalid output)
+FAIL: kprobes_onthefly - otf_start_enabled_iter_5 (invalid output)
 PASS: kprobes_onthefly - otf_timer_100ms (valid output)
 PASS: kprobes_onthefly - otf_timer_50ms (valid output)
 PASS: kprobes_onthefly - otf_timer_10ms (valid output)
@@ -8350,6 +8349,7 @@
 Running /home/sultan/systemtap/testsuite/systemtap.server/client.exp ...
 PASS: List existing online servers
 PASS: List existing online servers
+PASS: List existing online servers
 PASS: List existing trusted servers
 PASS: List existing signing servers
 PASS: List all existing servers
@@ -8640,7 +8640,7 @@
 FAIL: 64-bit preadv nd_syscall
 PASS: 64-bit prlimit nd_syscall
 PASS: 64-bit process_vm nd_syscall
-FAIL: 64-bit ptrace nd_syscall
+PASS: 64-bit ptrace nd_syscall
 FAIL: 64-bit pwrite nd_syscall
 FAIL: 64-bit pwritev nd_syscall
 FAIL: 64-bit quotactl nd_syscall
@@ -8753,7 +8753,7 @@
 PASS: 32-bit fanotify nd_syscall
 PASS: 32-bit flock nd_syscall
 PASS: 32-bit forkwait nd_syscall
-PASS: 32-bit futex nd_syscall
+FAIL: 32-bit futex nd_syscall
 PASS: 32-bit futimes nd_syscall
 PASS: 32-bit fxattr nd_syscall
 PASS: 32-bit getcpu nd_syscall
@@ -8859,7 +8859,7 @@
 PASS: 32-bit shutdown nd_syscall
 FAIL: 32-bit sigaltstack nd_syscall
 PASS: 32-bit sigmask nd_syscall
-PASS: 32-bit signal nd_syscall
+FAIL: 32-bit signal nd_syscall
 PASS: 32-bit signalfd nd_syscall
 PASS: 32-bit socket nd_syscall
 PASS: 32-bit socketpair nd_syscall
@@ -8868,7 +8868,7 @@
 FAIL: 32-bit swap nd_syscall
 PASS: 32-bit sync nd_syscall
 FAIL: 32-bit sync_file_range nd_syscall
-PASS: 32-bit syncfs nd_syscall
+FAIL: 32-bit syncfs nd_syscall
 PASS: 32-bit sysctl nd_syscall
 FAIL: 32-bit sysfs nd_syscall
 PASS: 32-bit sysinfo nd_syscall
@@ -8988,8 +8988,8 @@
 FAIL: 64-bit pread syscall
 FAIL: 64-bit preadv syscall
 PASS: 64-bit prlimit syscall
-PASS: 64-bit process_vm syscall
-PASS: 64-bit ptrace syscall
+FAIL: 64-bit process_vm syscall
+FAIL: 64-bit ptrace syscall
 FAIL: 64-bit pwrite syscall
 FAIL: 64-bit pwritev syscall
 FAIL: 64-bit quotactl syscall
@@ -9034,7 +9034,7 @@
 PASS: 64-bit shmat syscall
 PASS: 64-bit shmget syscall
 PASS: 64-bit shutdown syscall
-PASS: 64-bit sigaltstack syscall
+FAIL: 64-bit sigaltstack syscall
 UNSUPPORTED: 64-bit sigmask syscall not supported on this arch
 PASS: 64-bit signal syscall
 PASS: 64-bit signalfd syscall
@@ -9064,7 +9064,7 @@
 PASS: 64-bit unshare syscall
 FAIL: 64-bit uselib syscall
 PASS: 64-bit userfaultfd syscall
-FAIL: 64-bit vforkwait syscall
+PASS: 64-bit vforkwait syscall
 PASS: 64-bit vhangup syscall
 PASS: 64-bit wait syscall
 PASS: 64-bit wait4 syscall
@@ -9102,7 +9102,7 @@
 PASS: 32-bit fanotify syscall
 PASS: 32-bit flock syscall
 PASS: 32-bit forkwait syscall
-PASS: 32-bit futex syscall
+FAIL: 32-bit futex syscall
 PASS: 32-bit futimes syscall
 PASS: 32-bit fxattr syscall
 PASS: 32-bit getcpu syscall
@@ -9217,7 +9217,7 @@
 FAIL: 32-bit swap syscall
 PASS: 32-bit sync syscall
 FAIL: 32-bit sync_file_range syscall
-PASS: 32-bit syncfs syscall
+FAIL: 32-bit syncfs syscall
 PASS: 32-bit sysctl syscall
 FAIL: 32-bit sysfs syscall
 PASS: 32-bit sysinfo syscall
@@ -9380,7 +9380,7 @@
 PASS: 64-bit shmat tp_syscall
 PASS: 64-bit shmget tp_syscall
 PASS: 64-bit shutdown tp_syscall
-PASS: 64-bit sigaltstack tp_syscall
+FAIL: 64-bit sigaltstack tp_syscall
 UNSUPPORTED: 64-bit sigmask tp_syscall not supported on this arch
 PASS: 64-bit signal tp_syscall
 PASS: 64-bit signalfd tp_syscall
@@ -9391,7 +9391,7 @@
 FAIL: 64-bit swap tp_syscall
 PASS: 64-bit sync tp_syscall
 PASS: 64-bit sync_file_range tp_syscall
-FAIL: 64-bit syncfs tp_syscall
+PASS: 64-bit syncfs tp_syscall
 PASS: 64-bit sysctl tp_syscall
 FAIL: 64-bit sysfs tp_syscall
 PASS: 64-bit sysinfo tp_syscall
@@ -9554,7 +9554,7 @@
 PASS: 32-bit shutdown tp_syscall
 FAIL: 32-bit sigaltstack tp_syscall
 PASS: 32-bit sigmask tp_syscall
-PASS: 32-bit signal tp_syscall
+FAIL: 32-bit signal tp_syscall
 PASS: 32-bit signalfd tp_syscall
 PASS: 32-bit socket tp_syscall
 PASS: 32-bit socketpair tp_syscall
@@ -9592,7 +9592,7 @@
 PASS: pr16806 library compile
 PASS: pr16806 exe compile
 PASS: pr16806 ko compile
-PASS: pr16806
+FAIL: pr16806 staprun
 Running /home/sultan/systemtap/testsuite/systemtap.unprivileged/unprivileged_embedded_C.exp ...
 PASS: unprivileged embedded C: Obtain list tapset functions containing embedded C
 UNTESTED: unprivileged embedded C: no embedded C: container_of_task_rcu(long)
@@ -11057,7 +11057,7 @@
 PASS: unprivileged myproc: --unprivileged process(number).statement(string)
 PASS: unprivileged myproc: --privilege=stapusr process(number).statement(string).nearest
 PASS: unprivileged myproc: --unprivileged process(string).begin
-PASS: unprivileged myproc: --privilege=stapusr process(string).end
+FAIL: unprivileged myproc: --privilege=stapusr process(string).end
 PASS: unprivileged myproc: --unprivileged process(string).function(number)
 PASS: unprivileged myproc: --privilege=stapusr process(string).function(number).call
 KFAIL: unprivileged myproc: --unprivileged process(string).function(number).inline (PRMS: GCC)
@@ -12247,11 +12247,11 @@

 		=== systemtap Summary ===

-# of expected passes		9726
-# of unexpected failures	605
+# of expected passes		9719
+# of unexpected failures	615
 # of unexpected successes	8
 # of expected failures		344
 # of unknown successes		5
-# of known failures		101
+# of known failures		98
 # of untested testcases		845
 # of unsupported tests		19

## systemtap-normal-logs.tar.xz

      
    Raw
  

              systemtap-normal-logs.tar.xz
            
          
            View raw
        
    
## systemtap-rcu-logs.tar.xz

      
    Raw
  

              systemtap-rcu-logs.tar.xz
	From f2de1ee1d341bb7ba6288474fff7ad2ca4de9a1d Mon Sep 17 00:00:00 2001
	From: Sultan Alsawaf <sultan@openresty.com>
	Date: Tue, 20 Oct 2020 08:55:24 -0700
	Subject: [PATCH] task_finder_vma: rewrite using RCU to fix performance issues

	The use of a single global rwlock to protect this file's hash table
	results in significantly degraded performance when there are many
	processes using the vma tracker in flight. A lot of time is spent
	spinning on the rwlock when this happens.

	To remedy this, make the hash table RCU safe so we'll never block upon
	reading a hash list.

	Another change made to improve performance is using the modulo of the
	jhash rather than extracting what we need via bitwise AND, to improve
	the distribution of hashes across the hash table. The task pointers
	themselves are hashed now instead of their PID for reliability, since
	PIDs are not a stable anchor point to a task struct.

	While we're at it, clean up the rest of this file to bring it up to
	current Linux kernel coding standards as well.
	---
	runtime/linux/runtime.h | 2 +
	runtime/task_finder_vma.c | 448 +++++++++++++++++++-------------------
	2 files changed, 223 insertions(+), 227 deletions(-)

	diff --git a/runtime/linux/runtime.h b/runtime/linux/runtime.h
	index 07850c345..8e1ae2c42 100644
	--- a/runtime/linux/runtime.h
	+++ b/runtime/linux/runtime.h
	@@ -89,9 +89,11 @@ static void _stp_exit(void);

	#ifdef STAPCONF_HLIST_4ARGS
	#define stap_hlist_for_each_entry(a,b,c,d) hlist_for_each_entry(a,b,c,d)
	+#define stap_hlist_for_each_entry_rcu(a,b,c,d) hlist_for_each_entry_rcu(a,b,c,d)
	#define stap_hlist_for_each_entry_safe(a,b,c,d,e) hlist_for_each_entry_safe(a,b,c,d,e)
	#else
	#define stap_hlist_for_each_entry(a,b,c,d) (void) b; hlist_for_each_entry(a,c,d)
	+#define stap_hlist_for_each_entry_rcu(a,b,c,d) (void) b; hlist_for_each_entry_rcu(a,c,d)
	#define stap_hlist_for_each_entry_safe(a,b,c,d,e) (void) b; hlist_for_each_entry_safe(a,c,d,e)
	#endif

	diff --git a/runtime/task_finder_vma.c b/runtime/task_finder_vma.c
	index b485e5b99..4dc2dc07d 100644
	--- a/runtime/task_finder_vma.c
	+++ b/runtime/task_finder_vma.c
	@@ -10,14 +10,44 @@

	#include "stp_helper_lock.h"

	-// __stp_tf_vma_lock protects the hash table.
	-// Documentation/spinlocks.txt suggest we can be a bit more clever
	-// if we guarantee that in interrupt context we only read, not write
	-// the datastructures. We should never change the hash table or the
	-// contents in interrupt context (which should only ever call
	-// stap_find_vma_map_info for getting stored vma info). So we might
	-// want to look into that if this seems a bottleneck.
	-static STP_DEFINE_RWLOCK(__stp_tf_vma_lock);
	+/* atomic_try_cmpxchg and atomic_fetch_add_unless fallback from newer kernels */
	+#ifndef atomic_try_cmpxchg
	+static __always_inline bool
	+atomic_try_cmpxchg(atomic_t *v, int *old, int new)
	+{
	+ int r, o = *old;
	+ r = atomic_cmpxchg(v, o, new);
	+ if (unlikely(r != o))
	+ *old = r;
	+ return likely(r == o);
	+}
	+#define atomic_try_cmpxchg atomic_try_cmpxchg
	+#endif
	+
	+#ifndef atomic_fetch_add_unless
	+/**
	+ * atomic_fetch_add_unless - add unless the number is already a given value
	+ * @v: pointer of type atomic_t
	+ * @a: the amount to add to v...
	+ * @u: ...unless v is equal to u.
	+ *
	+ * Atomically adds @a to @v, so long as @v was not already @u.
	+ * Returns original value of @v
	+ */
	+static __always_inline int
	+atomic_fetch_add_unless(atomic_t *v, int a, int u)
	+{
	+ int c = atomic_read(v);
	+
	+ do {
	+ if (unlikely(c == u))
	+ break;
	+ } while (!atomic_try_cmpxchg(v, &c, c + a));
	+
	+ return c;
	+}
	+#define atomic_fetch_add_unless atomic_fetch_add_unless
	+#endif

	#define __STP_TF_HASH_BITS 4
	#define __STP_TF_TABLE_SIZE (1 << __STP_TF_HASH_BITS)
	@@ -28,20 +58,26 @@ static STP_DEFINE_RWLOCK(__stp_tf_vma_lock);
	#error "gimme a little more TASK_FINDER_VMA_ENTRY_PATHLEN"
	#endif

	-
	struct __stp_tf_vma_entry {
	struct hlist_node hlist;

	- pid_t pid;
	+ struct rcu_head rcu;
	+ atomic_t refcount;
	+ struct task_struct *tsk;
	unsigned long vm_start;
	unsigned long vm_end;
	- char path[TASK_FINDER_VMA_ENTRY_PATHLEN]; /* mmpath name, if known */
	+ char path[TASK_FINDER_VMA_ENTRY_PATHLEN]; /* mmpath name, if known */

	// User data (possibly stp_module)
	void *user;
	};

	-static struct hlist_head *__stp_tf_vma_map;
	+struct __stp_tf_vma_bucket {
	+ struct hlist_head head;
	+ spinlock_t lock;
	+};
	+
	+static struct __stp_tf_vma_bucket *__stp_tf_vma_map;

	// __stp_tf_vma_new_entry(): Returns an newly allocated or NULL.
	// Must only be called from user context.
	@@ -51,22 +87,38 @@ static struct __stp_tf_vma_entry *
	__stp_tf_vma_new_entry(void)
	{
	struct __stp_tf_vma_entry *entry;
	- size_t size = sizeof (struct __stp_tf_vma_entry);
	+ // Alloc using kmalloc rather than the stp variant. This way the RCU
	+ // callback freeing the entries will not depend on using a function
	+ // within this module to free the allocated memory (_stp_kfree), which
	+ // lets us omit a costly rcu_barrier operation upon module unload.
	#ifdef CONFIG_UTRACE
	- entry = (struct __stp_tf_vma_entry *) _stp_kmalloc_gfp(size,
	- STP_ALLOC_SLEEP_FLAGS);
	+ entry = kmalloc(sizeof(*entry), STP_ALLOC_SLEEP_FLAGS);
	#else
	- entry = (struct __stp_tf_vma_entry *) _stp_kmalloc_gfp(size,
	- STP_ALLOC_FLAGS);
	+ entry = kmalloc(sizeof(*entry), STP_ALLOC_FLAGS);
	#endif
	return entry;
	}

	-// __stp_tf_vma_release_entry(): Frees an entry.
	+// __stp_tf_vma_put_entry(): Put a specified number of references on the entry.
	static void
	-__stp_tf_vma_release_entry(struct __stp_tf_vma_entry *entry)
	+__stp_tf_vma_put_entry(struct __stp_tf_vma_bucket *bucket,
	+ struct __stp_tf_vma_entry *entry, int count)
	{
	- _stp_kfree (entry);
	+ unsigned long flags;
	+ int old;
	+
	+ // We must atomically subtract only if the refcount is non-zero, as well
	+ // as check to see if the new refcount is zero, in which case we should
	+ // free the entry.
	+ old = atomic_fetch_add_unless(&entry->refcount, -count, 0);
	+ if (old - count)
	+ return;
	+
	+ spin_lock_irqsave(&bucket->lock, flags);
	+ hlist_del_rcu(&entry->hlist);
	+ spin_unlock_irqrestore(&bucket->lock, flags);
	+
	+ kfree_rcu(entry, rcu);
	}

	// stap_initialize_vma_map(): Initialize the free list. Grabs the
	@@ -77,145 +129,127 @@ __stp_tf_vma_release_entry(struct __stp_tf_vma_entry *entry)
	static int
	stap_initialize_vma_map(void)
	{
	- size_t size = sizeof(struct hlist_head) * __STP_TF_TABLE_SIZE;
	- struct hlist_head *map = (struct hlist_head *) _stp_kzalloc_gfp(size,
	- STP_ALLOC_SLEEP_FLAGS);
	- if (map == NULL)
	+ struct __stp_tf_vma_bucket *buckets;
	+ int i;
	+
	+ buckets = _stp_kmalloc_gfp(sizeof(*buckets) * __STP_TF_TABLE_SIZE,
	+ STP_ALLOC_SLEEP_FLAGS);
	+ if (!buckets)
	return -ENOMEM;

	- __stp_tf_vma_map = map;
	+ for (i = 0; i < __STP_TF_TABLE_SIZE; i++) {
	+ struct __stp_tf_vma_bucket *bucket = &buckets[i];
	+
	+ INIT_HLIST_HEAD(&bucket->head);
	+ spin_lock_init(&bucket->lock);
	+ }
	+
	+ __stp_tf_vma_map = buckets;
	return 0;
	}

	// stap_destroy_vma_map(): Unconditionally destroys vma entries.
	-// Nothing should be using it anymore. Doesn't lock anything and just
	-// frees all items.
	+// Nothing should be using it anymore.
	static void
	stap_destroy_vma_map(void)
	{
	- if (__stp_tf_vma_map != NULL) {
	- int i;
	- for (i = 0; i < __STP_TF_TABLE_SIZE; i++) {
	- struct hlist_head *head = &__stp_tf_vma_map[i];
	- struct hlist_node *node;
	- struct hlist_node *n;
	- struct __stp_tf_vma_entry *entry = NULL;
	-
	- if (hlist_empty(head))
	- continue;
	-
	- stap_hlist_for_each_entry_safe(entry, node, n, head, hlist) {
	- hlist_del(&entry->hlist);
	- __stp_tf_vma_release_entry(entry);
	- }
	- }
	- _stp_kfree(__stp_tf_vma_map);
	+ int i;
	+
	+ if (!__stp_tf_vma_map)
	+ return;
	+
	+ for (i = 0; i < __STP_TF_TABLE_SIZE; i++) {
	+ struct __stp_tf_vma_bucket *bucket = &__stp_tf_vma_map[i];
	+ struct __stp_tf_vma_entry *entry;
	+ struct hlist_node *node;
	+
	+ rcu_read_lock();
	+ stap_hlist_for_each_entry_rcu(entry, node, &bucket->head, hlist)
	+ __stp_tf_vma_put_entry(bucket, entry, 1);
	+ rcu_read_unlock();
	}
	-}

	+ _stp_kfree(__stp_tf_vma_map);
	+}

	// __stp_tf_vma_map_hash(): Compute the vma map hash.
	static inline u32
	__stp_tf_vma_map_hash(struct task_struct *tsk)
	{
	- return (jhash_1word(tsk->pid, 0) & (__STP_TF_TABLE_SIZE - 1));
	-}
	-
	-// Get vma_entry if the vma is present in the vma map hash table.
	-// Returns NULL if not present. The __stp_tf_vma_lock must be read locked
	-// before calling this function.
	-static struct __stp_tf_vma_entry *
	-__stp_tf_get_vma_map_entry_internal(struct task_struct *tsk,
	- unsigned long vm_start)
	-{
	- struct hlist_head *head;
	- struct hlist_node *node;
	- struct __stp_tf_vma_entry *entry;
	+ u32 hash = jhash2((u32 *)&tsk, sizeof(tsk) / sizeof(u32), 0);

	- head = &__stp_tf_vma_map[__stp_tf_vma_map_hash(tsk)];
	- stap_hlist_for_each_entry(entry, node, head, hlist) {
	- if (tsk->pid == entry->pid
	- && vm_start == entry->vm_start) {
	- return entry;
	- }
	- }
	- return NULL;
	+ return hash % (__STP_TF_TABLE_SIZE - 1);
	}

	-// Get vma_entry if the vma with the given vm_end is present in the vma map
	-// hash table for the tsk. Returns NULL if not present.
	-// The __stp_tf_vma_lock must be read locked before calling this function.
	-static struct __stp_tf_vma_entry *
	-__stp_tf_get_vma_map_entry_end_internal(struct task_struct *tsk,
	- unsigned long vm_end)
	+// __stp_tf_vma_bucket(): Get the bucket that should contain the task.
	+static inline struct __stp_tf_vma_bucket *
	+__stp_tf_get_vma_bucket(struct task_struct *tsk)
	{
	- struct hlist_head *head;
	- struct hlist_node *node;
	- struct __stp_tf_vma_entry *entry;
	-
	- head = &__stp_tf_vma_map[__stp_tf_vma_map_hash(tsk)];
	- stap_hlist_for_each_entry(entry, node, head, hlist) {
	- if (tsk->pid == entry->pid
	- && vm_end == entry->vm_end) {
	- return entry;
	- }
	- }
	- return NULL;
	+ return &__stp_tf_vma_map[__stp_tf_vma_map_hash(tsk)];
	}

	+// Get vma entry if the vma is present in the vma map hash table satisfying the
	+// given condition.
	+#define __stp_tf_get_vma_map(bucket, tsk, acquire, condition) \
	+({ \
	+ struct __stp_tf_vma_entry *entry, *found = NULL; \
	+ struct hlist_node *node; \
	+ \
	+ rcu_read_lock(); \
	+ stap_hlist_for_each_entry_rcu(entry, node, &bucket->head, hlist) { \
	+ if (entry->tsk == tsk && (condition) && \
	+ atomic_add_unless(&entry->refcount, acquire, 0)) { \
	+ found = entry; \
	+ break; \
	+ } \
	+ } \
	+ rcu_read_unlock(); \
	+ \
	+ found; \
	+})

	// Add the vma info to the vma map hash table.
	// Caller is responsible for name lifetime.
	// Can allocate memory, so needs to be called
	// only from user context.
	static int
	-stap_add_vma_map_info(struct task_struct *tsk,
	- unsigned long vm_start, unsigned long vm_end,
	- const char *path, void *user)
	+stap_add_vma_map_info(struct task_struct *tsk, unsigned long vm_start,
	+ unsigned long vm_end, const char *path, void *user)
	{
	- struct hlist_head *head;
	- struct hlist_node *node;
	+ struct __stp_tf_vma_bucket *bucket = __stp_tf_get_vma_bucket(tsk);
	struct __stp_tf_vma_entry *entry;
	- struct __stp_tf_vma_entry *new_entry;
	+ struct hlist_node *node;
	unsigned long flags;
	+ size_t path_len;

	- // Take a write lock, since we are most likely going to write
	- // after reading. But reserve a new entry first outside the lock.
	- new_entry = __stp_tf_vma_new_entry();
	- stp_write_lock_irqsave(&__stp_tf_vma_lock, flags);
	- entry = __stp_tf_get_vma_map_entry_internal(tsk, vm_start);
	- if (entry != NULL) {
	- stp_write_unlock_irqrestore(&__stp_tf_vma_lock, flags);
	- if (new_entry)
	- __stp_tf_vma_release_entry(new_entry);
	- return -EBUSY; /* Already there */
	- }
	+ // Check if the entry already exists
	+ if (__stp_tf_get_vma_map(bucket, tsk, 0, entry->vm_start == vm_start))
	+ return -EEXIST;

	- if (!new_entry) {
	- stp_write_unlock_irqrestore(&__stp_tf_vma_lock, flags);
	+ entry = __stp_tf_vma_new_entry();
	+ if (!entry)
	return -ENOMEM;
	- }

	- // Fill in the info
	- entry = new_entry;
	- entry->pid = tsk->pid;
	+ // Fill in the new entry
	+ entry->refcount = (atomic_t)ATOMIC_INIT(1);
	+ entry->tsk = tsk;
	entry->vm_start = vm_start;
	entry->vm_end = vm_end;
	- if (strlen(path) >= TASK_FINDER_VMA_ENTRY_PATHLEN-3)
	- {
	- strlcpy (entry->path, "...", TASK_FINDER_VMA_ENTRY_PATHLEN);
	- strlcpy (entry->path+3, &path[strlen(path)-TASK_FINDER_VMA_ENTRY_PATHLEN+4],
	- TASK_FINDER_VMA_ENTRY_PATHLEN-3);
	- }
	- else
	- {
	- strlcpy (entry->path, path, TASK_FINDER_VMA_ENTRY_PATHLEN);
	- }
	entry->user = user;

	- head = &__stp_tf_vma_map[__stp_tf_vma_map_hash(tsk)];
	- hlist_add_head(&entry->hlist, head);
	- stp_write_unlock_irqrestore(&__stp_tf_vma_lock, flags);
	+ path_len = strlen(path);
	+ if (path_len >= TASK_FINDER_VMA_ENTRY_PATHLEN - 3) {
	+ strlcpy(entry->path, "...", TASK_FINDER_VMA_ENTRY_PATHLEN);
	+ strlcpy(entry->path + 3,
	+ &path[path_len - TASK_FINDER_VMA_ENTRY_PATHLEN + 4],
	+ TASK_FINDER_VMA_ENTRY_PATHLEN - 3);
	+ } else {
	+ strlcpy(entry->path, path, TASK_FINDER_VMA_ENTRY_PATHLEN);
	+ }
	+
	+ spin_lock_irqsave(&bucket->lock, flags);
	+ hlist_add_head_rcu(&entry->hlist, &bucket->head);
	+ spin_unlock_irqrestore(&bucket->lock, flags);
	return 0;
	}

	@@ -224,26 +258,19 @@ stap_add_vma_map_info(struct task_struct *tsk,
	// task. Returns zero on success, -ESRCH if no existing matching entry could
	// be found.
	static int
	-stap_extend_vma_map_info(struct task_struct *tsk,
	- unsigned long vm_start, unsigned long vm_end)
	+stap_extend_vma_map_info(struct task_struct *tsk, unsigned long vm_start,
	+ unsigned long vm_end)
	{
	- struct hlist_head *head;
	- struct hlist_node *node;
	+ struct __stp_tf_vma_bucket *bucket = __stp_tf_get_vma_bucket(tsk);
	struct __stp_tf_vma_entry *entry;

	- unsigned long flags;
	- int res = -ESRCH; // Entry not there or doesn't match.
	-
	- // Take a write lock, since we are most likely going to write
	- // to the entry after reading, if its vm_end matches our vm_start.
	- stp_write_lock_irqsave(&__stp_tf_vma_lock, flags);
	- entry = __stp_tf_get_vma_map_entry_end_internal(tsk, vm_start);
	- if (entry != NULL) {
	- entry->vm_end = vm_end;
	- res = 0;
	- }
	- stp_write_unlock_irqrestore(&__stp_tf_vma_lock, flags);
	- return res;
	+ entry = __stp_tf_get_vma_map(bucket, tsk, 1, entry->vm_end == vm_start);
	+ if (!entry)
	+ return -ESRCH;
	+
	+ entry->vm_end = vm_end;
	+ __stp_tf_vma_put_entry(bucket, entry, 1);
	+ return 0;
	}


	@@ -252,128 +279,95 @@ stap_extend_vma_map_info(struct task_struct *tsk,
	static int
	stap_remove_vma_map_info(struct task_struct *tsk, unsigned long vm_start)
	{
	- struct hlist_head *head;
	- struct hlist_node *node;
	+ struct __stp_tf_vma_bucket *bucket = __stp_tf_get_vma_bucket(tsk);
	struct __stp_tf_vma_entry *entry;
	- int rc = -ESRCH;

	- // Take a write lock since we are most likely going to delete
	- // after reading.
	- unsigned long flags;
	- stp_write_lock_irqsave(&__stp_tf_vma_lock, flags);
	- entry = __stp_tf_get_vma_map_entry_internal(tsk, vm_start);
	- if (entry != NULL) {
	- hlist_del(&entry->hlist);
	- __stp_tf_vma_release_entry(entry);
	- rc = 0;
	- }
	- stp_write_unlock_irqrestore(&__stp_tf_vma_lock, flags);
	- return rc;
	+ entry = __stp_tf_get_vma_map(bucket, tsk, 1, entry->vm_start == vm_start);
	+ if (!entry)
	+ return -ESRCH;
	+
	+ // Put two references: one for the reference we just got,
	+ // and another to free the entry.
	+ __stp_tf_vma_put_entry(bucket, entry, 2);
	+ return 0;
	}

	// Finds vma info if the vma is present in the vma map hash table for
	// a given task and address (between vm_start and vm_end).
	-// Returns -ESRCH if not present. The __stp_tf_vma_lock must not be
	-// locked before calling this function.
	+// Returns -ESRCH if not present.
	static int
	stap_find_vma_map_info(struct task_struct *tsk, unsigned long addr,
	unsigned long *vm_start, unsigned long *vm_end,
	const char **path, void **user)
	{
	- struct hlist_head *head;
	- struct hlist_node *node;
	+ struct __stp_tf_vma_bucket *bucket;
	struct __stp_tf_vma_entry *entry;
	- struct __stp_tf_vma_entry *found_entry = NULL;
	- int rc = -ESRCH;
	- unsigned long flags;
	-
	- if (__stp_tf_vma_map == NULL)
	- return rc;

	- stp_read_lock_irqsave(&__stp_tf_vma_lock, flags);
	- head = &__stp_tf_vma_map[__stp_tf_vma_map_hash(tsk)];
	- stap_hlist_for_each_entry(entry, node, head, hlist) {
	- if (tsk->pid == entry->pid
	- && addr >= entry->vm_start
	- && addr < entry->vm_end) {
	- found_entry = entry;
	- break;
	- }
	- }
	- if (found_entry != NULL) {
	- if (vm_start != NULL)
	- *vm_start = found_entry->vm_start;
	- if (vm_end != NULL)
	- *vm_end = found_entry->vm_end;
	- if (path != NULL)
	- *path = found_entry->path;
	- if (user != NULL)
	- *user = found_entry->user;
	- rc = 0;
	- }
	- stp_read_unlock_irqrestore(&__stp_tf_vma_lock, flags);
	- return rc;
	+ if (!__stp_tf_vma_map)
	+ return -ESRCH;
	+
	+ bucket = __stp_tf_get_vma_bucket(tsk);
	+ entry = __stp_tf_get_vma_map(bucket, tsk, 1, addr >= entry->vm_start &&
	+ addr < entry->vm_end);
	+ if (!entry)
	+ return -ESRCH;
	+
	+ if (vm_start)
	+ *vm_start = entry->vm_start;
	+ if (vm_end)
	+ *vm_end = entry->vm_end;
	+ if (path)
	+ *path = entry->path;
	+ if (user)
	+ *user = entry->user;
	+
	+ __stp_tf_vma_put_entry(bucket, entry, 1);
	+ return 0;
	}

	// Finds vma info if the vma is present in the vma map hash table for
	// a given task with the given user handle.
	-// Returns -ESRCH if not present. The __stp_tf_vma_lock must not be
	-// locked before calling this function.
	+// Returns -ESRCH if not present.
	static int
	stap_find_vma_map_info_user(struct task_struct *tsk, void *user,
	unsigned long *vm_start, unsigned long *vm_end,
	const char **path)
	{
	- struct hlist_head *head;
	- struct hlist_node *node;
	+ struct __stp_tf_vma_bucket *bucket;
	struct __stp_tf_vma_entry *entry;
	- struct __stp_tf_vma_entry *found_entry = NULL;
	- int rc = -ESRCH;
	- unsigned long flags;

	- if (__stp_tf_vma_map == NULL)
	- return rc;
	+ if (!__stp_tf_vma_map)
	+ return -ESRCH;

	- stp_read_lock_irqsave(&__stp_tf_vma_lock, flags);
	- head = &__stp_tf_vma_map[__stp_tf_vma_map_hash(tsk)];
	- stap_hlist_for_each_entry(entry, node, head, hlist) {
	- if (tsk->pid == entry->pid
	- && user == entry->user) {
	- found_entry = entry;
	- break;
	- }
	- }
	- if (found_entry != NULL) {
	- if (vm_start != NULL)
	- *vm_start = found_entry->vm_start;
	- if (vm_end != NULL)
	- *vm_end = found_entry->vm_end;
	- if (path != NULL)
	- *path = found_entry->path;
	- rc = 0;
	- }
	- stp_read_unlock_irqrestore(&__stp_tf_vma_lock, flags);
	- return rc;
	+ bucket = __stp_tf_get_vma_bucket(tsk);
	+ entry = __stp_tf_get_vma_map(bucket, tsk, 1, entry->user == user);
	+ if (!entry)
	+ return -ESRCH;
	+
	+ if (vm_start)
	+ *vm_start = entry->vm_start;
	+ if (vm_end)
	+ *vm_end = entry->vm_end;
	+ if (path)
	+ *path = entry->path;
	+
	+ __stp_tf_vma_put_entry(bucket, entry, 1);
	+ return 0;
	}

	static int
	stap_drop_vma_maps(struct task_struct *tsk)
	{
	- struct hlist_head *head;
	- struct hlist_node *node;
	- struct hlist_node *n;
	+ struct __stp_tf_vma_bucket *bucket = __stp_tf_get_vma_bucket(tsk);
	struct __stp_tf_vma_entry *entry;
	+ struct hlist_node *node;

	- unsigned long flags;
	- stp_write_lock_irqsave(&__stp_tf_vma_lock, flags);
	- head = &__stp_tf_vma_map[__stp_tf_vma_map_hash(tsk)];
	- stap_hlist_for_each_entry_safe(entry, node, n, head, hlist) {
	- if (tsk->pid == entry->pid) {
	- hlist_del(&entry->hlist);
	- __stp_tf_vma_release_entry(entry);
	- }
	- }
	- stp_write_unlock_irqrestore(&__stp_tf_vma_lock, flags);
	+ rcu_read_lock();
	+ stap_hlist_for_each_entry_rcu(entry, node, &bucket->head, hlist) {
	+ if (entry->tsk == tsk)
	+ __stp_tf_vma_put_entry(bucket, entry, 1);
	+ }
	+ rcu_read_unlock();
	return 0;
	}

	--
	2.28.0
	--- systemtap-normal-logs/systemtap.sum 2020-10-20 00:21:11.000000000 -0700
	+++ systemtap-rcu-logs/systemtap.sum 2020-10-20 10:00:51.000000000 -0700
	@@ -1,4 +1,4 @@
	-Test run by root on Mon Oct 19 23:57:42 2020
	+Test run by root on Tue Oct 20 09:38:08 2020
	Native configuration is x86_64-pc-linux-gnu

	=== systemtap tests ===
	@@ -171,7 +171,7 @@
	PASS: at_kderef shutdown and output
	PASS: at_uderef startup
	PASS: at_uderef load generation
	-FAIL: at_uderef unexpected output
	+PASS: at_uderef shutdown and output
	Running /home/sultan/systemtap/testsuite/systemtap.base/atomic.exp ...
	PASS: atomic1 expected error
	PASS: atomic2 expected error
	@@ -281,7 +281,7 @@
	Running /home/sultan/systemtap/testsuite/systemtap.base/bz1074541.exp ...
	PASS: ./bz1074541
	Running /home/sultan/systemtap/testsuite/systemtap.base/bz1126645.exp ...
	-FAIL: bz1126645 -p5 (40)
	+FAIL: bz1126645 -p5 (2)
	Running /home/sultan/systemtap/testsuite/systemtap.base/bz1214176.exp ...
	PASS: stap -p4 -e { probe nfs.proc.read_done { println(server_ip) } }
	PASS: stap -p4 -e { probe nfs.proc.read_setup { println(count) } }
	@@ -657,7 +657,7 @@
	PASS: dw_entry_value shutdown and output
	Running /home/sultan/systemtap/testsuite/systemtap.base/environment_sanity.exp ...
	Host: Linux localhost.localdomain 5.8.15-201.fc32.x86_64 #1 SMP Thu Oct 15 15:56:44 UTC 2020 x86_64 x86_64 x86_64 GNU/Linux
	-Snapshot: version 4.4/0.181, commit release-4.3-85-g2f7e3794ac5b
	+Snapshot: version 4.4/0.181, commit release-4.3-86-g146f2c2eb284
	GCC: 10.2.1 [gcc (GCC) 10.2.1 20200723 (Red Hat 10.2.1-1)]
	Distro: Fedora release 32 (Thirty Two)
	SElinux: Enforcing
	@@ -2736,7 +2736,7 @@
	PASS: listing_mode_sanity (using arguments and script exited badly)
	FAIL: listing_mode_sanity (stap -l ** exited badly)
	Running /home/sultan/systemtap/testsuite/systemtap.base/lock-pushdown.exp ...
	-FAIL: lock-pushdown
	+PASS: lock-pushdown
	PASS: lock-pushdown -u
	PASS: lock-pushdown compat-4.3
	Running /home/sultan/systemtap/testsuite/systemtap.base/logical_and.exp ...
	@@ -2836,10 +2836,10 @@
	Running /home/sultan/systemtap/testsuite/systemtap.base/optim.exp ...
	PASS: optim
	Running /home/sultan/systemtap/testsuite/systemtap.base/optim_stats.exp ...
	-FAIL: TEST1 (5, -4)
	-PASS: TEST2 (20, 46)
	-FAIL: TEST3 (5, 0)
	-PASS: TEST4 (20, 23)
	+FAIL: TEST1 (5, -6)
	+PASS: TEST2 (20, 45)
	+FAIL: TEST3 (5, 4)
	+PASS: TEST4 (20, 24)
	Running /home/sultan/systemtap/testsuite/systemtap.base/optim_voidstmt.exp ...
	PASS: optim_voidstmt startup
	PASS: optim_voidstmt load generation
	@@ -2873,7 +2873,7 @@
	FAIL: perf process (0 - 0)
	PASS: perf counter
	FAIL: perf global (0 - 0)
	-PASS: counter order 100000
	+PASS: counter order 200000
	Running /home/sultan/systemtap/testsuite/systemtap.base/plt.exp ...
	FAIL: plt
	FAIL: plt library
	@@ -3039,7 +3039,6 @@
	PASS: PROCFS_BPF value: goodbye
	PASS: PROCFS_BPF load generation
	PASS: PROCFS_BPF shutdown and output
	-FAIL: PROCFS_BPF unexpected output (after passing output)
	Running /home/sultan/systemtap/testsuite/systemtap.base/procfs.exp ...
	PASS: PROCFS startup
	PASS: PROCFS read 100
	@@ -3285,7 +3284,7 @@
	PASS: register_x86: TEST 3: 8-bit and 16-bit registers for ecx (kernel): stdout: string is "ecx = 0xffffffffbeefdead\nu ecx = 0xbeefdead\n\ncx = 0xffffffffffffdead\nu cx = 0xdead\n\ncl = 0xffffffffffffffad\nu cl = 0xad\n\nch = 0xffffffffffffffde\nu ch = 0xde\n"
	PASS: register_x86: TEST 3: 8-bit and 16-bit registers for ecx (kernel): exit code: string is "0"
	PASS: register_x86: TEST 4: 8-bit and 16-bit registers for eax (kernel): stdout: string is "ebx = 0xffffffffbeefdead\nu ebx = 0xbeefdead\n\nbx = 0xffffffffffffdead\nu bx = 0xdead\n\nbl = 0xffffffffffffffad\nu bl = 0xad\n\nbh = 0xffffffffffffffde\nu bh = 0xde\n"
	-PASS: register_x86: TEST 4: 8-bit and 16-bit registers for eax (kernel): exit code: string is "0"
	+FAIL: register_x86: TEST 4: 8-bit and 16-bit registers for eax (kernel): exit code: string should be "0", but got "1"
	Running /home/sultan/systemtap/testsuite/systemtap.base/remote.exp ...
	PASS: remote build direct:
	PASS: remote run direct:
	@@ -3898,7 +3897,7 @@
	PASS: tracepoints - kernel.trace("bla:")
	PASS: tracepoints - kernel.trace("sched:")
	PASS: systemtap.base/tracepoints.stp -w
	-PASS: systemtap.base/tracepoints2.stp
	+FAIL: systemtap.base/tracepoints2.stp
	Running /home/sultan/systemtap/testsuite/systemtap.base/tracepoints_list.exp ...
	UNTESTED: tracepoints_list (no perf)
	Running /home/sultan/systemtap/testsuite/systemtap.base/tracescripts.exp ...
	@@ -4125,7 +4124,7 @@
	PASS: temporary.stp
	PASS: unreachable.stp
	Running /home/sultan/systemtap/testsuite/systemtap.bpf/bpf.exp ...
	-KFAIL: array.stp incorrect result (PRMS: BPF)
	+PASS: array.stp
	PASS: array_in.stp
	PASS: array_preinit.stp
	PASS: assignment.stp
	@@ -4154,14 +4153,14 @@
	PASS: if.stp
	PASS: increment1.stp
	PASS: increment2.stp
	-FAIL: kprobes.stp incorrect result
	+PASS: kprobes.stp
	PASS: ktime_get_ns.stp
	PASS: logging1.stp
	PASS: logging2.stp
	FAIL: next.stp incorrect result
	PASS: no_begin.stp
	-KFAIL: no_begin_no_end.stp eof (startup) (PRMS: BPF)
	-KFAIL: no_end.stp unexpected output (PRMS: BPF)
	+PASS: no_begin_no_end.stp
	+PASS: no_end.stp
	PASS: order.stp
	PASS: perf1.stp
	FAIL: perf2.stp eof (startup)
	@@ -4175,7 +4174,7 @@
	PASS: sprintf.stp
	PASS: stat1.stp
	PASS: stat2.stp
	-PASS: stat3.stp
	+FAIL: stat3.stp incorrect result
	FAIL: string1.stp incorrect result
	PASS: string2.stp
	FAIL: string3.stp incorrect result
	@@ -4242,7 +4241,7 @@
	PASS: stat1.stp
	PASS: stat2.stp
	PASS: stat3.stp
	-FAIL: string1.stp incorrect result
	+PASS: string1.stp
	PASS: string2.stp
	PASS: string3.stp
	PASS: string4.stp
	@@ -4282,7 +4281,7 @@
	PASS: dtrace_vfork_exec3 - build success
	PASS: dtrace_vfork_exec4 startup
	PASS: dtrace_vfork_exec4 load generation
	-FAIL: dtrace_vfork_exec4 unexpected output
	+PASS: dtrace_vfork_exec4 shutdown and output
	Running /home/sultan/systemtap/testsuite/systemtap.clone/main_quiesce.exp ...
	PASS: main_quiesce - compiled main_quiesce.c
	PASS: main_quiesce startup
	@@ -4483,7 +4482,7 @@
	PASS: systemtap.examples/io/nfs_func_users run
	PASS: systemtap.examples/io/slowvfs support
	PASS: systemtap.examples/io/slowvfs build
	-PASS: systemtap.examples/io/slowvfs run
	+FAIL: systemtap.examples/io/slowvfs run
	PASS: systemtap.examples/io/switchfile build
	PASS: systemtap.examples/io/switchfile run
	PASS: systemtap.examples/io/traceio build
	@@ -5077,7 +5076,7 @@
	PASS: hrtimer_onthefly - otf_start_enabled_iter_4 (valid output)
	PASS: hrtimer_onthefly - otf_start_disabled_iter_5 (valid output)
	PASS: hrtimer_onthefly - otf_start_enabled_iter_5 (valid output)
	-PASS: hrtimer_onthefly - otf_timer_10ms (valid output)
	+FAIL: hrtimer_onthefly - otf_timer_10ms (invalid output)
	PASS: hrtimer_onthefly - otf_timer_5ms (valid output)
	PASS: hrtimer_onthefly - otf_stress_2ms_iter_50 (survived)
	PASS: hrtimer_onthefly - otf_stress_1ms_iter_50 (survived)
	@@ -5097,8 +5096,8 @@
	PASS: kprobes_onthefly - otf_start_enabled_iter_3 (valid output)
	PASS: kprobes_onthefly - otf_start_disabled_iter_4 (valid output)
	PASS: kprobes_onthefly - otf_start_enabled_iter_4 (valid output)
	-PASS: kprobes_onthefly - otf_start_disabled_iter_5 (valid output)
	-PASS: kprobes_onthefly - otf_start_enabled_iter_5 (valid output)
	+FAIL: kprobes_onthefly - otf_start_disabled_iter_5 (invalid output)
	+FAIL: kprobes_onthefly - otf_start_enabled_iter_5 (invalid output)
	PASS: kprobes_onthefly - otf_timer_100ms (valid output)
	PASS: kprobes_onthefly - otf_timer_50ms (valid output)
	PASS: kprobes_onthefly - otf_timer_10ms (valid output)
	@@ -8350,6 +8349,7 @@
	Running /home/sultan/systemtap/testsuite/systemtap.server/client.exp ...
	PASS: List existing online servers
	PASS: List existing online servers
	+PASS: List existing online servers
	PASS: List existing trusted servers
	PASS: List existing signing servers
	PASS: List all existing servers
	@@ -8640,7 +8640,7 @@
	FAIL: 64-bit preadv nd_syscall
	PASS: 64-bit prlimit nd_syscall
	PASS: 64-bit process_vm nd_syscall
	-FAIL: 64-bit ptrace nd_syscall
	+PASS: 64-bit ptrace nd_syscall
	FAIL: 64-bit pwrite nd_syscall
	FAIL: 64-bit pwritev nd_syscall
	FAIL: 64-bit quotactl nd_syscall
	@@ -8753,7 +8753,7 @@
	PASS: 32-bit fanotify nd_syscall
	PASS: 32-bit flock nd_syscall
	PASS: 32-bit forkwait nd_syscall
	-PASS: 32-bit futex nd_syscall
	+FAIL: 32-bit futex nd_syscall
	PASS: 32-bit futimes nd_syscall
	PASS: 32-bit fxattr nd_syscall
	PASS: 32-bit getcpu nd_syscall
	@@ -8859,7 +8859,7 @@
	PASS: 32-bit shutdown nd_syscall
	FAIL: 32-bit sigaltstack nd_syscall
	PASS: 32-bit sigmask nd_syscall
	-PASS: 32-bit signal nd_syscall
	+FAIL: 32-bit signal nd_syscall
	PASS: 32-bit signalfd nd_syscall
	PASS: 32-bit socket nd_syscall
	PASS: 32-bit socketpair nd_syscall
	@@ -8868,7 +8868,7 @@
	FAIL: 32-bit swap nd_syscall
	PASS: 32-bit sync nd_syscall
	FAIL: 32-bit sync_file_range nd_syscall
	-PASS: 32-bit syncfs nd_syscall
	+FAIL: 32-bit syncfs nd_syscall
	PASS: 32-bit sysctl nd_syscall
	FAIL: 32-bit sysfs nd_syscall
	PASS: 32-bit sysinfo nd_syscall
	@@ -8988,8 +8988,8 @@
	FAIL: 64-bit pread syscall
	FAIL: 64-bit preadv syscall
	PASS: 64-bit prlimit syscall
	-PASS: 64-bit process_vm syscall
	-PASS: 64-bit ptrace syscall
	+FAIL: 64-bit process_vm syscall
	+FAIL: 64-bit ptrace syscall
	FAIL: 64-bit pwrite syscall
	FAIL: 64-bit pwritev syscall
	FAIL: 64-bit quotactl syscall
	@@ -9034,7 +9034,7 @@
	PASS: 64-bit shmat syscall
	PASS: 64-bit shmget syscall
	PASS: 64-bit shutdown syscall
	-PASS: 64-bit sigaltstack syscall
	+FAIL: 64-bit sigaltstack syscall
	UNSUPPORTED: 64-bit sigmask syscall not supported on this arch
	PASS: 64-bit signal syscall
	PASS: 64-bit signalfd syscall
	@@ -9064,7 +9064,7 @@
	PASS: 64-bit unshare syscall
	FAIL: 64-bit uselib syscall
	PASS: 64-bit userfaultfd syscall
	-FAIL: 64-bit vforkwait syscall
	+PASS: 64-bit vforkwait syscall
	PASS: 64-bit vhangup syscall
	PASS: 64-bit wait syscall
	PASS: 64-bit wait4 syscall
	@@ -9102,7 +9102,7 @@
	PASS: 32-bit fanotify syscall
	PASS: 32-bit flock syscall
	PASS: 32-bit forkwait syscall
	-PASS: 32-bit futex syscall
	+FAIL: 32-bit futex syscall
	PASS: 32-bit futimes syscall
	PASS: 32-bit fxattr syscall
	PASS: 32-bit getcpu syscall
	@@ -9217,7 +9217,7 @@
	FAIL: 32-bit swap syscall
	PASS: 32-bit sync syscall
	FAIL: 32-bit sync_file_range syscall
	-PASS: 32-bit syncfs syscall
	+FAIL: 32-bit syncfs syscall
	PASS: 32-bit sysctl syscall
	FAIL: 32-bit sysfs syscall
	PASS: 32-bit sysinfo syscall
	@@ -9380,7 +9380,7 @@
	PASS: 64-bit shmat tp_syscall
	PASS: 64-bit shmget tp_syscall
	PASS: 64-bit shutdown tp_syscall
	-PASS: 64-bit sigaltstack tp_syscall
	+FAIL: 64-bit sigaltstack tp_syscall
	UNSUPPORTED: 64-bit sigmask tp_syscall not supported on this arch
	PASS: 64-bit signal tp_syscall
	PASS: 64-bit signalfd tp_syscall
	@@ -9391,7 +9391,7 @@
	FAIL: 64-bit swap tp_syscall
	PASS: 64-bit sync tp_syscall
	PASS: 64-bit sync_file_range tp_syscall
	-FAIL: 64-bit syncfs tp_syscall
	+PASS: 64-bit syncfs tp_syscall
	PASS: 64-bit sysctl tp_syscall
	FAIL: 64-bit sysfs tp_syscall
	PASS: 64-bit sysinfo tp_syscall
	@@ -9554,7 +9554,7 @@
	PASS: 32-bit shutdown tp_syscall
	FAIL: 32-bit sigaltstack tp_syscall
	PASS: 32-bit sigmask tp_syscall
	-PASS: 32-bit signal tp_syscall
	+FAIL: 32-bit signal tp_syscall
	PASS: 32-bit signalfd tp_syscall
	PASS: 32-bit socket tp_syscall
	PASS: 32-bit socketpair tp_syscall
	@@ -9592,7 +9592,7 @@
	PASS: pr16806 library compile
	PASS: pr16806 exe compile
	PASS: pr16806 ko compile
	-PASS: pr16806
	+FAIL: pr16806 staprun
	Running /home/sultan/systemtap/testsuite/systemtap.unprivileged/unprivileged_embedded_C.exp ...
	PASS: unprivileged embedded C: Obtain list tapset functions containing embedded C
	UNTESTED: unprivileged embedded C: no embedded C: container_of_task_rcu(long)
	@@ -11057,7 +11057,7 @@
	PASS: unprivileged myproc: --unprivileged process(number).statement(string)
	PASS: unprivileged myproc: --privilege=stapusr process(number).statement(string).nearest
	PASS: unprivileged myproc: --unprivileged process(string).begin
	-PASS: unprivileged myproc: --privilege=stapusr process(string).end
	+FAIL: unprivileged myproc: --privilege=stapusr process(string).end
	PASS: unprivileged myproc: --unprivileged process(string).function(number)
	PASS: unprivileged myproc: --privilege=stapusr process(string).function(number).call
	KFAIL: unprivileged myproc: --unprivileged process(string).function(number).inline (PRMS: GCC)
	@@ -12247,11 +12247,11 @@

	=== systemtap Summary ===

	-# of expected passes 9726
	-# of unexpected failures 605
	+# of expected passes 9719
	+# of unexpected failures 615
	# of unexpected successes 8
	# of expected failures 344
	# of unknown successes 5
	-# of known failures 101
	+# of known failures 98
	# of untested testcases 845
	# of unsupported tests 19